python：爬虫：爬取股票信息应用实例并入库mysql，进而实现可视化

阅读：评论：0

候选数据网站的选择
选取原则：股票信息静态存在于HTML页面中，非js代码生成，没有Robots协议限制
选取方法：浏览器F12，查看源代码等

程序的结构设计：
步骤1：从东方财富网获取股票列表
步骤2：根据股票列表 逐个到百度股票获取个股信息
步骤3：将结果存储到文件

参考视频：=47

代码：

import requests
import time
from bs4 import BeautifulSoup
import traceback
import re


def getHTMLText(url, code='utf-8'):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the response body. The original
            author notes ``r.encoding = r.apparent_encoding`` as the
            alternative that lets requests detect the encoding itself.

    Returns:
        The decoded page text, or ``""`` if the request fails or the server
        answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Best effort: callers treat an empty string as "page unavailable".
        return ""


def getStockList(lst, stockURL):
    """Collect stock codes from the links of the list page.

    Every ``<a>`` tag of the page at *stockURL* is inspected; the first run
    of six digits found in its ``href`` is appended to *lst* (in place).
    Links without an ``href`` or without six consecutive digits are skipped.

    Args:
        lst: List that receives the stock codes.
        stockURL: Address of the page listing the stocks.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not href:
            continue
        # re.search yields the code itself; appending the findall() result
        # would append a list instead of a string.
        match = re.search(r"\d{6}", href)
        if match:
            lst.append(match.group(0))


def _parse_stock_page(html):
    """Extract name, code and the stock-calendar text from a detail page."""
    # NOTE(review): the statement that produced ``text`` was lost in the
    # published article; the full page text is assumed here - confirm.
    text = BeautifulSoup(html, 'html.parser').get_text()

    stock_name = re.search(r'[\w\u4e00-\u9fcc]+', text).group(0)
    stock_number = re.search(r'[0-9]\d{5}', text).group(0)

    # Everything from the calendar heading onwards, cut at the first run of
    # spaces, with the heading itself removed.
    calendar_tail = re.findall(r"[个][股][日][\u4e00-\u9fa5]+[\s\S]+", text)
    calendar = re.split(r" +", calendar_tail[0])[0].replace("个股日历", "")
    # One-line form, only echoed for inspection (the stored value keeps its
    # line breaks, as in the original).
    print(calendar.replace("\n", ";", 100).replace("\r", ";"))

    return {
        '股票名称': stock_name,
        '股票代码': stock_number,
        '股票日历': calendar,
    }


def getStockInfo(lst, stockURL, fpath, delay=10):
    """Fetch every stock's detail page and append its data to a file.

    For each code in *lst* the page ``stockURL + code + ".html"`` is
    downloaded and parsed, and ``str()`` of the resulting dict is appended
    to *fpath* as one line. A percentage progress indicator is rewritten on
    the same console line after every stock.

    Args:
        lst: Stock codes to process.
        stockURL: Prefix of the detail-page address.
        fpath: Output file, opened in append mode with UTF-8 encoding.
        delay: Seconds to wait after each successfully stored stock.
    """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        html = getHTMLText(stockURL + stock + ".html")
        stored = False
        if html != "":
            try:
                infoDict = _parse_stock_page(html)
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
                stored = True
            except Exception:
                # A page that cannot be parsed or written must not stop the
                # run; Ctrl-C still does, unlike with a bare ``except``.
                traceback.print_exc()
        # The carriage return moves the cursor back to the start of the
        # line, so each update overwrites the previous one.
        print('\r当前速度：{:.2f}%'.format(count * 100 / total), end='')
        if stored:
            # Throttle requests to the remote site.
            time.sleep(delay)


if __name__ == '__main__':
    # NOTE(review): both URL literals look truncated in the published
    # article (the text names 东方财富网 for the list and 百度股票 for the
    # details); fill in the real addresses before running.
    stock_list_url = ".html"
    stock_info_url = ""
    # NOTE(review): this path names a directory, not a file - the file name
    # appears to be missing as well.
    output_file = "D://pythontest/files/"
    # Example of a hand-picked list instead of scraping one:
    # slist = ['002656', '002702', '000001', '000002']
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

抓取多个信息并入库mysql实例：

import requests
from bs4 import BeautifulSoup
import traceback
import re
from 基础用法.toMysql import ToMySql
def getHTMLText(url, code='utf-8'):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the response body. The original
            author notes ``r.encoding = r.apparent_encoding`` as the
            alternative that lets requests detect the encoding itself.

    Returns:
        The decoded page text, or ``""`` if the request fails or the server
        answers with an error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Best effort: callers treat an empty string as "page unavailable".
        return ""


def getStockList(lst, stockURL):
    """Collect stock codes from the links of the list page.

    Every ``<a>`` tag of the page at *stockURL* is inspected; the first run
    of six digits found in its ``href`` is appended to *lst* (in place).
    Links without an ``href`` or without six consecutive digits are skipped.

    Args:
        lst: List that receives the stock codes.
        stockURL: Address of the page listing the stocks.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if not href:
            continue
        match = re.search(r"\d{6}", href)
        if match:
            lst.append(match.group(0))


# Quote figures read from the detail page, in the column order of the
# INSERT statement. Compiled once instead of on every loop iteration.
_QUOTE_PATTERNS = (
    ('今开', re.compile(r"[今][开][:：][0-9]*[.][0-9]*")),
    ('昨收', re.compile(r"[昨][收][:：][0-9]*[.][0-9]*")),
    ('最高', re.compile(r"[最][高][:：][0-9]*[.][0-9]*")),
    ('最低', re.compile(r"[最][低][:：][0-9]*[.][0-9]*")),
    ('换手率', re.compile(r"[换][手][率][:：][0-9]*[.][0-9]*")),
    ('市盈率', re.compile(r"[市][盈][率][:：][0-9]*[.][0-9]*")),
)

# Order of the values handed to the INSERT statement.
_COLUMN_ORDER = ('名称', '代码') + tuple(label for label, _ in _QUOTE_PATTERNS)


def _parse_quote_page(html):
    """Extract name, code and the quote figures from a detail page.

    Raises:
        ValueError: If one of the quote figures is not present in the page.
        AttributeError: If no name or code can be found.
    """
    # NOTE(review): the statement that produced ``text`` was lost in the
    # published article; the full page text is assumed here - confirm.
    text = BeautifulSoup(html, 'html.parser').get_text()

    info = {
        '名称': re.search(r'[\w\u4e00-\u9fcc]+', text).group(0),
        '代码': re.search(r'[0-9]\d{5}', text).group(0),
    }
    for label, pattern in _QUOTE_PATTERNS:
        match = pattern.search(text)
        if match is None:
            raise ValueError('quote field %s not found in page' % label)
        print(match.group(0))
        # The pattern accepts both the ASCII and the full-width colon, so
        # the split has to accept both as well.
        value = re.split(r"[:：]", match.group(0), maxsplit=1)[1]
        if label == '换手率':
            value += "%"
        info[label] = value
    return info


def getStockInfo(lst, stockURL, fpath):
    """Fetch every stock's quote, store it in MySQL and append it to a file.

    For each code in *lst* the page ``stockURL + code + ".html"`` is
    downloaded and parsed. The values are inserted into ``stock_infos``
    through ``ToMySql.writeDb`` and ``str()`` of the dict is appended to
    *fpath* as one line. A percentage progress indicator is rewritten on the
    same console line after every stock.

    Args:
        lst: Stock codes to process.
        stockURL: Prefix of the detail-page address.
        fpath: Output file, opened in append mode with UTF-8 encoding.
    """
    sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        print('==================')
        print(url)
        if html != "":
            try:
                infoDict = _parse_quote_page(html)
                data = tuple(infoDict[key] for key in _COLUMN_ORDER)
                ToMySql.writeDb(sql, data)
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
            except Exception:
                # A page that cannot be parsed or stored must not stop the
                # run; Ctrl-C still does, unlike with a bare ``except``.
                traceback.print_exc()
        # The carriage return moves the cursor back to the start of the
        # line, so each update overwrites the previous one.
        print('\r当前速度：{:.2f}%'.format(count * 100 / total), end='')


if __name__ == '__main__':
    # NOTE(review): both URL literals look truncated in the published
    # article (the text names 东方财富网 for the list and 百度股票 for the
    # details); fill in the real addresses before running.
    stock_list_url = ".html"
    stock_info_url = ""
    output_file = "D://pythontest/files/gupiao/我的股票信息.csv"
    # Example of a hand-picked list instead of scraping one:
    # slist = ['002656', '002702', '000001', '000002']
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

入库mysql封装的方法：

import pymysql
import logging
import pandas as pd

# Connection settings used by writeDb().
db_name = 'python'
db_user = 'root'
db_pass = 'root'
db_ip = '127.0.0.1'
db_port = 3306


def writeDb(sql, db_data=()):
    """Connect to MySQL, execute one write statement and commit it.

    Args:
        sql: Statement to execute, with ``%s`` placeholders.
        db_data: Values bound to the placeholders.

    Returns:
        ``True`` when the statement was executed and committed, ``False``
        when connecting or writing failed (the failure is logged and, for a
        failed write, the transaction is rolled back).
    """
    try:
        conn = pymysql.connect(db=db_name, user=db_user, passwd=db_pass,
                               host=db_ip, port=int(db_port), charset="utf8")
    except Exception as e:
        logging.error('数据库连接失败:%s', e)
        return False

    cursor = None
    try:
        # Creating the cursor inside this block guarantees the connection is
        # closed even if the cursor cannot be obtained.
        cursor = conn.cursor()
        cursor.execute(sql, db_data)
        conn.commit()
    except Exception as e:
        conn.rollback()
        logging.error('数据写入失败:%s', e)
        return False
    finally:
        if cursor is not None:
            cursor.close()
        conn.close()
    return True


if __name__ == '__main__':
    # Demo insert; guarded so that importing this module from the scraper
    # does not write a dummy row.
    #
    # Another example:
    # sql = """ INSERT INTO user(email,last_name) VALUES(%s,%s) """
    # data = ("632443020@qq", "男")
    # result = writeDb(sql, data)
    sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
    data = ("1", "2", "3", "4", "5", "6", "7", "8")  # numeric values can be passed directly as well
    result = writeDb(sql, data)

Stocks信息表：

 -- One row per scraped stock quote; all figures are stored as text.
 CREATE TABLE stock_infos (
     id INT NOT NULL AUTO_INCREMENT COMMENT '主键',
     stock_name VARCHAR(30),
     stock_code VARCHAR(30),
     jk VARCHAR(10) COMMENT '今开',
     zs VARCHAR(10) COMMENT '昨收 ',
     zg VARCHAR(10) COMMENT '最高 ',
     zd VARCHAR(10) COMMENT '最低',
     hsl VARCHAR(10) COMMENT '换手率',
     syl VARCHAR(10) COMMENT '市盈率',
     PRIMARY KEY (id)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

爬取股票信息实现百分比进度条：

本文发布于:2024-01-28 10:27:15，感谢您对本站的认可！

本文链接：https://www.4u4v.net/it/17064088396763.html

上一篇：python爬虫实战：爬取股票信息，对上交所和深交所所有的股票信息进行搜集

下一篇：集算器 SPL 抓取网页数据