候选数据网站的选择
选取原则:股票信息静态存在于HTML页面中,非js代码生成,没有Robots协议限制
选取方法:浏览器F12,源代码查看等
程序的结构设计:
步骤1:从东方财富网获取股票列表
步骤2:根据股票列表 逐个到百度股票获取个股信息
步骤3:将结果存储到文件
参考视频:=47
代码:
import requests
import time
from bs4 import BeautifulSoup
import traceback
import redef getHTMLText(url,code='utf-8'):try:r = (url,timeout=30)r.raise_for_status()r.encoding= code # 页面自动解析编码写法 r.encoding= r.apparent_cept:return ""def getStockList(lst,stockURL):html = getHTMLText(stockURL)soup = BeautifulSoup(html,"html.parser")a = soup.find_all('a')for i in a :# print(type(i)) #<class 'bs4.element.Tag'># s = "连板"# print(s)# if s in str(i): #判断字符串中是否包含子串try:href= i.attrs['href']lst.append(re.findall(r"d{6}",href)[0]) #以s开头 然后是h或z字母(因为股票代码不是上海sh就是深圳sz开头) 注意:re.findall返回列表,如[sh100012] 然后再取值出sh100012 然后再append,否则就append进去[[sh100012]]了except:continuedef getStockInfo(lst,stockURL,fpath):count=0 # 实现 进度条for stock in lst:url = stockURL + stock +".html" #拼接urlhtml = getHTMLText(url) # 获取股票页面内容try:if html == "": #空页面的处理continueinfoDict = {}soup = BeautifulSoup(html,'html.parser') # 解析网页text # 正则匹配出股票名称stockName = re.search(r'[wu4e00-u9fcc]+',text).group(0)# 正则匹配出股票代码stockNumber = re.search(r'[0-9]d{5}',text).group(0)# 正则匹配出股票个股日历p = repile(r"[个][股][日][u4e00-u9fa5]+[sS]+")stockHistory=p.findall(text)print(stockHistory)stockHistory1= re.split(r" +",stockHistory[0]) #按照多个空格分割print(stockHistory1[0])#将“个股日历 替换为空格”stockHistory2 = stockHistory1[0].replace("个股日历","")print(stockHistory2)stockHistory3 = place("n",";",100).replace("r",";")print(str(stockHistory3))infoDict.update({'股票名称':stockName}) #将 这个信息增加到字典中infoDict.update({'股票代码':stockNumber}) #将 这个信息增加到字典中infoDict.update({'股票日历':stockHistory2}) #将 这个信息增加到字典中# 股票信息部分如下# keyList = stockInfo.find_all('dt') #键# valueList = stockInfo.find_all('dd') #值# 还原为键值对并存到字典中# for i in range(len(keyList)):# key = keyList[i].text# val = valueList[i].text# infoDict[key]= val #字典可以直接使用key=value向字典中新增内容#将相关股票信息保存在文件中with open(fpath,'a',encoding='utf-8') as f:f.write(str(infoDict) + 'n')count = count +1 # 实现 进度条print('r当前速度:{:.2f}%'.format(count * 100/len(lst)),end='') # r能够将我们打印的字符串的最后的光标提到当前这一行的头部,那么下一次再进行 相关打印的时候,打印信息就会覆盖之前的内容。实现一个不换行的动态展示的进度条# 每10秒抓一次数据time.sleep(10)except:count = count +1 # 实现 
进度条print('r当前速度:{:.2f}%'.format(count * 100/len(lst)),end='')traceback.print_exc()continueif __name__ == '__main__':# stock_list_url = ".html"stock_list_url = ".html"# stock_info_url = "" #sh上证 sz深圳stock_info_url = ""output_file = "D://pythontest/files/"# slist = ['002656','002702','000001','000002']slist=[]getStockList(slist,stock_list_url)getStockInfo(slist,stock_info_url,output_file)
抓取多个信息并入库mysql实例:
import requests
from bs4 import BeautifulSoup
import traceback
import re
from 基础用法.toMysql import ToMySql
def getHTMLText(url,code='utf-8'):try:r = (url,timeout=30)r.raise_for_status()r.encoding= code # 页面自动解析编码写法 r.encoding= r.apparent_cept:return ""def getStockList(lst,stockURL):html = getHTMLText(stockURL)soup = BeautifulSoup(html,"html.parser")a = soup.find_all('a')for i in a :try:href= i.attrs['href']lst.append(re.findall(r"d{6}",href)[0]) #以s开头 然后是h或z字母(因为股票代码不是上海sh就是深圳sz开头) 注意:re.findall返回列表,如[sh100012] 然后再取值出sh100012 然后再append,否则就append进去[[sh100012]]了except:continuedef getStockInfo(lst,stockURL,fpath):count=0 # 实现 进度条for stock in lst:url = stockURL + stock +".html" #拼接urlhtml = getHTMLText(url) # 获取股票页面内容print('==================')print(url)try:if html == "": #空页面的处理continueinfoDict = {}soup = BeautifulSoup(html,'html.parser') # 解析网页text # 正则匹配出股票名称stockName = re.search(r'[wu4e00-u9fcc]+',text).group(0)# 正则匹配出股票代码stockNumber = re.search(r'[0-9]d{5}',text).group(0)# 正则匹配出股票个股日历jk = repile(r"[今][开][::][0-9]*[.][0-9]*")zs = repile(r"[昨][收][::][0-9]*[.][0-9]*")zg = repile(r"[最][高][::][0-9]*[.][0-9]*")zd = repile(r"[最][低][::][0-9]*[.][0-9]*")hsl = repile(r"[换][手][率][::][0-9]*[.][0-9]*")syl = repile(r"[市][盈][率][::][0-9]*[.][0-9]*")jkV=jk.findall(text)zsV=zs.findall(text)zgV=zg.findall(text)zdV=zd.findall(text)hslV=hsl.findall(text)sylV=syl.findall(text)# print(sylV[0])hslV = str(hslV[0])+"%"print(jkV[0])print(jkV[0].split(":")[1])print(zsV[0])print(zgV[0])print(zdV[0])print(str(hslV))print(sylV[0])# 将数据写入mysqlsql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """data = (stockName,stockNumber,jkV[0].split(":")[1],zsV[0].split(":")[1],zgV[0].split(":")[1],zdV[0].split(":")[1],hslV.split(":")[1],str(sylV[0].split(":")[1])) #直接写数字类型也能写入result = ToMySql.writeDb(sql, data)# 将数据写入文件中infoDict.update({'名称':stockName}) #将 
这个信息增加到字典中infoDict.update({'代码':stockNumber})infoDict.update({'今开':jkV[0].split(":")[1]})infoDict.update({'昨收':zsV[0].split(":")[1]})infoDict.update({'最高':zgV[0].split(":")[1]})infoDict.update({'最低':zdV[0].split(":")[1]})infoDict.update({'换手率':hslV.split(":")[1]})infoDict.update({'市盈率':str(sylV[0].split(":")[1])})#将相关股票信息保存在文件中with open(fpath,'a',encoding='utf-8') as f:f.write(str(infoDict) + 'n')count = count +1 # 实现 进度条print('r当前速度:{:.2f}%'.format(count * 100/len(lst)),end='') # r能够将我们打印的字符串的最后的光标提到当前这一行的头部,那么下一次再进行 相关打印的时候,打印信息就会覆盖之前的内容。实现一个不换行的动态展示的进度条# 每10秒抓一次数据# time.sleep(10)except:count = count +1 # 实现 进度条print('r当前速度:{:.2f}%'.format(count * 100/len(lst)),end='')traceback.print_exc()continueif __name__ == '__main__':# stock_list_url = ".html"stock_list_url = ".html"# stock_info_url = "" #sh上证 sz深圳stock_info_url = ""output_file = "D://pythontest/files/gupiao/我的股票信息.csv"# slist = ['002656','002702','000001','000002']slist=[]getStockList(slist,stock_list_url)getStockInfo(slist,stock_info_url,output_file)
入库mysql封装的方法:
import pymysql
import logging
import pandas as pd

# Connection settings for the MySQL server used by writeDb.
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file before using this outside a local setup.
db_name = 'python'
db_user = 'root'
db_pass = 'root'
db_ip = '127.0.0.1'
db_port = 3306

# Write data into the database.
def writeDb(sql, db_data=()):
    """Connect to the MySQL database and execute one write statement.

    Args:
        sql: Statement to execute, using %s placeholders.
        db_data: Values bound to the placeholders.

    Returns:
        True when the statement was executed and committed; False when the
        connection could not be opened or the write failed (the
        transaction is rolled back in that case).
    """
    try:
        conn = pymysql.connect(db=db_name, user=db_user, passwd=db_pass,
                               host=db_ip, port=int(db_port), charset="utf8")
    except Exception as e:
        print(e)
        logging.error('数据库连接失败:%s' % e)
        return False

    try:
        # The cursor is created inside this block so that a failure here
        # still reaches the finally clause and closes the connection.
        with conn.cursor() as cursor:
            cursor.execute(sql, db_data)
        conn.commit()
    except Exception as e:
        conn.rollback()
        logging.error('数据写入失败:%s' % e)
        return False
    finally:
        conn.close()
    return True
# Example usage:
# sql = """ INSERT INTO user(email,last_name) VALUES(%s,%s) """
# data = ("632443020@qq", "男")
# result = writeDb(sql, data)

if __name__ == '__main__':
    # Manual smoke test. It is guarded so that importing this module from
    # the crawler does not insert a dummy row into stock_infos.
    sql = """ INSERT INTO stock_infos(stock_name,stock_code,jk,zs,zg,zd,hsl,syl) VALUES(%s,%s,%s,%s,%s,%s,%s,%s) """
    data = ("1", "2", "3", "4", "5", "6", "7", "8")  # plain numbers can be passed as well
    result = writeDb(sql, data)
Stocks信息表:
-- One row per scraped stock; the quote figures are stored as text.
CREATE TABLE stock_infos (
    id INT NOT NULL AUTO_INCREMENT COMMENT '主键',
    stock_name VARCHAR(30),
    stock_code VARCHAR(30),
    jk VARCHAR(10) COMMENT '今开',
    zs VARCHAR(10) COMMENT '昨收 ',
    zg VARCHAR(10) COMMENT '最高 ',
    zd VARCHAR(10) COMMENT '最低',
    hsl VARCHAR(10) COMMENT '换手率',
    syl VARCHAR(10) COMMENT '市盈率',
    PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
爬取股票信息实现百分比进度条:
本文发布于:2024-01-28 10:27:15,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/17064088396763.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |