爬取币世界标红快讯内容(移动版)
# 引入依赖
from lxml import etree
import requests
import pymongo
import time
# MongoDB connection. Replace the placeholder host with your own database
# address; a reachable MongoDB installation is required.
client = pymongo.MongoClient(host='写你自己的数据库地址', port=27017)
mydb = client.get_database('mydb')
# Collection that receives the crawled news items.
information = mydb.get_collection('information')

# Both stamps are taken from local time once, when the module is loaded.
# currentTime is month+day+hour (6 digits); saveTime is a full timestamp.
currentTime = time.strftime("%m%d%H")
saveTime = time.strftime("%Y-%m-%d %H:%M:%S")
# Request headers carrying an iPhone Safari User-Agent, so the crawler
# identifies itself as a mobile browser.
header = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
        'AppleWebKit/604.1.38 (KHTML, like Gecko) '
        'Version/11.0 Mobile/15A372 Safari/604.1'
    ),
}
def get_url(url, marker_path=r'C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest'):
    """Fetch one flash-news page and store its highlighted items in MongoDB.

    The page is requested with the module-level ``header``, parsed with lxml,
    and every ``section[@class="focus"]`` under ``#kuaixun_list`` is treated
    as one news item. The id of the first such item is written, together
    with ``currentTime``, to the marker file. Items whose numeric id is
    greater than that first id minus 20 are inserted into the
    ``information`` collection.

    Args:
        url: Address of the page to crawl.
        marker_path: File that receives "<currentTime> <first item id>".
            Defaults to the path the script has always used; point it at
            your own file.

    Returns:
        None.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If an item id is not an integer string.
    """
    # NOTE(review): the fetch and parse calls were garbled in the source
    # text; they are restored here as requests.get + etree.HTML, the only
    # use of the file's `requests` and `etree` imports. Confirm.
    # The timeout keeps a stalled connection from hanging the crawler forever.
    html = requests.get(url, headers=header, timeout=10)
    selector = etree.HTML(html.text)

    # etree.HTML yields None for an empty document; treat that as "no items".
    infos = []
    if selector is not None:
        infos = selector.xpath('//div[@id="kuaixun_list"]/div/article/section[@class="focus"]')
    if not infos:
        print('未找到标红快讯!')
        return

    # The id lives on the parent element of the first highlighted section.
    # NOTE(review): presumably the first item is the newest one - confirm.
    saveId = infos[0].xpath('../@id')[0]
    with open(marker_path, 'w', encoding='utf-8') as file:
        file.write(currentTime + ' ' + saveId)

    newest_id = int(saveId)
    for info in infos:
        try:
            title = info.xpath('h3[@class="text_title"]/text()')[0].strip()
            content = info.xpath('p[@class="text_show"]/text()')[0].strip()
            infoId = info.xpath('../@id')[0]
        except IndexError:
            # Best effort: skip items that lack a title, a body or an id.
            continue

        data = {
            'title': title,
            'id': infoId,
            # Crawl time, not the timestamp shown on the page.
            'date': saveTime,
            'content': content,
            'source': 'bishijie',
        }
        print(data)
        # Only items within the 20 most recent ids are stored.
        if int(infoId) > newest_id - 20:
            print('插入了一条新数据!')
            information.insert_one(data)
        else:
            print('无新数据产生!')
if __name__ == '__main__':
    # Script entry point: compare the hour stamp saved in the marker file
    # with the current one, report whether they match, then crawl.
    marker_path = 'C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest'  # set to your own file path
    try:
        with open(marker_path, 'r', encoding='utf-8') as fs:
            line = fs.read()
    except FileNotFoundError:
        # First run: get_url() has not created the marker yet, so treat it
        # as empty instead of crashing.
        line = ''

    # The first six characters of the marker are the saved %m%d%H stamp.
    fileDate = line[0:6]
    if fileDate != currentTime:
        print('时间不一致,宕机使用当前系统时间进行爬取!')
    else:
        print('时间一致, 正常运行!')

    # Both cases crawl the same address, so the loop is written once.
    # NOTE(review): the URL prefix before '=' appears to have been lost from
    # the source text; restore the full address before running.
    urls = ['=' + currentTime]
    for url in urls:
        get_url(url)
        time.sleep(2)  # pause between requests
主要要求掌握的内容:XPath 语法、Python 文件操作、Python 基础语法。
本文内容比较基础,写得不好,请多多指教!大家一起进步!
我的其他关于python的文章
本文发布于:2024-02-01 20:48:49,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170679172639338.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论)