需要安装requests、smtplib和openpyxl库,加入了发送邮件功能,部分代码来自CSDN:
其中smtplib需使用pip install py-email 命令来安装(注:smtplib和email均属于Python标准库,通常无需额外安装)
完整代码如下
import datetime
import hashlib
import json
import os
import random
import smtplib
import time
from email.header import Header
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import requests
from openpyxl import Workbook
# ---------------------------------------------------------------------------
# Crawler configuration
# ---------------------------------------------------------------------------

# Base site URL; main() prepends it to the relative links found in the feed.
# NOTE(review): the original literal was reduced to '' by the page extraction;
# the host is reconstructed from the `utm_source=toutiao` query parameter --
# confirm before use.
url = 'https://www.toutiao.com'

# Feed endpoint; main() appends the current `max_behot_time` value to it.
# NOTE(review): everything before '/?category=' was stripped from the original
# literal; the '/api/pc/feed' path is presumed -- confirm against the site.
start_url = url + '/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

# A random tt_webid cookie per run, intended to sidestep anti-crawler checks.
cookies = {'tt_webid': str(random.randint(66499490848923624618, 98746537462725254568))}

max_behot_time = '0'  # paging parameter used for the first request

# Result containers, filled in place by main().
title = []  # news headlines
source_url = []  # news links as returned by the feed
s_url = []  # fully qualified news links
source = []  # names of the publishing accounts
media_url = {}  # publishing account name -> fully qualified account link

# ---------------------------------------------------------------------------
# Third-party SMTP service parameters
# ---------------------------------------------------------------------------

# NOTE(review): the original host literal was truncated to `ail.qq"`; the
# public QQ Mail host is used here, but it may have been an enterprise host
# such as smtp.exmail.qq.com -- confirm.
mail_host = "smtp.qq.com"  # SMTP server
mail_user = "201XXXXX@XXX"  # login name
# Credentials should not live in source control: MAIL_PASS overrides the
# built-in value when it is set in the environment.
mail_pass = os.environ.get("MAIL_PASS", "r6ctG345kw8Mdai")  # password / auth code
sender = '20XXXXX@XXX'  # sender address (a QQ mailbox or any other mailbox)
# Recipient addresses (a QQ mailbox or any other mailbox).
# NOTE(review): the original literal read 'XXXXX@qq'; the '.com' suffix appears
# to have been stripped by the page extraction -- confirm.
receivers = ['XXXXX@qq.com']


def get_as_cp(now=None):
    """Build the ``as`` and ``cp`` request parameters for the feed API.

    The values interleave the upper-case hex form of a Unix timestamp with
    characters taken from the MD5 digest of that timestamp's decimal string.
    According to the original author's note, the scheme follows the site's
    obfuscated script ``home_4abea46.js``.

    Args:
        now: Unix timestamp in seconds. Defaults to the current time; pass an
            explicit value to get a reproducible result.

    Returns:
        A dict with the keys ``'as'`` and ``'cp'``. When the hex timestamp is
        not exactly eight digits long, a fixed fallback pair is returned.
    """
    now = round(time.time()) if now is None else int(now)
    print(now)
    e = hex(now).upper()[2:]  # hex timestamp without the '0X' prefix
    print('e:', e)
    md5 = hashlib.md5()
    print('a:', md5)
    md5.update(str(now).encode('utf-8'))
    digest = md5.hexdigest().upper()
    print('i:', digest)
    if len(e) != 8:
        # The interleaving below indexes e[0..7], so any other length cannot
        # be encoded; use the constant pair instead.
        return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}
    head = digest[:5]
    tail = digest[-5:]
    s = ''.join(head[k] + e[k] for k in range(5))
    r = ''.join(e[k + 3] + tail[k] for k in range(5))
    zz = {'as': 'A1' + s + e[-3:], 'cp': e[0:3] + r + 'E1'}
    print('zz:', zz)
    return zz


def getdata(url, headers, cookies):
    """Fetch ``url`` and return the decoded JSON payload.

    Args:
        url: Fully qualified request URL.
        headers: HTTP headers sent with the request.
        cookies: Cookies sent with the request.

    Returns:
        The parsed JSON body.

    Raises:
        requests.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    print(url)
    data = json.loads(r.text)
    return data


def savedata(title, s_url, source, media_url):
    """Write the collected results to ``toutiao.xlsx`` in the working directory.

    Args:
        title: News headlines.
        s_url: Fully qualified news links, parallel to ``title``.
        source: Publishing account names, parallel to ``title``.
        media_url: Mapping of account name to account link.
    """
    wb = Workbook()
    filename = 'toutiao.xlsx'
    ws = wb.active
    ws.title = 'data'
    # Header row.
    ws['A1'] = '标题'
    ws['B1'] = '新闻链接'
    ws['C1'] = '头条号'
    ws['D1'] = '头条号链接'
    # Data rows start at row 2, directly below the header.
    for row, (headline, link, account) in enumerate(zip(title, s_url, source), start=2):
        ws.cell(column=1, row=row, value=headline)
        ws.cell(column=2, row=row, value=link)
        ws.cell(column=3, row=row, value=account)
        ws.cell(column=4, row=row, value=media_url.get(account, ''))
    wb.save(filename=filename)


def main(max_behot_time, title, source_url, s_url, source, media_url):
    """Crawl the hot-news feed and fill the result containers in place.

    Reads the module-level ``start_url``, ``url``, ``headers`` and ``cookies``.

    Args:
        max_behot_time: Paging parameter for the first request.
        title: List receiving the news headlines.
        source_url: List receiving the links as returned by the feed.
        s_url: List receiving the fully qualified links.
        source: List receiving the publishing account names.
        media_url: Dict receiving account name -> account link.
    """
    seen_titles = set(title)
    for _ in range(10):  # number of refreshes; more refreshes yield more data
        ascp = get_as_cp()
        demo = getdata(
            start_url + max_behot_time
            + '&max_behot_time_tmp=' + max_behot_time
            + '&tadrequire=true&as=' + ascp['as'] + '&cp=' + ascp['cp'],
            headers,
            cookies,
        )
        print(demo)
        for item in demo.get('data') or []:
            if item['title'] not in seen_titles:
                seen_titles.add(item['title'])
                title.append(item['title'])
                source_url.append(item['source_url'])
                source.append(item['source'])
            if item['source'] not in media_url:
                media_url[item['source']] = url + item['media_url']
        print(max_behot_time)
        next_page = demo.get('next')
        if not next_page:
            # Without a paging token the next request would repeat this one.
            break
        max_behot_time = str(next_page['max_behot_time'])

    # Resolve links once, after all refreshes, so s_url stays parallel to
    # title. NOTE(review): the original nesting of this loop was lost in
    # extraction; running it per refresh would append duplicate entries.
    for index, headline in enumerate(title):
        print('标题:', headline)
        link = source_url[index]
        if 'https' not in link:
            link = url + link
        s_url.append(link)
        print('新闻链接:', link)
        print('头条号:', source[index])
    print(len(title))  # number of news items collected


def sendmail(mail_host, mail_user, mail_pass, sender, receivers):
    """Mail ``toutiao.xlsx`` from the working directory as an attachment.

    Send failures are reported on stdout rather than raised.

    Args:
        mail_host: SMTP server host name.
        mail_user: SMTP login name.
        mail_pass: SMTP password or authorization code.
        sender: Envelope sender address.
        receivers: List of recipient addresses.

    Raises:
        OSError: If ``toutiao.xlsx`` cannot be read.
    """
    message = MIMEMultipart()
    message['From'] = Header("每日今日头条文章爬取", 'utf-8')
    message['To'] = Header("不知道是谁", 'utf-8')
    subject = '今天的头条新闻都在这里了'
    message['Subject'] = Header(subject, 'utf-8')
    # Mail body.
    message.attach(MIMEText('今天的新闻,请查收', 'plain', 'utf-8'))

    # Attachment. MIMEApplication emits a single application/octet-stream
    # Content-Type header and base64-encodes the payload; the `with` block
    # closes the file handle.
    with open('toutiao.xlsx', 'rb') as fh:
        att1 = MIMEApplication(fh.read(), _subtype='octet-stream')
    # The filename given here is the name shown in the received mail.
    att1.add_header('Content-Disposition', 'attachment', filename='toutiao.xlsx')
    message.attach(att1)

    try:
        # Port 25 as in the original call. NOTE(review): the original comment
        # mentioned port 465, which would need smtplib.SMTP_SSL -- confirm
        # which one the mail provider expects.
        with smtplib.SMTP(mail_host, 25) as smtp_obj:
            smtp_obj.login(mail_user, mail_pass)
            smtp_obj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, OSError) as exc:
        print("Error: 无法发送邮件", exc)


if __name__ == '__main__':
    main(max_behot_time, title, source_url, s_url, source, media_url)
    savedata(title, s_url, source, media_url)
    sendmail(mail_host, mail_user, mail_pass, sender, receivers)
代码均采用结构化程序设计,简单易读,修改方便,只需要修改参数部分即可。
本文发布于:2024-01-30 18:39:49,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170661119322036.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |