今日头条爬取程序+邮件发到邮箱

阅读: 评论:0

今日头条爬取程序+邮件发到邮箱

今日头条爬取程序+邮件发到邮箱

需要安装requests,smtplib和openpyxl库,加入了发送邮件功能,部分代码来自CSDN:

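其中smtplib需使用pip install py-email 命令来安装(注:smtplib 与 email 均属于 Python 3 标准库,通常无需另行安装;requests 和 openpyxl 可通过 pip install requests openpyxl 安装)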

完整代码如下

import datetime
import hashlib
import json
import os
import random
import smtplib
import time
from email.header import Header
from email.mime.text import MIMEText

import requests
from openpyxl import Workbook
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

# NOTE(review): the scheme/host/path of these two URLs were stripped from the
# published listing; they are restored here to the Toutiao PC feed endpoint
# and site root -- confirm against the current site before relying on them.
start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='
url = 'https://www.toutiao.com'

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# A random tt_webid cookie is generated on every run to dodge anti-crawling.
cookies = {'tt_webid': str(random.randint(66499490848923624618, 98746537462725254568))}

max_behot_time = '0'   # pagination cursor passed as a URL parameter
title = []             # news headlines
source_url = []        # news links as returned by the feed (may be relative)
s_url = []             # absolute news links
source = []            # name of the account that published each item
media_url = {}         # publisher name -> absolute link of the publisher page

# Third-party SMTP service parameters.
# NOTE(review): the host literal was garbled in the published listing
# (`&#ail.qq"`); "smtp.exmail.qq.com" is a best guess -- confirm.
mail_host = "smtp.exmail.qq.com"   # SMTP server
mail_user = "201XXXXX@XXX"         # login name
# NOTE(review): hard-coded credential; replace before use and keep it out of
# version control.
mail_pass = "r6ctG345kw8Mdai"      # password / authorization code
sender = '20XXXXX@XXX'             # sending address
receivers = ['XXXXX@qq']           # receiving addresses


def get_as_cp(now=None):
    """Build the ``as`` and ``cp`` request parameters from a Unix timestamp.

    The upper-case hex form of the timestamp is interleaved with the first and
    last five characters of the upper-case MD5 hex digest of the decimal
    timestamp. The original author notes the scheme was taken from Toutiao's
    obfuscated script ``home_4abea46.js``.

    Args:
        now: Unix timestamp in seconds. Defaults to the current time; passing
            a value makes the result reproducible.

    Returns:
        A dict with the keys ``'as'`` and ``'cp'``. When the hex timestamp is
        not exactly 8 characters long, a fixed fallback pair is returned.
    """
    now = round(time.time()) if now is None else int(now)
    print(now)
    e = hex(now).upper()[2:]   # hex() prefixes '0x'; strip it
    print('e:', e)
    md5 = hashlib.md5()
    print('a:', md5)
    md5.update(str(now).encode('utf-8'))
    digest = md5.hexdigest().upper()
    print('i:', digest)
    if len(e) != 8:
        return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}

    head = digest[:5]
    tail = digest[-5:]
    s = ''.join(head[k] + e[k] for k in range(5))
    r = ''.join(e[k + 3] + tail[k] for k in range(5))
    zz = {'as': 'A1' + s + e[-3:], 'cp': e[0:3] + r + 'E1'}
    print('zz:', zz)
    return zz


def getdata(url, headers, cookies):
    """Fetch ``url`` and return the response body parsed as JSON.

    Args:
        url: Absolute URL to request.
        headers: HTTP headers passed to ``requests.get``.
        cookies: Cookies passed to ``requests.get``.

    Returns:
        The decoded JSON document.
    """
    # A timeout keeps a stalled connection from hanging the whole run.
    r = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    print(url)
    return json.loads(r.text)


def savedata(title, s_url, source, media_url):
    """Write the collected news items to ``toutiao.xlsx``.

    Args:
        title: Headlines, one per row.
        s_url: Absolute news links, parallel to ``title``.
        source: Publisher names, parallel to ``title``.
        media_url: Mapping from publisher name to publisher link.
    """
    wb = Workbook()
    filename = 'toutiao.xlsx'
    ws = wb.active
    ws.title = 'data'
    # Header row.
    ws['A1'] = '标题'
    ws['B1'] = '新闻链接'
    ws['C1'] = '头条号'
    ws['D1'] = '头条号链接'
    # Data rows start at row 2, right below the header.
    rows = zip(title, s_url, source)
    for row, (headline, link, publisher) in enumerate(rows, start=2):
        ws.cell(column=1, row=row, value=headline)
        ws.cell(column=2, row=row, value=link)
        ws.cell(column=3, row=row, value=publisher)
        ws.cell(column=4, row=row, value=media_url[publisher])
    wb.save(filename=filename)


def main(max_behot_time, title, source_url, s_url, source, media_url):
    """Poll the feed ten times and fill the given containers in place.

    Args:
        max_behot_time: Initial pagination cursor (string).
        title: List that receives new headlines.
        source_url: List that receives the raw link of each new headline.
        s_url: List that receives the absolute link of each headline.
        source: List that receives the publisher of each new headline.
        media_url: Dict that receives publisher name -> publisher link.
    """
    for _ in range(10):   # more refreshes yield more items
        ascp = get_as_cp()
        demo = getdata(
            start_url + max_behot_time
            + '&max_behot_time_tmp=' + max_behot_time
            + '&tadrequire=true&as=' + ascp['as']
            + '&cp=' + ascp['cp'],
            headers, cookies)
        print(demo)
        for item in demo['data']:
            if item['title'] not in title:   # skip headlines already seen
                title.append(item['title'])
                source_url.append(item['source_url'])
                source.append(item['source'])
            if item['source'] not in media_url:
                media_url[item['source']] = url + item['media_url']
        print(max_behot_time)
        # The response carries the cursor for the next page.
        max_behot_time = str(demo['next']['max_behot_time'])

    # NOTE(review): the published listing lost its indentation, so the
    # original position of this loop is unknown. It runs once, after all
    # refreshes, so that s_url stays aligned one-to-one with title.
    for headline, link, publisher in zip(title, source_url, source):
        print('标题:', headline)
        full_link = link if 'https' in link else url + link
        s_url.append(full_link)
        print('新闻链接:', full_link)
        print('头条号:', publisher)
    print(len(title))   # number of collected news items


def sendmail(mail_host, mail_user, mail_pass, sender, receivers):
    """Mail ``toutiao.xlsx`` from the current directory as an attachment.

    Prints a success or failure message instead of raising when the SMTP
    exchange or the connection fails.

    Args:
        mail_host: SMTP server host name.
        mail_user: SMTP login name.
        mail_pass: SMTP password.
        sender: Envelope sender address.
        receivers: List of recipient addresses.
    """
    message = MIMEMultipart()
    message['From'] = Header("每日今日头条文章爬取", 'utf-8')
    message['To'] = Header("不知道是谁", 'utf-8')
    subject = '今天的头条新闻都在这里了'
    message['Subject'] = Header(subject, 'utf-8')

    # Plain-text body.
    message.attach(MIMEText('今天的新闻,请查收', 'plain', 'utf-8'))

    # Attachment: MIMEApplication base64-encodes the bytes and sets a single
    # application/octet-stream Content-Type. The filename below is the name
    # shown in the mail client. Further attachments can be added the same way.
    with open('toutiao.xlsx', 'rb') as workbook_file:
        attachment = MIMEApplication(workbook_file.read())
    attachment.add_header('Content-Disposition', 'attachment', filename='toutiao.xlsx')
    message.attach(attachment)

    try:
        # Port 25 is plain SMTP; port 465 would need smtplib.SMTP_SSL instead.
        # The context manager sends QUIT even when login or sending fails.
        with smtplib.SMTP(mail_host, 25) as smtp:
            smtp.login(mail_user, mail_pass)
            smtp.sendmail(sender, receivers, message.as_string())
    except (smtplib.SMTPException, OSError):
        print("Error: 无法发送邮件")
    else:
        print("邮件发送成功")


if __name__ == '__main__':
    main(max_behot_time, title, source_url, s_url, source, media_url)
    savedata(title, s_url, source, media_url)
    sendmail(mail_host, mail_user, mail_pass, sender, receivers)
代码均采用结构化程序设计,简单易读,修改方便,只需要修改参数部分即可。

本文发布于:2024-01-30 18:39:49,感谢您对本站的认可!

本文链接:https://www.4u4v.net/it/170661119322036.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:头条   邮箱   邮件   今日   程序
留言与评论(共有 0 条评论)
   
验证码:

Copyright ©2019-2022 Comsenz Inc.Powered by ©

网站地图1 网站地图2 网站地图3 网站地图4 网站地图5 网站地图6 网站地图7 网站地图8 网站地图9 网站地图10 网站地图11 网站地图12 网站地图13 网站地图14 网站地图15 网站地图16 网站地图17 网站地图18 网站地图19 网站地图20 网站地图21 网站地图22 网站地图23