Crawler types: general-purpose, focused, incremental.
2. What is UA detection, and how do you get around it?
The server reads the User-Agent from the request headers and checks its value; the UA identifies the carrier of the request. To get around the check, send forged headers with a browser User-Agent (UA spoofing).
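A minimal sketch of UA spoofing (the target URL below is only a placeholder):

import requests

# headers forged to look like a normal Chrome browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
response = requests.get(url='https://www.example.com/', headers=headers)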
3. Briefly describe the HTTPS encryption process.
4. What is dynamically loaded data? How do you scrape it?
Some sites generate part of their data dynamically with Ajax, so what you see in the browser is not what the page source contains. Use a packet-capture tool to analyze the request, take its URL and parameters, send the same request yourself, and parse the returned data.
Common request parameters: url, data, headers, proxies.
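A minimal sketch of how those parameters are passed (the URLs and form data are placeholders; get takes params=, post takes data=):

import requests

headers = {'User-Agent': 'Mozilla/5.0 ...'}   # UA spoofing
# GET: query-string parameters go in params=
requests.get('https://www.example.com/web', params={'query': 'python'}, headers=headers)
# POST: form data goes in data=
requests.post('https://www.example.com/api', data={'keyword': 'python'}, headers=headers)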
Symmetric encryption: when the client and server talk, the client defines the encryption rule and sends both the decryption rule and the ciphertext to the server, so the server can decrypt. Drawback: if a third party intercepts the traffic, it also gets the rule and can decrypt everything.
Asymmetric encryption: the server creates an encryption/decryption key pair (public/private key) and gives the public key to the client; the client encrypts with the public key and sends the ciphertext to the server, which decrypts with the private key. Drawback: low efficiency.
HTTPS certificate mechanism: the client and server rely on a trusted third-party certificate authority. Before sending its public key to the client, the server has the CA sign it; the resulting certificate is sent to the client together with the public key. The client verifies that the public key really was signed by the CA, and only then uses it to encrypt.
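The idea behind the hybrid scheme, using slow asymmetric encryption only to exchange a symmetric session key and then encrypting the traffic symmetrically, can be sketched with the third-party cryptography package (a conceptual illustration under those assumptions, not real TLS; requires a recent cryptography version):

from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import rsa, padding
from cryptography.fernet import Fernet

# "server" side: generate an asymmetric key pair and publish the public key
server_private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
server_public_key = server_private_key.public_key()

# "client" side: create a symmetric session key and encrypt it with the server's public key
session_key = Fernet.generate_key()
oaep = padding.OAEP(mgf=padding.MGF1(algorithm=hashes.SHA256()),
                    algorithm=hashes.SHA256(), label=None)
encrypted_session_key = server_public_key.encrypt(session_key, oaep)

# "server" side: recover the session key with the private key, then both sides use fast symmetric crypto
recovered_key = server_private_key.decrypt(encrypted_session_key, oaep)
ciphertext = Fernet(session_key).encrypt(b'GET / HTTP/1.1')
print(Fernet(recovered_key).decrypt(ciphertext))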
1. Specify the URL
2. Send the request
3. Get the response data
4. Persist the data
import requests

# 1. specify the URL (assumed URL — the original link did not survive in the post; the Sogou homepage is assumed)
url = 'https://www.sogou.com/'
# 2. send the request
response = requests.get(url=url)
# 3. get the response data (.text returns a string)
page_text = response.text
# 4. persist it
with open('./sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('over!')
import requests

wd = input('enter a word:')
# assumed URL — the original was lost; the Sogou search endpoint is assumed
url = 'https://www.sogou.com/web'
# wrap the query parameter
param = {'query': wd}
# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
response = requests.get(url=url, params=param, headers=headers)
# manually set the encoding of the response data
response.encoding = 'utf-8'
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(fileName, 'scraped successfully!')
# dynamically loaded data
import requests

city = input('enter a cityName:')
# assumed URL — the original was lost; KFC's store-list Ajax endpoint is assumed
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
data = {
    "cname": "",
    "pid": "",
    "keyword": city,
    "pageIndex": "2",
    "pageSize": "10",
}
# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
response = requests.post(url=url, headers=headers, data=data)
json_text = response.text
print(json_text)
import json
import requests

# scrape the KFC restaurant locations for any city (dynamically loaded data)
city = input('enter a cityName:')
for i in range(1, 9):
    # assumed URL — the original was lost; the same KFC endpoint as above is assumed
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        "cname": "",
        "pid": "",
        "keyword": city,
        "pageIndex": i,
        "pageSize": "10",
    }
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    response = requests.post(url=url, headers=headers, data=data)
    json_text = response.text
    # data_dump = json.dumps(json_text)
    with open('data.json', 'a', encoding='UTF-8') as f:
        f.write(json_text)
# Note: if the data is dynamically loaded, do a global search in the packet-capture tool to confirm which request returns it and work out the request logic.
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# scheme added; the original post listed the endpoint without http://
first_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
ids = []
for page in range(1, 11):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    response = requests.post(url=first_url, data=data, headers=headers)
    # response.headers is the response-header dict
    if response.headers['Content-Type'] == 'application/json;charset=UTF-8':
        json_obj = response.json()
        for dic in json_obj['list']:
            ids.append(dic['ID'])

detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for _id in ids:
    data = {'id': _id}
    company_text = requests.post(detail_url, data=data, headers=headers).text
    print(company_text)
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# assumed URL — the original was lost; the Baidu logo image is assumed
img_url = 'https://www.baidu.com/img/bd_logo1.png'
img_data = requests.get(url=img_url, headers=headers).content
with open('./baidu_log.jpg', 'wb') as f:
    f.write(img_data)

####### method 2
from urllib import request
request.urlretrieve(img_url, './baidu_log2.jpg')
Parsing principle: locate the tags, then extract the text stored in them or the data stored in their attributes.
Scrape all the images on the Qiushibaike homepage with a regex.
pip install requests
'''
<div class="thumb">
<a href="/article/121859578" target="_blank">
<img src="//pic.qiushibaike/system/pictures/12185/121859578/medium/YZQA73IAY8J68GXC.jpg" alt="麻烦p的搞笑一点">
</a>
</div>
'''
import os
import re
import requests
from urllib import request

if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# assumed URL — the original was lost; the Qiushibaike picture section is assumed
url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url=url, headers=headers).text
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
img_url = re.findall(ex, page_text, re.S)  # re.S (DOTALL) lets . match newlines as well
for url in img_url:
    url = 'https:' + url
    img_name = url.split('/')[-1]
    img_path = './qiutu/' + img_name
    request.urlretrieve(url, img_path)
    print(img_name, 'downloaded successfully')
Parsing principle: instantiate a BeautifulSoup object, load the page source into it, then use the object's properties and methods to locate tags and extract data.
Two ways to instantiate it:
BeautifulSoup(page_text, 'lxml')   # load page source fetched from the internet into the object
BeautifulSoup(fp, 'lxml')          # load a local HTML file (an open file object) into the object
The advantage is that tags can be accessed directly as attributes and methods of the soup object.
from bs4 import BeautifulSoup

fp = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(fp, "lxml")
# print(soup.title)
# print(soup.div)          # by default returns the first div
# print(soup.find('a'))    # find an a tag — also the first one by default

# attribute-based location
# print(soup.find('div', class_='song'))
# print(soup.find_all('div')[2])   # find all divs and take the one at index 2 (indices start at 0)
# select() takes CSS selectors
# print(soup.select('.song'))
# print(soup.select('div'))        # returns a list

# hierarchy selectors
# > means one level down, a space means any number of levels down
# print(soup.select('.tang > ul > li > a'))   # all the a tags
# print(soup.select('.tang a'))               # all the a tags
# get text: .string returns only the direct text, .text / get_text() return all of it
# print(soup.p.string)
# print(soup.find('div', class_='tang').get_text())
# print(soup.find('div', class_='tang').text)

# get attributes
# print(soup.a['href'])
# print(soup.select('.tang>ul>li>a')[0]['href'])
Parsing principle: instantiate an etree object, load the page source into it, then use the object's xpath method with XPath expressions to locate tags and extract data.
Two ways:
etree.parse('path/to/local_file.html')   # load a local HTML file
etree.HTML(page_text)                    # load page source fetched from the internet
from lxml import etree

tree = etree.parse('./test.html')
# locate the title tag
# print(tree.xpath('/html/head/title/text()'))
# print(tree.xpath('/html//title'))
# print(tree.xpath('//title/text()'))
# locate by class
print(tree.xpath('//div[@class="song"]/p[1]/text()')[0])
print(tree.xpath('//div[@class="tang"]/ul/li[4]/a/text()'))
# locate by id (note: this expression as written still filters on a class named "id")
print(tree.xpath('//div[@class="id"]/ul/li[4]/a/text()'))
# get attributes — when you want an attribute, end the expression with @attr
print(tree.xpath('//a/@title'))   # all title attributes
print(tree.xpath('//a/@href'))    # all href attributes
import os
import requests
from urllib import request
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))
if not os.path.exists('./meinvs'):
    os.mkdir('./meinvs')
# generic URL template (do not modify); assumed URL — the original was lost, a pic.netbian.com section is assumed
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(start_page, end_page + 1):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = format(url % page)
    response = requests.get(url=new_url, headers=headers)
    # response.encoding = 'utf-8'
    page_text = response.text
    # parse the image name and the src attribute
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        img_name = img_name.encode('iso-8859-1').decode('gbk') + '.jpg'   # fix the garbled gbk file name
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_path = './meinvs/' + img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, 'downloaded successfully!')
# scrape the names of all cities in China
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# assumed URL — the original was lost; the aqistudy history-data page is assumed
url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# hot_city = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
# all_city = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
# both lists can be fetched in one expression, joined with |
all_city = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# assumed URL — the original was lost; the Boss 直聘 job-search page is assumed
url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
page_text = requests.get(url=url, headers=headers).text
# parse the data
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="job-list"]//li')
for li in li_list:
    position = li.xpath('.//div[@class="job-title"]/text()')[0]
    salary = li.xpath('.//span[@class="red"]/text()')[0]
    company = li.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
    url_tail = li.xpath('.//div[@class="info-primary"]//a/@href')[0]
    print(url_tail)
    # the detail URLs are relative; prepend the site root (domain assumed)
    url_tail = 'https://www.zhipin.com' + url_tail
    page_text_tail = requests.get(url=url_tail, headers=headers).text
    tree2 = etree.HTML(page_text_tail)
    detail_list = tree2.xpath('//div[@class="detail-content"]')
    for v in detail_list:
        desc = v.xpath('.//div[@class="job-sec"]/div[@class="text"]/text()')
        print(position, salary, company, desc)
- Anonymity level:
  - Transparent: the target server knows you are using a proxy and also knows your real IP.
  - Anonymous: the target server knows you are using a proxy but does not know your real IP.
  - Elite (high anonymity): the target server neither knows you are using a proxy nor knows your real IP.
- Type:
  - http: a proxy IP of this type can only be used for requests to http URLs.
  - https: a proxy IP of this type can only be used for requests to https URLs.
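A minimal sketch of passing a proxy to requests (the proxy address is a placeholder, not a live proxy):

import requests

# the key must match the scheme of the target URL; the address below is a placeholder
proxies = {'https': 'https://127.0.0.1:8888'}
headers = {'User-Agent': 'Mozilla/5.0 ...'}
page_text = requests.get('https://www.baidu.com/s?wd=ip', headers=headers, proxies=proxies).text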
Common parameters of requests' get and post methods:
url
headers
data/params (post uses data, get uses params)
proxies (proxy servers)
Connection: close  # close the connection as soon as each request completes (one connection per request)
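A minimal sketch of putting Connection: close into the request headers:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 ...',
    'Connection': 'close',   # ask that the connection not be kept alive after this request
}
response = requests.get('https://www.example.com/', headers=headers)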
import os
import requests
from lxml import etree

# start_page = int(input('start page num:'))
# end_page = int(input('end page num:'))
if not os.path.exists('./daili'):
    os.mkdir('./daili')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
for i in range(1, 3):
    # assumed URL — the original was lost; the 西刺 free-proxy list is assumed
    url = 'https://www.xicidaili.com/nn/{}'.format(i)
    response = requests.get(url=url, headers=headers).text
    # build the etree
    tree = etree.HTML(response)
    tr_list = tree.xpath('//*[@id="ip_list"]//tr[@class="odd"]')
    # print(tr_list)
    for tr in tr_list:
        one_ip = tr.xpath('.//td[2]/text()')[0]
        port = tr.xpath('.//td[3]/text()')[0]
        list_wr = one_ip + ':' + port
        # print(list_wr)
        with open('./ip.txt', 'a') as f:
            f.write(list_wr + '\n')
import random
import requests

https = [
    {'https': "122.193.244.58:9999"},
]
http = [
    {'http': "101.132.131.158:8118"},
    {'http': "120.210.219.101:8080"},
]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# assumed URL — the original was lost; a Baidu search for "ip" is assumed
url = 'https://www.baidu.com/s?wd=ip'
if url.split(':')[0] == 'https':
    page_text = requests.get(url=url, headers=headers, proxies=random.choice(https)).text
    print(page_text)
else:
    page_text = requests.get(url=url, headers=headers, proxies=random.choice(http)).text
with open('./ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
- cookie: lets the server record state about the client.
- Ways to handle cookies:
  - Manually (copy the cookie into the headers) — but cookies have a limited validity period and change dynamically.
  - Automatically — use the session mechanism.
- How to use a session:
  Instantiate a session object: requests.Session()
  The session object can send requests (get, post).
  Any cookie produced during those requests is stored in the session automatically.
# requirement: scrape the news titles and summaries from 雪球网 (xueqiu.com)
import requests

# assumed URL — the original was lost; the domain is assumed, the path can be read off the error_uri below
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
json_obj = requests.get(url=url, headers=headers).json()
print(json_obj)
# without the cookie the server rejects the request:
# {'error_description': '遇到错误,请刷新页面或者重新登录帐号后再试', 'error_uri': '/v4/statuses/public_timeline_by_category.json', 'error_code': '400016'}
The session-based approach:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
session = requests.Session()
# if this request produces a cookie, it is automatically stored in the session (homepage URL assumed)
session.get(url='https://xueqiu.com/', headers=headers)
# now send the real request, carrying the cookie — note that requests has been replaced by session here
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
json_obj = session.get(url=url, headers=headers).json()
print(json_obj)
import http.client, mimetypes, urllib, json, time, requests


######################################################################
class YDMHttp:
    # assumed URL — the original was lost; the yundama API endpoint is assumed
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey,
                'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text


###################### start ############################################
# wrap the platform's sample code into a reusable function
def transformCodeImg(imgPath, imgType):
    # ordinary user name
    username = 'bobo328410948'
    # password
    password = 'bobo328410948'
    # software ID, required for developer revenue sharing; obtained from the developer console
    appid = 6003
    # software key, obtained from the developer console
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'
    # image file
    filename = imgPath
    # captcha type, e.g. 1004 means 4 alphanumeric characters; pricing differs per type, so fill it in accurately
    codetype = imgType
    # timeout in seconds
    timeout = 30
    result = None
    # sanity check
    if (username == 'username'):
        print('set the parameters before testing')
    else:
        # initialize
        yundama = YDMHttp(username, password, appid, appkey)
        # log in to the platform
        uid = yundama.login()
        print('uid: %s' % uid)
        # check the balance
        balance = yundama.balance()
        print('balance: %s' % balance)
        # start recognition: image path, captcha type ID, timeout (s); returns the result
        cid, result = yundama.decode(filename, codetype, timeout)
    return result


import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# the requests must carry the cookie, so use a session
s = requests.Session()
# assumed URL — the original was lost; the gushiwen login page is assumed
url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
page_text = s.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# the captcha image URL (site root assumed)
img_src = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
# fetch the image binary and write it to disk:
# the captcha image request also produces a cookie, so fetch it with the session
img_data = s.get(url=img_src, headers=headers).content
with open('./gushiwen.jpg', 'wb') as f:
    f.write(img_data)
# send the captcha image and its type to the code-breaking platform
result = transformCodeImg('./gushiwen.jpg', 1004)
print(result, 'the decoded captcha')

# hidden form values that must be sent along with the login request
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# simulate the login (assumed URL — the original was lost; the gushiwen login endpoint is assumed)
post_url = 'https://so.gushiwen.org/user/login.aspx?from='
data = {
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "from": "",
    "email": "1820405927@qq",
    "pwd": "1213",
    "code": result,
    "denglu": "登录",
}
response = s.post(url=post_url, headers=headers, data=data)
print(response.status_code)   # status after logging in
# write the post-login page to an html file
page_text = response.text
with open('./gushiwen.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
from time import sleep
import time
from multiprocessing.dummy import Pool

urls = ['www.baidu', 'www.songou', 'www.xinlang']   # placeholder strings, only used for the demo output

def request(url):
    print('requesting:', url)
    sleep(2)
    print('downloaded', url)

start = time.time()
pool = Pool(3)
pool.map(request, urls)
print(time.time() - start)
* The thread pool should be applied to the most time-consuming operations in the crawler.
Here the time-consuming operations are downloading the videos and saving them.
# use a thread pool to scrape the short videos from 梨视频 (pearvideo)
import random
import re
import requests
from lxml import etree
from multiprocessing.dummy import Pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# assumed URL — the original was lost; one of pearvideo's category pages is assumed
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url=url, headers=headers).text

pool = Pool(4)
viseo_urls = []   # URLs of all the videos
tree = etree.HTML(page_text)
# parse the detail-page links of the videos
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
'''
the real video URL is embedded in a JS variable on the detail page, e.g.:
var contId="1559965", ..., srcUrl="....mp4", vdoUrl=srcUrl, ...
so it is extracted with the regex:
ex = 'srcUrl="(.*?)",vdoUrl'
'''

def getVideoData(url):
    # download the binary video data
    return requests.get(url=url, headers=headers).content

def saveVideo(data):
    # save under a random file name
    name = str(random.randint(0, 9999)) + '.mp4'
    with open(name, 'wb') as f:
        f.write(data)
    print(name, 'downloaded successfully')

for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]   # site root assumed
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_src = re.findall(ex, detail_page_text, re.S)[0]   # extract the video URL with the regex
    viseo_urls.append(video_src)
print(viseo_urls)
# download the video data asynchronously with the thread pool
all_video_data_list = pool.map(getVideoData, viseo_urls)
# save the videos
pool.map(saveVideo, all_video_data_list)