
1. Types of crawlers: general-purpose, focused, and incremental.
2. What is UA detection, and how do you get around it?
The server reads the User-Agent from the request headers and checks its value; the UA identifies what kind of client sent the request. To get around the check, send a spoofed headers dict with a browser User-Agent.
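A minimal sketch of the fix (my own illustration, not from the original notes; httpbin.org is used only because it echoes the request headers back):

import requests

# pretend to be an ordinary Chrome browser instead of the default python-requests UA
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
print(requests.get('https://httpbin.org/headers', headers=headers).text)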
3. Briefly describe the HTTPS encryption process.
4. What is dynamically loaded data, and how do you crawl it?
Some sites generate part of their data dynamically with Ajax, so what you see in the page source is not what you get. Use a packet-capture tool to analyze the requests, take the parameters, send the request yourself, and parse the returned data.
Common request arguments: url, data, headers, proxies.
Symmetric encryption: the client defines the encryption rule and sends both the ciphertext and the decryption rule to the server, which then decrypts. Drawback: if a third party intercepts the traffic, it gets the rule too and can decrypt everything.
Asymmetric encryption: the server creates an encryption/decryption key pair (public/private key) and gives the public key to the client; the client encrypts with the public key and sends the ciphertext to the server, which decrypts with its private key. Drawback: it is slow.
HTTPS certificate mechanism: client and server rely on a trusted third-party certificate authority. Before the server sends its public key to the client, it has the CA sign it; the resulting certificate is sent to the client together with the public key. The client checks whether the public key was really signed by the CA, and only if it was does it use that public key for encryption.
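A rough sketch of the two primitives described above (my own addition; it assumes the third-party cryptography package, pip install cryptography, and only illustrates the symmetric/asymmetric difference, not what TLS actually does internally):

from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import rsa, padding

# Symmetric: one shared key both encrypts and decrypts -- fast, but the key itself has to travel
key = Fernet.generate_key()
f = Fernet(key)
token = f.encrypt(b'hello')
assert f.decrypt(token) == b'hello'

# Asymmetric: the public key encrypts, only the private key decrypts -- slower,
# but the private key never leaves the server
private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
public_key = private_key.public_key()
oaep = padding.OAEP(mgf=padding.MGF1(algorithm=hashes.SHA256()),
                    algorithm=hashes.SHA256(), label=None)
ciphertext = public_key.encrypt(b'hello', oaep)
assert private_key.decrypt(ciphertext, oaep) == b'hello'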
The basic requests workflow:
1. Specify the URL
2. Send the request
3. Get the response data
4. Persist the data
import requests
# 1. specify the URL
url = 'https://www.sogou.com/'
# 2. send the request
response = requests.get(url=url)
# 3. get the response data
page_text = response.text  # .text returns the response body as a string
# 4. persist it
with open('./sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('over!')

import requests
wd = input('enter a word:')
url = 'https://www.sogou.com/web'
# wrap the query parameter
param = {'query': wd}
# UA spoofing
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
response = requests.get(url=url, params=param, headers=headers)
# manually set the response encoding
response.encoding = 'utf-8'
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(fileName, '爬取成功!!!')

# dynamically loaded data
city = input('enter a cityName:')
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
data = {"cname": "","pid": "","keyword": city,"pageIndex": "2","pageSize": "10",
}
# UA spoofing
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
response = requests.post(url=url, headers=headers, data=data)
json_text = response.text
print(json_text)

# crawl the KFC restaurant locations for any given city
import json
import requests
# dynamically loaded data
city = input('enter a cityName:')
for i in range(1, 9):
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        "cname": "",
        "pid": "",
        "keyword": city,
        "pageIndex": i,
        "pageSize": "10",
    }
    # UA spoofing
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    response = requests.post(url=url, headers=headers, data=data)
    json_text = response.text
    # data_dump = json.dumps(json_text)
    with open('data.json', "a", encoding="UTF-8") as f:
        f.write(json_text)

# Note: for dynamically loaded data, do a global search in the packet-capture tool to find the request that actually returns it.
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
first_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
ids = []
for page in range(1, 11):
    data = {
        "on": "true",
        "page": str(page),
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    response = requests.post(url=first_url, data=data, headers=headers)
    # response.headers is the response-header dict
    if response.headers['Content-Type'] == 'application/json;charset=UTF-8':
        json_obj = response.json()
        for dic in json_obj['list']:
            ids.append(dic['ID'])

detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for _id in ids:
    data = {'id': _id}
    company_text = requests.post(detail_url, data=data, headers=headers).text
    print(company_text)
import requests
headers={
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
img_url='.png'
img_data = requests.get(url=img_url, headers=headers).content
with open('./baidu_log.jpg', 'wb') as f:
    f.write(img_data)

# Method 2
from urllib import request
img_url='.png'
request.urlretrieve(img_url, './baidu_log2.jpg')

Regex parsing principle: locate the tags, then extract the text stored inside them or the data held in their attributes.
Crawl every image on the Qiushibaike front page with regular expressions
pip install requests
'''
<div class="thumb"><a href="/article/121859578" target="_blank">
<img src="//pic.qiushibaike/system/pictures/12185/121859578/medium/YZQA73IAY8J68GXC.jpg" alt="麻烦p的搞笑一点">
</a></div>
'''
import os
import re
import requests
from urllib import request
if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
url='/'
page_text = requests.get(url=url, headers=headers).text
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
img_url = re.findall(ex, page_text, re.S)  # re.S: make '.' also match newlines (dot-all mode)
for url in img_url:
    url = 'https:' + url
    img_name = url.split('/')[-1]
    img_path = './qiutu/' + img_name
    request.urlretrieve(url, img_path)
    print(img_name, '下载成功')

BeautifulSoup parsing principle: instantiate a BeautifulSoup object, load the page source into it,
then use the object's attributes and methods to locate tags and extract the data.
Two ways to instantiate it:
BeautifulSoup(page_text, 'lxml')  # load page source fetched from the internet
BeautifulSoup(fp, 'lxml')         # load a local HTML file object
The nice part is that tags can be accessed directly as attributes of the soup object.
from bs4 import BeautifulSoup
fp = open('./test.html','r',encoding='utf-8')
soup=BeautifulSoup(fp,"lxml")
# print(soup.title)
# print(soup.div)  # by default returns the first div
# print(soup.find('a'))  # find an <a>, first one by default
# attribute-based location
# print(soup.find('div',class_='song'))
# print(soup.find_all('div')[2])  # find all divs and take the one at index 2 (0-based)
# select(): CSS selectors
# print(soup.select('.song'))
# print(soup.select('div'))  # returns a list
# hierarchy selectors
# '>' means one direct level down; a space means any number of levels
# print(soup.select('.tang > ul > li > a'))  # all the <a> under .tang > ul > li
# print(soup.select('.tang a'))  # all <a> anywhere under .tang
# getting text: .string returns only the tag's own (direct) text; .text / get_text() return all nested text
# print(soup.p.string)
# print(soup.find('div',class_='tang').get_text())
# print(soup.find('div',class_='tang').text)
# get attribute values
# print(soup.a['href'])
# print(soup.select('.tang>ul>li>a')[0]['href'])

xpath parsing principle: instantiate an etree object, load the page source into it, then use the object's
xpath() method together with XPath expressions to locate tags and extract data.
Two ways:
etree.parse('local_file_path')  # load a local file
etree.HTML(page_text)           # load page source fetched from the internet
from lxml import etree
tree = etree.parse('./test.html')
# locate the title tag
# print(tree.xpath('/html/head/title/text()'))  # find the title
# print(tree.xpath('/html//title'))
# print(tree.xpath('//title/text()'))
# locate by class attribute
print(tree.xpath('//div[@class="song"]/p[1]/text()')[0])
print(tree.xpath('//div[@class="tang"]/ul/li[4]/a/text()'))
# locate by id
print(tree.xpath('//div[@class="id"]/ul/li[4]/a/text()'))
# get attribute values
print(tree.xpath('//a/@title'))  # every title attribute; use @attr to read an attribute
print(tree.xpath('//a/@href'))   # every href attribute

import requests
from lxml import etree
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))
if not os.path.exists('./meinvs'):
    os.mkdir('./meinvs')
# generic URL template (do not modify)
url = '.html'
for page in range(start_page, end_page + 1):
    if page == 1:
        new_url = '/'
    else:
        new_url = format(url % page)
    response = requests.get(url=new_url, headers=headers)
    # response.encoding = 'utf-8'
    page_text = response.text
    # parse the image name and the src attribute
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        img_name = img_name.encode('iso-8859-1').decode('gbk') + '.jpg'
        img_src = '' + li.xpath('./a/img/@src')[0]
        img_path = './meinvs/' + img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, '下载成功!!!')

# crawl the names of every city nationwide
url = '/'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# hot_city = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
# all_city = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
# all_city
tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')

import requests
from lxml import etree
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = '/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
page_text = requests.get(url=url, headers=headers).text
# parse the data
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="job-list"]//li')
for li in li_list:
    position = li.xpath('.//div[@class="job-title"]/text()')[0]
    salary = li.xpath('.//span[@class="red"]/text()')[0]
    gongsi = li.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
    url_tail = li.xpath('.//div[@class="info-primary"]//a/@href')[0]
    print(url_tail)
    url_tail = '/' + url_tail
    page_text_tail = requests.get(url=url_tail, headers=headers).text
    tree2 = etree.HTML(page_text_tail)
    maiosu_list = tree2.xpath('//div[@class="detail-content"]')
    for v in maiosu_list:
        a = v.xpath('.//div[@class="job-sec"]/div[@class="text"]/text()')
        print(position, salary, gongsi, a)

- Proxy anonymity levels:
    - Transparent: the target server knows you are using a proxy and also knows your real IP
    - Anonymous: the target server knows you are using a proxy but does not know your real IP
    - Elite (high anonymity): the target server does not know you are using a proxy, let alone your real IP
- Proxy types:
    - http: this kind of proxy IP can only carry requests to http:// URLs
    - https: this kind of proxy IP can only carry requests to https:// URLs
Common parameters for requests.get / requests.post (a small usage sketch follows this list):
- url
- headers
- data / params (post uses data, get uses params)
- proxies (proxy dict)
Connection: close  # close each connection after a single request
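A small sketch of how those arguments fit together (my own illustration; the URL and proxy address below are placeholders, not from the notes):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}            # UA spoofing
params = {'kw': 'python'}                          # query-string parameters for a GET request
proxies = {'https': 'https://127.0.0.1:7890'}      # hypothetical proxy address
response = requests.get('https://httpbin.org/get',
                        params=params,
                        headers=headers,
                        proxies=proxies,
                        timeout=5)
print(response.status_code, response.text[:200])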
import os
import requests
from lxml import etree
# start_page=int(input('start page num:'))
# end_page = int(input('end page num:'))
if not os.path.exists('./daili'):
    os.mkdir('./daili')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
for i in range(1, 3):
    url = '/{}'.format(i)
    response = requests.get(url=url, headers=headers).text
    # instantiate the etree object
    tree = etree.HTML(response)
    tr_list = tree.xpath('//*[@id="ip_list"]//tr[@class="odd"]')
    # print(tr_list)
    for tr in tr_list:
        one_ip = tr.xpath('.//td[2]/text()')[0]
        port = tr.xpath('.//td[3]/text()')[0]
        list_wr = one_ip + ':' + port
        # print(list_wr)
        with open('./ip.txt', 'a') as f:
            f.write(list_wr + '\n')

import random
import requests
https = [
    {'https': "122.193.244.58:9999"},
]
http = [
    {'http': "101.132.131.158:8118"},
    {'http': "120.210.219.101:8080"},
]
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
url = '=ip'
if url.split(':')[0] == 'https':
    page_text = requests.get(url=url, headers=headers, proxies=random.choice(https)).text
    print(page_text)
else:
    page_text = requests.get(url=url, headers=headers, proxies=random.choice(http)).text
    with open('./ip.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)

- cookie: lets the server keep track of the client's state
- Ways to handle cookies:
    - Manual handling: cookies have a limited lifetime and change dynamically, so this is fragile
    - Automatic handling: use the session mechanism
- How to use a session:
    Instantiate a session object: requests.Session()
    It can send requests (get, post) just like the requests module
    Any cookie produced during a request is automatically stored in the session and carried on later requests
# Goal: crawl the news titles and content summaries from Xueqiu (雪球网)
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
json_obj = requests.get(url=url, headers=headers).json()
print(json_obj)
# Output without a session -- the request is rejected:
{'error_description': '遇到错误,请刷新页面或者重新登录帐号后再试', 'error_uri': '/v4/statuses/public_timeline_by_category.json', 'error_code': '400016'}

The fix: carry the cookie automatically with a session.
import requests
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
session=requests.Session()
# if this step produces a cookie, the cookie is stored in the session automatically
session.get(url='https://xueqiu.com/', headers=headers)
# now send the request to the target url, carrying the cookie along
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
# use the session object instead of requests here
json_obj = session.get(url=url, headers=headers).json()
print(json_obj)

import http.client, mimetypes, urllib, json, time, requests
######################################################################
class YDMHttp:
    apiurl = '.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey,
                'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password,
                'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text
###################### start ###########################################
# wrap the vendor's sample code into a reusable function
def transformCodeImg(imgPath, imgType):
    # regular account username
    username = 'bobo328410948'
    # password
    password = 'bobo328410948'
    # software ID, required for the developer revenue share; taken from the developer console
    appid = 6003
    # software key, also taken from the developer console
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'
    # image file
    filename = imgPath
    # captcha type, e.g. 1004 = 4 alphanumeric characters; different types are billed differently
    codetype = imgType
    # timeout in seconds
    timeout = 30
    result = None
    # sanity check
    if (username == 'username'):
        print('请设置好相关参数再测试')
    else:
        # initialize
        yundama = YDMHttp(username, password, appid, appkey)
        # log in to Yundama
        uid = yundama.login()
        print('uid: %s' % uid)
        # check the balance
        balance = yundama.balance()
        print('balance: %s' % balance)
        # start recognition: image path, captcha type ID, timeout (seconds)
        cid, result = yundama.decode(filename, codetype, timeout)
    return result


import requests
from lxml import etree
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# requests are sent with the cookie attached
s=requests.Session()
url='.aspx?from=.aspx'
page_text = s.get(url=url, headers=headers).text
tree=etree.HTML(page_text)
# captcha image URL
img_src='/'+tree.xpath('//*[@id="imgCode"]/@src')[0]
# fetch the captcha image bytes and write them to a file
img_data = s.get(url=img_src, headers=headers).content  # fetching the captcha also produces a cookie
with open('./gushiwen.jpg', 'wb') as f:
    f.write(img_data)
# arguments: captcha image path, captcha type
result=transformCodeImg('./gushiwen.jpg',1004)
print(result, '打印出打码后的验证码')
# values that must be sent with the login form
__VIEWSTATE=tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# simulate the login
post_url='.aspx?from='
data={
"__VIEWSTATE":__VIEWSTATE,
"__VIEWSTATEGENERATOR":__VIEWSTATEGENERATOR,
"from":"",
"email": "1820405927@qq",
"pwd": "1213",
"code": result,
"denglu": "登录"
}
response=s.post(url=post_url,headers=headers,data=data)
print(response.status_code)  # status code after logging in
page_text = response.text  # the page after login; write it to an html file
with open('./gushiwen.html', 'w', encoding='utf-8') as f:
    f.write(page_text)

from time import sleep
import time
from multiprocessing.dummy import Pool
urls=['www.baidu','www.songou','www.xinlang']
def request(url):
    print('正在请求:', url)
    sleep(2)
    print('下载成功', url)
start = time.time()
pool = Pool(3)
pool.map(request,urls)
print(time.time() - start)

* Apply the thread pool to the most time-consuming operations in the crawler.
The time-consuming operations here are downloading the videos and saving them.
# use a thread pool to crawl the short videos on the site
from lxml import etree
import requests
import random
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
url = ''
page_text = requests.get(url=url, headers=headers).text

from multiprocessing.dummy import Pool
pool = Pool(4)
viseo_urls = []  # URLs of every video
tree = etree.HTML(page_text)
# parse the detail-page url of each video
li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
'''
var contId="1559965",liveStatusUrl="liveStatus.jsp",liveSta="",playSta="1",autoPlay=!1,isLiving=!1,isVrVideo=!1,hdflvUrl="",sdflvUrl="",hdUrl="",sdUrl="",ldUrl="",
srcUrl=".mp4",vdoUrl=srcUrl,skinRes="//www.pearvideo/domain/skin",videoCDN="//video.pearvideo";
ex='srcUrl="(.*?)",vdoUrl'
'''
import re

def getiVideoData(url):
    return requests.get(url=url, headers=headers).content
# save with a random filename
def saveVido(data):
    name = str(random.randint(0, 9999)) + '.mp4'
    with open(name, 'wb') as f:
        f.write(data)
    print(name, '下载成功')
for li in li_list:
    detail_url = '/' + li.xpath('./div/a/@href')[0]
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_src = re.findall(ex, detail_page_text, re.S)[0]  # extract the video url with the regex
    viseo_urls.append(video_src)
print(viseo_urls)
# download the video data asynchronously with the thread pool
all_video_data_list=pool.map(getiVideoData, viseo_urls)
# save the videos
pool.map(saveVido,all_video_data_list)