Scraping images from Yesky (天极网) with Python
To scrape images from Yesky with Python, import the requests module along with os and bs4. Fetch the page URL, open the HTML, and inspect the structure to locate the div tag (and its class attribute) that wraps the images; then, inside each image's child tag, find the img tag and read its URL. With that URL in hand, send a request via requests and save the result into the img folder under the Django project.
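Distilled from that description, a minimal sketch of the fetch-and-save step looks like the helper below. The User-Agent header and timeout are assumptions added here (the post's own scripts use bare requests.get calls); the full scripts follow.

import os
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed header; many sites reject requests without one

def save_image(url, dest_dir):
    # download one image URL into dest_dir, named after the last URL segment
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # fail loudly on HTTP errors
    os.makedirs(dest_dir, exist_ok=True)  # create the folder if it is missing
    path = os.path.join(dest_dir, url.rsplit('/', 1)[-1])
    with open(path, 'wb') as f:
        f.write(resp.content)
    return path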
# low version
# scrape every thumbnail shown on the listing page
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
print(base_path)
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full listing-page URL is elided in the original
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
# find every dl tag inside that div (the wrapper around each image)
list_dl = div_obj.find_all(name='dl')
for dl in list_dl:  # one dl per image
    # find the img inside the dl; first check whether the image is unique
    img = dl.find(name='img')
    # read an attribute off a tag object with .get()
    img_src = img.get('src')
    # with the img URL in hand, request the image itself
    img_response = requests.get(img_src)
    # build the file path from the last segment of the URL
    file_path = os.path.join(img_path, img_src.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
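One caveat: the script above assumes the img folder already exists and will raise FileNotFoundError otherwise. A one-line guard (standard library, not in the original) avoids that:

os.makedirs(img_path, exist_ok=True)  # create img/ if missing; a no-op when it already exists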
# intermediate version
# the thumbnails shown after clicking into an image
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full listing-page URL is elided in the original
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
# find every dd tag inside that div (the wrapper around each image)
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:  # one dd per image
    a_obj = dd.find('a')
    # join the folder path and create a per-image folder (assumption: named after the link text)
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):  # skip creation if the folder already exists
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail pages are GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # print(div_obj2)
    # try:
    img_list = div_obj2.find_all(name='img')
    for img in img_list:
        img_src = img.get('src')
        img_response = requests.get(img_src)
        file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
    # except Exception as e:
    break  # stop after the first dd while testing
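The hard-coded a_response.encoding = 'GBK' matches how Yesky serves its detail pages. When a site's charset is unknown, requests can guess it from the response body instead; this is a heuristic, not a guarantee:

a_response.encoding = a_response.apparent_encoding  # charset sniffed from the raw body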
# high-res version
# every high-res image under each picture
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full listing-page URL is elided in the original
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
# find every dd tag inside that div (the wrapper around each image)
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:  # one dd per image
    a_obj = dd.find('a')
    # join the folder path and create a per-image folder (assumption: named after the link text)
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):  # skip creation if the folder already exists
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail pages are GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # print(div_obj2)
    try:
        img_list = div_obj2.find_all(name='img')
        for img in img_list:
            img_src = img.get('src')
            # swap the thumbnail size in the URL for the high-res size
            img_response = requests.get(img_src.replace('113x113', '740x-'))
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    except Exception as e:
        pass
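The size swap works because the thumbnail URL and the high-res URL differ only in one path segment. Illustrated with a made-up URL of that shape (the real Yesky paths are not reproduced here):

thumb = 'http://pic.example.com/uploadImages/113x113/abc.jpg'  # hypothetical URL in the same shape
print(thumb.replace('113x113', '740x-'))  # http://pic.example.com/uploadImages/740x-/abc.jpg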
Scraping the high-res images from five pages with multiple processes / threads
import threading
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count  # number of CPU cores on this machine

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

def picture(num):
    response = requests.get(f'{num}.shtml')  # the full page-URL pattern is elided in the original
    soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
    div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
    # find every dd tag inside that div (the wrapper around each image)
    list_dd = div_obj.find_all(name='dd')
    for dd in list_dd:  # one dd per image
        a_obj = dd.find('a')
        # join the folder path and create a per-image folder (assumption: named after the link text)
        dir_path = os.path.join(img_path, a_obj.text)
        if not os.path.isdir(dir_path):  # skip creation if the folder already exists
            os.mkdir(dir_path)
        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'  # the detail pages are GBK-encoded
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
        # print(div_obj2)
        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get('src')
                # swap the thumbnail size in the URL for the high-res size
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception as e:
            pass

if __name__ == "__main__":
    import time
    start = time.time()
    # process pool
    # p = ProcessPoolExecutor(max_workers=cpu_count())
    # # print(cpu_count())
    # for i in range(1, 6):
    #     p.submit(picture, i)
    # p.shutdown()
    # thread pool
    t = ThreadPoolExecutor(max_workers=cpu_count())
    for i in range(1, 6):
        t.submit(picture, i)
    t.shutdown()
    print('Elapsed time: {}'.format(time.time() - start))
    # plain threads
    # for i in range(1, 6):
    #     a = threading.Thread(target=picture, args=(i,))
    #     a.start()
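Since the work here is I/O-bound (mostly waiting on HTTP responses), the thread pool is the natural fit; a process pool mainly pays off for CPU-bound work. As a usage note, the submit loop plus shutdown can also be written with a context manager and map; a sketch, not the post's code:

with ThreadPoolExecutor(max_workers=cpu_count()) as t:  # shutdown(wait=True) runs on exit
    t.map(picture, range(1, 6))  # one task per page number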