Scraping Photos with Python


Scraping Yesky (天极网) images with Python

To scrape images from Yesky (天极网) with Python, import the requests module along with os and bs4. Get the page address, open the HTML page, and study its structure to find the div tag that wraps the images and its class attribute. Then, inside each image's child tag, locate the img tag it contains and the image's URL. Once you have that URL, send a request with requests and save the image into the img folder under the Django project.
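Before the full scripts, here is a minimal, self-contained sketch of the BeautifulSoup navigation that paragraph describes, run against a tiny inline HTML snippet shaped like the Yesky list page (the class names match the scripts below; the HTML itself is made up for illustration):

from bs4 import BeautifulSoup

# Made-up HTML in the shape the scripts below expect: a div.lb_box wrapping dl/dd entries.
html = '''
<div class="lb_box">
  <dl><dd><a href="/photo/1.shtml"><img src="http://example.com/113x113_a.jpg"></a></dd></dl>
  <dl><dd><a href="/photo/2.shtml"><img src="http://example.com/113x113_b.jpg"></a></dd></dl>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
box = soup.find('div', attrs={'class': 'lb_box'})   # the container div found by inspecting the page
for dl in box.find_all('dl'):                       # one dl per thumbnail
    img = dl.find('img')
    print(img.get('src'))                           # the URL you would then download with requests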

# Low version
# Scrape every thumbnail shown on the list page

import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
print(base_path)
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full list-page URL is elided in the original post
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response to BS4 for parsing
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located after inspecting the page

# From the div, find all dl tags (the outer tag of each image)
list_dl = div_obj.find_all(name='dl')
for dl in list_dl:  # one dl per image
    # Find the img inside the dl; check whether the image is unique
    img = dl.find(name='img')
    # Read an attribute off a tag with tag.get()
    img_src = img.get('src')
    # With the img URL in hand, send a request with requests
    img_response = requests.get(img_src)
    # Build the target path and file name
    file_path = os.path.join(img_path, img_src.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
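The script above assumes the img folder already exists and that every request succeeds. A hedged sketch of a download helper that hardens those two points (the helper name, User-Agent string, and timeout are my own, not from the post): it creates the folder on first use, sends a browser-like User-Agent, and fails loudly on HTTP errors instead of writing an error page to disk.

import os
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}   # assumption: a plain browser UA is enough for this site

def save_image(url, dest_dir):
    """Download one image URL into dest_dir, keeping the original file name."""
    os.makedirs(dest_dir, exist_ok=True)            # avoid FileNotFoundError on the first run
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()                         # surface 4xx/5xx instead of saving garbage
    file_path = os.path.join(dest_dir, url.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(resp.content)
    return file_path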

# Advanced version
# Scrape the few thumbnails shown after clicking into an image

import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full list-page URL is elided in the original post
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response to BS4 for parsing
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located after inspecting the page

# From the div, find all dd tags (the outer tag of each image)
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:  # one dd per image
    a_obj = dd.find('a')
    # Build the folder path and create the folder
    dir_path = os.path.join(img_path, a_obj.text)  # assumption: folder named after the link text (the argument is missing in the original)
    if not os.path.isdir(dir_path):  # does the folder already exist?
        os.mkdir(dir_path)

    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # print(div_obj2)
    # try:
    img_list = div_obj2.find_all(name='img')
    for img in img_list:
        img_src = img.get('src')
        img_response = requests.get(img_src)
        file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
    # except Exception as e:
    break  # stop after the first detail page
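One line worth pausing on is a_response.encoding = 'GBK': the detail pages are served in a GBK-family encoding, and when a server does not declare a charset, requests falls back to ISO-8859-1, which would garble the link text used as the folder name. A hedged alternative (the placeholder URL is not the real Yesky address) is to let requests guess the encoding from the body bytes:

import requests

resp = requests.get('https://example.com/detail.shtml')   # placeholder detail-page URL
# apparent_encoding is the charset detector's guess from the body bytes; falling back to it
# avoids hard-coding 'GBK' if the site ever changes its charset.
resp.encoding = resp.apparent_encoding or 'GBK'
html = resp.text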

# High-resolution images
# Every high-resolution image behind each thumbnail

import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML text

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full list-page URL is elided in the original post
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response to BS4 for parsing
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located after inspecting the page

# From the div, find all dd tags (the outer tag of each image)
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:  # one dd per image
    a_obj = dd.find('a')
    # Build the folder path and create the folder
    dir_path = os.path.join(img_path, a_obj.text)  # assumption: folder named after the link text (the argument is missing in the original)
    if not os.path.isdir(dir_path):  # does the folder already exist?
        os.mkdir(dir_path)

    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # print(div_obj2)
    try:
        img_list = div_obj2.find_all(name='img')
        for img in img_list:
            img_src = img.get('src')
            img_response = requests.get(img_src.replace('113x113', '740x-'))  # swap the thumbnail size segment for the high-res one
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    except Exception as e:
        pass
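A design note on the try/except above: except Exception: pass keeps the crawl running when a detail page has no div.overview, but it also hides which pages were skipped. A hedged sketch of the same idea with a trace left behind (the function name and timeout are my own, not from the post):

import logging
import requests

logging.basicConfig(level=logging.INFO)

def fetch_bytes(url):
    """Return the body bytes for url, or None (with a log line) if anything goes wrong."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.content
    except Exception:
        logging.exception('skipping %s', url)   # keep crawling, but record what failed
        return None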

Scraping the high-resolution images from five pages with multiprocessing / multithreading

import threading
import os
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse the HTML text
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count  # number of CPU cores on this machine

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

def picture(num):
    response = requests.get(f'{num}.shtml')  # the URL template is elided in the original post; num selects the page
    soup = BeautifulSoup(response.text, 'html.parser')  # hand the response to BS4 for parsing
    div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located after inspecting the page
    # From the div, find all dd tags (the outer tag of each image)
    list_dd = div_obj.find_all(name='dd')
    for dd in list_dd:  # one dd per image
        a_obj = dd.find('a')
        # Build the folder path and create the folder
        dir_path = os.path.join(img_path, a_obj.text)  # assumption: folder named after the link text (the argument is missing in the original)
        if not os.path.isdir(dir_path):  # does the folder already exist?
            os.mkdir(dir_path)

        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
        # print(div_obj2)
        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get('src')
                img_response = requests.get(img_src.replace('113x113', '740x-'))  # swap the thumbnail size segment for the high-res one
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception as e:
            pass

if __name__ == "__main__":
    import time
    start = time.time()

    # Process pool
    # p = ProcessPoolExecutor(max_workers=cpu_count())
    # # print(cpu_count())
    # for i in range(1, 6):
    #     p.submit(picture, i)
    # p.shutdown()

    # Thread pool
    t = ThreadPoolExecutor(max_workers=cpu_count())
    for i in range(1, 6):
        t.submit(picture, i)
    t.shutdown()
    print('Elapsed time: {}'.format(time.time() - start))

    # Plain threads
    # for i in range(1, 6):
    #     a = threading.Thread(target=picture, args=(i,))
    #     a.start()
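Since this workload is dominated by network I/O rather than computation, the thread pool is usually the better fit: the GIL is released while waiting on sockets, whereas the process pool pays pickling and startup overhead for little gain. A hedged comparison harness, assuming the picture function defined above is importable from the same module (the helper name is my own):

import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def timed(executor_cls, workers, pages=range(1, 6)):
    """Run picture() over the given pages with the chosen executor and report the wall time."""
    start = time.time()
    with executor_cls(max_workers=workers) as pool:   # the with-block waits for all tasks to finish
        pool.map(picture, pages)
    print(f'{executor_cls.__name__}: {time.time() - start:.1f}s')

if __name__ == '__main__':
    timed(ThreadPoolExecutor, workers=5)
    # timed(ProcessPoolExecutor, workers=5)  # needs picture defined at module level so it can be pickled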
