Scraping images from Yesky (天极网) with Python
To scrape images from Yesky with Python, import the requests module along with os and bs4. Fetch the page URL, open the HTML, and inspect the structure to locate the div tag (and its class attribute) that wraps the images; then, inside each image's child tag, find the img tag and read its URL. With that URL in hand, send a request via requests and save the result into the img folder under the Django project.
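Distilled from that description, a minimal sketch of the fetch-and-save step looks like the helper below. The User-Agent header and timeout are assumptions added here (the post's own scripts use bare requests.get calls); the full scripts follow.

import os
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed header; many sites reject requests without one

def save_image(url, dest_dir):
    # download one image URL into dest_dir, named after the last URL segment
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # fail loudly on HTTP errors
    os.makedirs(dest_dir, exist_ok=True)  # create the folder if it is missing
    path = os.path.join(dest_dir, url.rsplit('/', 1)[-1])
    with open(path, 'wb') as f:
        f.write(resp.content)
    return path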
# low version
# scrape every thumbnail shown on the listing page
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
print(base_path)
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full listing-page URL is elided in the original
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
# find every dl tag inside that div (the wrapper around each image)
list_dl = div_obj.find_all(name='dl')
for dl in list_dl:  # one dl per image
    # find the img inside the dl; first check whether the image is unique
    img = dl.find(name='img')
    # read an attribute off a tag object with .get()
    img_src = img.get('src')
    # with the img URL in hand, request the image itself
    img_response = requests.get(img_src)
    # build the file path from the last segment of the URL
    file_path = os.path.join(img_path, img_src.rsplit('/', 1)[-1])
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
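One caveat: the script above assumes the img folder already exists and will raise FileNotFoundError otherwise. A one-line guard (standard library, not in the original) avoids that:

os.makedirs(img_path, exist_ok=True)  # create img/ if missing; a no-op when it already exists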
# intermediate version
# the thumbnails shown after clicking into an image
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full listing-page URL is elided in the original
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
# find every dd tag inside that div (the wrapper around each image)
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:  # one dd per image
    a_obj = dd.find('a')
    # join the folder path and create a per-image folder (assumption: named after the link text)
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):  # skip creation if the folder already exists
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail pages are GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # print(div_obj2)
    # try:
    img_list = div_obj2.find_all(name='img')
    for img in img_list:
        img_src = img.get('src')
        img_response = requests.get(img_src)
        file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
    # except Exception as e:
    break  # stop after the first dd while testing
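The hard-coded a_response.encoding = 'GBK' matches how Yesky serves its detail pages. When a site's charset is unknown, requests can guess it from the response body instead; this is a heuristic, not a guarantee:

a_response.encoding = a_response.apparent_encoding  # charset sniffed from the raw body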
# high-res version
# every high-res image under each picture
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

response = requests.get('.shtml')  # the full listing-page URL is elided in the original
soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
# find every dd tag inside that div (the wrapper around each image)
list_dd = div_obj.find_all(name='dd')
for dd in list_dd:  # one dd per image
    a_obj = dd.find('a')
    # join the folder path and create a per-image folder (assumption: named after the link text)
    dir_path = os.path.join(img_path, a_obj.text)
    if not os.path.isdir(dir_path):  # skip creation if the folder already exists
        os.mkdir(dir_path)
    a_response = requests.get(a_obj.get('href'))
    a_response.encoding = 'GBK'  # the detail pages are GBK-encoded
    soup2 = BeautifulSoup(a_response.text, 'html.parser')
    div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
    # print(div_obj2)
    try:
        img_list = div_obj2.find_all(name='img')
        for img in img_list:
            img_src = img.get('src')
            # swap the thumbnail size in the URL for the high-res size
            img_response = requests.get(img_src.replace('113x113', '740x-'))
            file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
            with open(file_path, 'wb') as f:
                f.write(img_response.content)
    except Exception as e:
        pass
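The size swap works because the thumbnail URL and the high-res URL differ only in one path segment. Illustrated with a made-up URL of that shape (the real Yesky paths are not reproduced here):

thumb = 'http://pic.example.com/uploadImages/113x113/abc.jpg'  # hypothetical URL in the same shape
print(thumb.replace('113x113', '740x-'))  # http://pic.example.com/uploadImages/740x-/abc.jpg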
Scraping the high-res images from five pages with multiple processes / threads
import threading
import os
import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse the HTML
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count  # number of CPU cores on this machine

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')

def picture(num):
    response = requests.get(f'{num}.shtml')  # the full page-URL pattern is elided in the original
    soup = BeautifulSoup(response.text, 'html.parser')  # hand the response body to BS4
    div_obj = soup.find(name='div', attrs={'class': 'lb_box'})  # the div located by inspecting the page
    # find every dd tag inside that div (the wrapper around each image)
    list_dd = div_obj.find_all(name='dd')
    for dd in list_dd:  # one dd per image
        a_obj = dd.find('a')
        # join the folder path and create a per-image folder (assumption: named after the link text)
        dir_path = os.path.join(img_path, a_obj.text)
        if not os.path.isdir(dir_path):  # skip creation if the folder already exists
            os.mkdir(dir_path)
        a_response = requests.get(a_obj.get('href'))
        a_response.encoding = 'GBK'  # the detail pages are GBK-encoded
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={'class': 'overview'})
        # print(div_obj2)
        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get('src')
                # swap the thumbnail size in the URL for the high-res size
                img_response = requests.get(img_src.replace('113x113', '740x-'))
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except Exception as e:
            pass

if __name__ == "__main__":
    import time
    start = time.time()
    # process pool
    # p = ProcessPoolExecutor(max_workers=cpu_count())
    # # print(cpu_count())
    # for i in range(1, 6):
    #     p.submit(picture, i)
    # p.shutdown()
    # thread pool
    t = ThreadPoolExecutor(max_workers=cpu_count())
    for i in range(1, 6):
        t.submit(picture, i)
    t.shutdown()
    print('Elapsed time: {}'.format(time.time() - start))
    # plain threads
    # for i in range(1, 6):
    #     a = threading.Thread(target=picture, args=(i,))
    #     a.start()
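Since the work here is I/O-bound (mostly waiting on HTTP responses), the thread pool is the natural fit; a process pool mainly pays off for CPU-bound work. As a usage note, the submit loop plus shutdown can also be written with a context manager and map; a sketch, not the post's code:

with ThreadPoolExecutor(max_workers=cpu_count()) as t:  # shutdown(wait=True) runs on exit
    t.map(picture, range(1, 6))  # one task per page number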