# -*- coding: utf-8 -*-
import scrapy
import re
from bs4 import BeautifulSoup
import os


class GetalllinkSpider(scrapy.Spider):
    name = 'getalllink'
    # allowed_domains = ['']
    start_urls = ['.html']  # the start URL was stripped when the post was published

    def parse(self, response):
        # domain is shared with the other callbacks; its value (and the CSS
        # selectors below) were stripped from the published source — fill
        # them in for the target site.
        global domain
        domain = ''
        gz = ''.join(response.css(' *').extract())  # 高中 (senior high) section
        cz = ''.join(response.css(' *').extract())  # 初中 (junior high) section
        xx = ''.join(response.css(' *').extract())  # 小学 (primary school) section
        gzsoup = BeautifulSoup(gz, 'html.parser')
        czsoup = BeautifulSoup(cz, 'html.parser')
        xxsoup = BeautifulSoup(xx, 'html.parser')
        gzh3 = gzsoup.find_all('h3')
        czh3 = czsoup.find_all('h3')
        xxh3 = xxsoup.find_all('h3')
        for i in range(0, 3):
            link = domain + gzh3[i].a.get('href')
            if not os.path.exists('./高中'):
                os.mkdir('./高中')
            yield scrapy.Request(link, callback=self.parsesecond,
                                 meta={'grade': './高中/'})
        for i in range(0, 3):
            link = domain + czh3[i].a.get('href')
            if not os.path.exists('./初中'):
                os.mkdir('./初中')
            yield scrapy.Request(link, callback=self.parsesecond,
                                 meta={'grade': './初中/'})
        for i in range(0, 6):
            link = domain + xxh3[i].a.get('href')
            if not os.path.exists('./小学'):
                os.mkdir('./小学')
            yield scrapy.Request(link, callback=self.parsesecond,
                                 meta={'grade': './小学/'})

    def parsesecond(self, response):
        a = ''.join(response.css('a').extract())
        links = re.findall('/zt/sj.*?html', a)  # renamed from `list` to avoid shadowing the built-in
        for i in links:
            yield scrapy.Request(domain + i, callback=self.parsethird,
                                 meta={'grade': response.meta['grade'], 'time': 1})

    def parsethird(self, response):
        a = response.meta
        # print(a['time'])
        ftitle = response.css('a#ftitle::text').extract_first()
        title = response.css('span#title::text').extract_first()
        title = ftitle + ''.join(title) + '.txt'
        alist = ''.join(response.css('a[target=_blank]').extract())
        linksoup = BeautifulSoup(alist, 'html.parser')
        if not os.path.exists(a['grade'] + title):
            # create an empty output file on the first visit
            with open(a['grade'] + title, 'a'):
                pass
        for i in linksoup.find_all('a'):
            videotitle = i['title']
            url = domain + i['href']
            yield scrapy.Request(url, callback=self.parseinside,
                                 meta={'title': videotitle,
                                       'ftitle': a['grade'] + title})
        # next page: follow the link after the element marked `current`
        pages = response.css('p#page_control_bar *').extract()
        flag = False
        for i in pages:
            if re.search('current', i) is not None:
                flag = True
                continue
            if flag:
                link = re.findall('/zt/sj.*?html', str(i))[0]
                yield scrapy.Request(domain + str(link), callback=self.parsethird,
                                     meta={'grade': a['grade'], 'time': 2})

    def parseinside(self, response):
        a = response.meta
        title = a['title']
        ftitle = a['ftitle']
        playhost = '.*mp4|.*mp4'  # regex matching the video URL; the host prefixes were stripped from the published source
        resp = response.text
        title = response.css('h3#title::text').extract_first()
        playlink = re.search(playhost, resp)
        if playlink is not None:
            video = playlink.group(0)
            with open(ftitle, 'a') as f:
                f.write(title + ':' + video + '\n')
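Assuming this file sits in the spiders/ directory of a standard Scrapy project, the crawl can be started from the project root with:

scrapy crawl getalllink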
The crawl itself works; the output still needs to be converted to xlsx later so it can be sorted.
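A minimal sketch of that conversion, assuming the "title:url" line format written by parseinside above and the openpyxl library (an extra dependency not listed below); the helper name txt_to_xlsx is hypothetical:

from openpyxl import Workbook

def txt_to_xlsx(txt_path, xlsx_path):
    wb = Workbook()
    ws = wb.active
    ws.append(['title', 'url'])              # header row
    with open(txt_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            title, url = line.split(':', 1)  # split on the first colon only, URLs contain ':'
            ws.append([title, url])
    wb.save(xlsx_path)                       # the rows can then be sorted in Excel

txt_to_xlsx('./高中/example.txt', './高中/example.xlsx')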
Libraries required for this post: scrapy, BeautifulSoup4