[Scrapy Spider] Batch-collecting Baidu web search, Baidu Zhidao, Baidu News, 360 Images and Youku videos
Given a list of keywords, collect a batch of matching content with a single script: in plain terms, each keyword ends up with several web articles, Zhidao answers and news items, plus images and videos.
What it can be used for: aggregating a pile of pages with a web framework (Flask, Django) or a CMS (EmpireCMS, DedeCMS), and so on.
The task is built mainly on the Scrapy crawler framework and involves quite a few practical tricks.
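The spider below imports SeoItem from seo.items, which the original post never shows. A minimal sketch of that item class, inferred only from the four fields the spider actually fills in (table, query, title, article), might look like this:

# seo/items.py -- hypothetical sketch; only the field names are taken from the spider code below
import scrapy

class SeoItem(scrapy.Item):
    table = scrapy.Field()    # source label, e.g. baidu_pc_search, so_pc_img, youku_pc_swf
    query = scrapy.Field()    # the original keyword
    title = scrapy.Field()    # page title or original image URL
    article = scrapy.Field()  # cleaned article HTML, player swf URL, or cached image URL

The spider itself: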
# coding: utf-8
# Python 2 code (reload(sys)/setdefaultencoding, urllib.quote, has_key, print statements).
# NOTE: the search-engine URL prefixes, some domain names inside the regexes, and the three
# file paths were stripped from the original post; the gaps are marked below and must be
# filled in before the spider will run.
import scrapy, re, urllib, chardet, json
import sys
from seo.items import SeoItem
from scrapy.http import Request
from readability import Document
# import PyV8

reload(sys)
sys.setdefaultencoding('utf-8')


def number(content):
    # strip Chinese/English punctuation, then strip all tags, and count the remaining characters
    text = re.sub(u"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，：。？、~@#￥%……&*（）“”《》]+", u"", content)
    text2 = re.sub('<[^>]*?>', '', text)
    words_number = len(text2)
    return int(words_number)


def bianma(i):
    # normalize a byte string to utf-8 (gbk pages get re-encoded)
    i = str(i).strip()
    mychar = chardet.detect(i)
    bianma = mychar['encoding']
    if bianma == 'utf-8' or bianma == 'UTF-8':
        data = i
    else:
        data = i.decode('gbk', 'ignore').encode('utf-8')
    return data


def search(req, html):
    # return the first capture group, or 'no' when the pattern does not match
    text = re.search(req, html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data


def extract_data(div, xpath_data):
    loading = div.xpath('%s' % xpath_data)
    if loading:
        loading = bianma(re.sub('<[^>]*?>', '', search('<a[^>]*?>([\s\S]*?)</a>', loading.extract()[0])))
    else:
        loading = 'Aladdin'
    return loading


def qu_b(re_data):
    # strip tags and &nbsp; entities
    if re_data:
        loading = bianma(re.sub('<[^>]*?>', '', re_data))
        loading = bianma(re.sub('&nbsp;', '', loading))
    else:
        loading = 'Aladdin'
    return loading


class DmozSpider(scrapy.Spider):
    name = 'seo'
    start_urls = []
    # one keyword per line; the original file name was stripped, 'keywords.txt' is a placeholder
    for word in open('keywords.txt'):
        query = word.strip()
        # Baidu web search, result pages 1-6 (pn=0..50); URL prefixes stripped from the post
        start_urls.append('=%s' % urllib.quote(query))
        start_urls.append('=10&word=%s' % urllib.quote(query))
        start_urls.append('=20&word=%s' % urllib.quote(query))
        start_urls.append('=30&word=%s' % urllib.quote(query))
        start_urls.append('=40&word=%s' % urllib.quote(query))
        start_urls.append('=50&word=%s' % urllib.quote(query))
        # Baidu News; URL prefix stripped
        start_urls.append('=2&rn=20&tn=news&word=%s' % urllib.quote(query))
        # Baidu Zhidao; URL prefix stripped
        start_urls.append('=10&ie=gbk&word=%s' % urllib.quote(query))
        # image.so.com (360 images, JSON response); URL prefix stripped
        start_urls.append('=%s' % urllib.quote(query))
        # soku.com (Youku video search); the whole URL template was stripped
        start_urls.append('%s' % urllib.quote(query))

    def __init__(self):
        # # init js_ctx
        # ctx = PyV8.JSContext()
        # ctx.enter()
        # self.js_ctx = ctx
        # placeholder paths: the original file names were stripped from the post
        self.op_txt = open('collected_urls.txt', 'a')   # log of URLs already collected
        self.zidian = {}
        c = 0
        with open('collected_urls.txt') as f:            # load previously collected URLs for de-duplication
            for i in f.readlines():
                i = i.strip()
                self.zidian['%s' % (i)] = c
                c += 1

    def __get_url_query(self, url):
        m = re.search("word=(.*)", url).group(1)
        return m

    def __get_imgurl_query(self, url):
        m = re.search("q=(.*)", url).group(1)
        return m

    def __get_vediourl_query(self, url):
        m = re.search("q_(.*)", url).group(1)
        return m

    def parse(self, response):
        judge_url = response.url

        # Baidu web search: pull the display URLs out of the SERP.
        # The lambda default arguments pin the current data_table/query values for each callback.
        if 'www.baidu' in judge_url:
            re_url = re.compile(r'class="c-showurl" style="text-decoration:none;">(.*?)</a>')
            url_list = re.findall(re_url, response.body)
            data_table = 'baidu_pc_search'
            query = urllib.unquote(self.__get_url_query(judge_url))
            for url in url_list:
                url = '' + qu_b(url).strip()  # scheme prefix stripped from the post
                yield Request(url, callback=lambda response, typid=data_table, typeid=query: self.page_parse(response, typid, typeid))

        # Baidu Zhidao results
        if 'zhidao.baidu' in judge_url:
            re_url = re.compile(r'<a href="(zhidao.baidu/question/.*?html\?fr=iks&word=.*?&ie=gbk)"')  # domain truncated in the post
            url_list = re.findall(re_url, response.body)
            data_table = 'baidu_pc_zhidao'
            query = urllib.unquote(self.__get_url_query(judge_url))
            for url in url_list:
                yield Request(url, callback=lambda response, typid=data_table, typeid=query: self.page_parse(response, typid, typeid))

        # Baidu News results
        if 'news.baidu' in judge_url:
            re_url = re.compile(r'<h3 class="c-title"><a href="(.*?)"')
            url_list = re.findall(re_url, response.body)
            data_table = 'baidu_pc_news'
            query = urllib.unquote(self.__get_url_query(judge_url))
            for url in url_list:
                yield Request(url, callback=lambda response, typid=data_table, typeid=query: self.page_parse(response, typid, typeid))

        # 360 image search: the response is JSON, so items are built right here
        if 'image.so' in judge_url:
            # only_url = response.url
            json_str = response.body
            data_table = 'so_pc_img'
            query = urllib.unquote(self.__get_imgurl_query(judge_url))
            if len(json_str) > 0:
                # fret = self.js_ctx.eval("""
                # function func() {
                #     var data = """ + json_str + """;
                #     var json_data = JSON.stringify(data);
                #     return json_data;
                # }
                # """)
                # jsond = self.js_ctx.locals.func()
                json_data = json.loads(json_str)
                # print json_data
                list_img = json_data['list']
                for i in list_img:
                    original_img = i['img']
                    huancun_img = i['thumb_bak']
                    if self.zidian.has_key(judge_url):
                        print u'<<< duplicate URL, already collected >>>'
                    else:
                        print original_img, huancun_img
                        item = SeoItem()
                        item['table'] = data_table
                        item['query'] = query
                        item['title'] = original_img    # .encode('utf-8')
                        item['article'] = huancun_img   # .encode('utf-8')
                        self.op_txt.writelines(original_img + '\n')
                        yield item

        # soku.com video search: follow through to the Youku watch page
        if 'soku' in judge_url:
            re_url = re.compile(r'<a title=".*?" target="_blank" href="(v.youku/v_show/.*?)"')  # domain truncated in the post
            url_list = re.findall(re_url, response.body)
            data_table = 'youku_pc_swf'
            query = urllib.unquote(self.__get_vediourl_query(judge_url))
            for url in url_list:
                print url
                yield Request(url, callback=lambda response, typid=data_table, typeid=query: self.page_parse(response, typid, typeid))

    def page_parse(self, response, typid, typeid):
        only_url = response.url
        if self.zidian.has_key(only_url):
            print u'<<< duplicate URL, already collected >>>'
        else:
            html = response.body
            if typid == 'youku_pc_swf':
                # Youku watch page: grab the title and the embedded player swf URL
                title = search(r'</a><h1 class="title" title="(.*?)"><a href', html)
                article = search(r"<embed src='(player.youku/player.php/.*?swf)'.*?</embed>", html)  # domain truncated in the post
                item = SeoItem()
                item['table'] = typid
                item['query'] = typeid
                item['title'] = title        # .encode('utf-8')
                item['article'] = article    # .encode('utf-8')
                self.op_txt.writelines(only_url + '\n')
                yield item
            else:
                # ordinary article page: let readability pull the title and main body, then clean the HTML
                title = Document(html).short_title()
                article = Document(html).summary()
                a = re.sub(r'<script[\s\S]*?</script>|&nbsp;', '', article).strip()
                b = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>', '', a).strip()
                c = re.sub(r'<p[^>]*?>', '<p>', b).strip().replace('\n', '')
                article = re.sub(r'<p>\s+<p>', '', c)
                num = number(b)
                if num > 1 and u'出错' not in title:
                    if '404' not in title:
                        # print title, article
                        item = SeoItem()
                        item['table'] = typid
                        item['query'] = typeid
                        item['title'] = title        # .encode('utf-8')
                        item['article'] = article    # .encode('utf-8')
                        self.op_txt.writelines(only_url + '\n')
                        yield item
                else:
                    print u'<<< no usable content, skipped >>>'
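Once the project is set up, the spider is started the standard way with scrapy crawl seo (the name defined on the spider class). The post does not show how the yielded items are stored; a minimal, hypothetical pipeline sketch that groups items by their table field (baidu_pc_search, so_pc_img, youku_pc_swf, ...) into JSON-lines files, which a Flask/Django or CMS layer could then aggregate, might look like this. The class name and output file naming are assumptions; only the item fields come from the spider above.

# seo/pipelines.py -- hypothetical sketch, not from the original post
# Writes every SeoItem as one JSON line into a file named after item['table'].
import json
import codecs

class SeoJsonLinesPipeline(object):
    def open_spider(self, spider):
        self.files = {}  # table name -> open file handle

    def process_item(self, item, spider):
        table = item['table']
        if table not in self.files:
            self.files[table] = codecs.open('%s.jl' % table, 'a', encoding='utf-8')
        line = json.dumps(dict(item), ensure_ascii=False)
        self.files[table].write(line + '\n')
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()

The pipeline would be enabled through the ITEM_PIPELINES setting in the project's settings.py.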