# encoding: utf-8
import random
import re
import time

import scrapy
# To use this spider, just run the following command at the command line:
# scrapy runspider quotes_spider.py -o quotes.json
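# Each listing is exported to quotes.json as one object with the six fields
# yielded in parse() below; a hypothetical example (placeholder values only):
# {"title": "…", "address": "…", "houseInfo": "…", "flood": "…",
#  "totalPrice": "500万", "unitPrice": "…"}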
class QuotesSpider(scrapy.Spider):
    # allowed_domains = ['lianjia.com']
    name = 'lianjia_ershou'
    # NOTE: the original start URL was stripped during extraction (only '/' survived);
    # Lianjia's city-list page is assumed here.
    start_urls = ['https://www.lianjia.com/city/']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    def start_requests(self):
        # Request each start URL with a browser User-Agent so the site serves the page.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_city, headers=self.headers)

    def parse_city(self, response):
        # Collect every city link, then point each at its ershoufang (second-hand) section.
        urls = response.css('.city_list a::attr("href")').getall()
        urls = [url + 'ershoufang/' for url in urls]
        for url in urls:
            yield scrapy.Request(url, self.parse_quyu, headers=self.headers)

    def parse_quyu(self, response):
        # Follow each district link under the ershoufang filter bar,
        # skipping the link that points back to the current page.
        urls = response.xpath('//div[@data-role="ershoufang"]/div[1]/a/@href').getall()
        for url in urls:
            url = response.urljoin(url)
            if url != response.url:
                yield scrapy.Request(url, self.parse, headers=self.headers)

    def parse(self, response):
        # One 'info clear' block per listing on the results page.
        data = response.xpath('//div[@class="info clear"]')
        titles = data.css('.title a::text').getall()
        addresses = data.css('.address .houseInfo a::text').getall()
        houseInfos = data.css('.address .houseInfo::text').getall()
        floods = data.css('.flood').xpath('div/text()').getall()
        priceInfos = data.css('.priceInfo')
        totalPrices = [i + '万' for i in data.css('.totalPrice span::text').getall()]
        unitPrices = priceInfos.css('.unitPrice span::text').getall()
        for title, address, houseInfo, flood, totalPrice, unitPrice in zip(
                titles, addresses, houseInfos, floods, totalPrices, unitPrices):
            yield {
                'title': title,
                'address': address,
                'houseInfo': houseInfo,
                'flood': flood,
                'totalPrice': totalPrice,
                'unitPrice': unitPrice,
            }
        # Pagination state is embedded as JSON in the page-box div,
        # e.g. {"totalPage":100,"curPage":1}.
        resp = response.xpath('//div[@class="page-box house-lst-page-box"]')
        try:
            totalPage = resp.re(r'totalPage":(\d+)')[0]
            curPage = resp.re(r'curPage":(\d+)}')[0]
        except Exception as e:
            print(e)
            return
        # time.sleep((random.random() + 0.5) * 2)
        if int(curPage) != int(totalPage):
            if 'pg' not in response.url:
                url1 = response.url + 'pg' + str(int(curPage) + 1) + '/'
            else:
                # Replace the page number already present in the URL.
                url1 = re.sub(re.compile(r'\d+', re.S), str(int(curPage) + 1), response.url)
            yield scrapy.Request(url1, callback=self.parse, headers=self.headers)
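# Two hedged refinements (sketches, not part of the original spider; they assume
# a recent Scrapy release):
#
# 1) Instead of the commented-out time.sleep above, Scrapy's built-in download
#    delay paces requests per domain. Adding this to the class body gives the
#    same 0.5x-1.5x jitter as (random.random()+0.5):
#
#    custom_settings = {
#        'DOWNLOAD_DELAY': 1.0,             # base delay between requests, in seconds
#        'RANDOMIZE_DOWNLOAD_DELAY': True,  # multiply the delay by a random 0.5-1.5
#    }
#
# 2) re.sub(r'\d+', ...) replaces every run of digits in the URL, not just the
#    page number, so it can corrupt URLs whose path contains other numbers.
#    Anchoring the pattern to the page segment is safer:
#
#    url1 = re.sub(r'pg\d+', 'pg' + str(int(curPage) + 1), response.url)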