This article presents a case study of scraping Lianjia housing data with the requests module, lxml, and XPath selectors.
The target is the listing information on Lianjia's second-hand housing site: listing name, address, layout, area, orientation, decoration status, floor, year built, property type, and total price.
The key XPath expressions (relative expressions starting with "." are evaluated against each listing node):

//li[@class='clear LOGVIEWDATA LOGCLICKDATA']          -- one node per listing
.//div[@class='positionInfo']/a[1]/text()               -- name
.//div[@class='positionInfo']/a[2]/text()               -- address
.//div[@class='houseInfo']/text()                       -- house info ("|"-separated fields)
.//div[@class='totalPrice totalPrice2']/span/text()     -- total price
.//div[@class='unitPrice']/span/text()                  -- unit price
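The leading dot in the relative expressions matters: it scopes each query to the current listing node. A minimal sketch of how the base and relative expressions compose (the URL and headers here are placeholder assumptions, not from the original post):

import requests
from lxml import etree

url = "https://bj.lianjia.com/ershoufang/pg1/"  # assumed example URL
headers = {"User-Agent": "Mozilla/5.0"}
html = requests.get(url, headers=headers, timeout=3).text

p = etree.HTML(html)
for li in p.xpath("//li[@class='clear LOGVIEWDATA LOGCLICKDATA']"):
    # the leading "." scopes the query to this <li>; without it, every
    # iteration would search the whole document again
    name = li.xpath(".//div[@class='positionInfo']/a[1]/text()")
    print(name[0].strip() if name else None)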
Notes:
1. When writing XPath expressions, always go by the response content.
2. The HTML you see in the browser is the fully rendered page and does not necessarily match the HTML in the response (a quick way to check this is sketched after this list).
3. Pages can contain irregular listings, so test an XPath result before indexing into it.
4. If a problem page stalls and never responds, add a retry mechanism.
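For points 1 and 2, the simplest check is to save the raw response body and write your XPath expressions against that file, not against the browser's Elements panel. A sketch (URL and filename are placeholder assumptions):

import requests

url = "https://bj.lianjia.com/ershoufang/pg1/"  # assumed example URL
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get(url, headers=headers, timeout=3)
with open("response.html", "w", encoding="utf-8") as f:
    f.write(resp.text)  # inspect this file: it is what the spider actually sees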
The constructor holds the URL template, a browser-like User-Agent header, and a page counter:

def __init__(self):
    # The base URL was elided in the original post; fill in the listing URL
    # for your city, e.g. 'https://bj.lianjia.com/ershoufang/pg{}/' (assumed pattern).
    self.url = '{}/'
    # Browser-like request headers
    self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
    self.i = 0  # page counter
get_html() fetches one page with a 3-second timeout and, per note 4, retries up to three times on any exception:

def get_html(self, url):
    """Fetch a page and hand it to the parser; retry up to three times."""
    for i in range(3):
        # noinspection PyBroadException
        try:
            html = requests.get(url=url, headers=self.headers, timeout=3).text  # 3-second timeout
            self.parse_html(html)  # extract data with XPath
            self.i += 1            # success: bump the page counter
            print("Page {} scraped successfully!".format(self.i))
            break                  # stop retrying
        except Exception as e:
            print(e)               # log the exception and retry
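The hand-rolled three-attempt loop works; if you would rather delegate retries, requests can mount urllib3's Retry policy on a Session. A sketch of that alternative, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])  # retry on these HTTP codes
session.mount('https://', HTTPAdapter(max_retries=retries))
# session.get(url, headers=headers, timeout=3) now retries automatically,
# with exponential backoff between attempts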
parse_html() builds the parse tree, walks the listing nodes, and, per note 3, tests every XPath result before indexing into it. The houseInfo text is "|"-separated and comes in 6-, 7-, or 8-field variants, so the fields are mapped by length:

def parse_html(self, html):
    """Extract listing fields from the response HTML with XPath."""
    p = etree.HTML(html)  # build the parse tree
    li_list = p.xpath("//li[@class='clear LOGVIEWDATA LOGCLICKDATA']")  # one node per listing
    for li in li_list:
        item = {}  # fresh dict per listing; keys mirror the site's Chinese field names
        name_list = li.xpath(".//div[@class='positionInfo']/a[1]/text()")
        item["名称"] = name_list[0].strip() if name_list else None  # guard before indexing
        address_list = li.xpath(".//div[@class='positionInfo']/a[2]/text()")
        item["地址"] = address_list[0].strip() if address_list else None
        info_li = li.xpath(".//div[@class='houseInfo']/text()")
        if info_li:
            info_li = info_li[0].split("|")  # fields are "|"-separated
            if len(info_li) == 7:
                item["户型"] = info_li[0].strip()
                item["面积"] = info_li[1].strip()
                item["朝向"] = info_li[2].strip()
                item["装修"] = info_li[3].strip()
                item["楼层"] = info_li[4].strip()
                item["年限"] = info_li[5].strip()
                item["种类"] = info_li[6].strip()
            elif len(info_li) == 6:  # listing without a year field
                item["户型"] = info_li[0].strip()
                item["面积"] = info_li[1].strip()
                item["朝向"] = info_li[2].strip()
                item["装修"] = info_li[3].strip()
                item["年限"] = None
                item["楼层"] = info_li[4].strip()
                item["种类"] = info_li[5].strip()
            elif len(info_li) == 8:  # extra trailing part belongs to the type field
                item["户型"] = info_li[0].strip()
                item["面积"] = info_li[1].strip()
                item["朝向"] = info_li[2].strip()
                item["装修"] = info_li[3].strip()
                item["楼层"] = info_li[4].strip()
                item["年限"] = info_li[5].strip()
                item["种类"] = info_li[6].strip() + info_li[7].strip()
            else:
                item["户型"] = item["面积"] = item["朝向"] = item["装修"] = item["楼层"] = item["年限"] = item["种类"] = None
        else:
            item["户型"] = item["面积"] = item["朝向"] = item["装修"] = item["楼层"] = item["年限"] = item["种类"] = None
        total_list = li.xpath(".//div[@class='totalPrice totalPrice2']/span/text()")
        item["总价"] = total_list[0].strip() if total_list else None  # guard before indexing
        unit_list = li.xpath(".//div[@class='unitPrice']/span/text()")
        item["单价"] = unit_list[0].strip() if unit_list else None  # guard before indexing
        print(item)
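The three length branches repeat most of their assignments. A table-driven variant pairs each observed length with its field names and zips them together; a sketch of the idea (the field tuples are taken from the branches above, the function name is hypothetical):

FIELDS = {
    7: ("户型", "面积", "朝向", "装修", "楼层", "年限", "种类"),
    6: ("户型", "面积", "朝向", "装修", "楼层", "种类"),
}

def map_house_info(parts):
    """Map '|'-separated houseInfo parts to named fields by length."""
    parts = [s.strip() for s in parts]
    if len(parts) == 8:  # fold the extra trailing part into the type field
        parts = parts[:6] + [parts[6] + parts[7]]
    names = FIELDS.get(len(parts))
    return dict(zip(names, parts)) if names else {}

Inside the loop this would reduce the branching to item.update(map_house_info(info_li[0].split("|"))).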
run() is the entry point: it builds the URL for pages 1 through 5 and sleeps 1-2 seconds between requests:

def run(self):
    """Program entry point."""
    for pg in range(1, 6):  # scrape pages 1-5
        url = self.url.format(pg)         # build the page URL
        self.get_html(url)                # fetch and parse
        time.sleep(random.randint(1, 2))  # polite 1-2 s delay
Putting the pieces together, the complete program:

import time
import random
import requests
from lxml import etree


class LianjiaSpider:
    """Scrape Lianjia second-hand housing listings."""

    def __init__(self):
        # The base URL was elided in the original post; fill in the listing URL
        # for your city, e.g. 'https://bj.lianjia.com/ershoufang/pg{}/' (assumed pattern).
        self.url = '{}/'
        # Browser-like request headers
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
        self.i = 0  # page counter

    def get_html(self, url):
        """Fetch a page and hand it to the parser; retry up to three times."""
        for i in range(3):
            # noinspection PyBroadException
            try:
                html = requests.get(url=url, headers=self.headers, timeout=3).text  # 3-second timeout
                self.parse_html(html)  # extract data with XPath
                self.i += 1            # success: bump the page counter
                print("Page {} scraped successfully!".format(self.i))
                break                  # stop retrying
            except Exception as e:
                print(e)               # log the exception and retry

    def parse_html(self, html):
        """Extract listing fields from the response HTML with XPath."""
        p = etree.HTML(html)  # build the parse tree
        li_list = p.xpath("//li[@class='clear LOGVIEWDATA LOGCLICKDATA']")  # one node per listing
        for li in li_list:
            item = {}  # fresh dict per listing; keys mirror the site's Chinese field names
            name_list = li.xpath(".//div[@class='positionInfo']/a[1]/text()")
            item["名称"] = name_list[0].strip() if name_list else None  # guard before indexing
            address_list = li.xpath(".//div[@class='positionInfo']/a[2]/text()")
            item["地址"] = address_list[0].strip() if address_list else None
            info_li = li.xpath(".//div[@class='houseInfo']/text()")
            if info_li:
                info_li = info_li[0].split("|")  # fields are "|"-separated
                if len(info_li) == 7:
                    item["户型"] = info_li[0].strip()
                    item["面积"] = info_li[1].strip()
                    item["朝向"] = info_li[2].strip()
                    item["装修"] = info_li[3].strip()
                    item["楼层"] = info_li[4].strip()
                    item["年限"] = info_li[5].strip()
                    item["种类"] = info_li[6].strip()
                elif len(info_li) == 6:  # listing without a year field
                    item["户型"] = info_li[0].strip()
                    item["面积"] = info_li[1].strip()
                    item["朝向"] = info_li[2].strip()
                    item["装修"] = info_li[3].strip()
                    item["年限"] = None
                    item["楼层"] = info_li[4].strip()
                    item["种类"] = info_li[5].strip()
                elif len(info_li) == 8:  # extra trailing part belongs to the type field
                    item["户型"] = info_li[0].strip()
                    item["面积"] = info_li[1].strip()
                    item["朝向"] = info_li[2].strip()
                    item["装修"] = info_li[3].strip()
                    item["楼层"] = info_li[4].strip()
                    item["年限"] = info_li[5].strip()
                    item["种类"] = info_li[6].strip() + info_li[7].strip()
                else:
                    item["户型"] = item["面积"] = item["朝向"] = item["装修"] = item["楼层"] = item["年限"] = item["种类"] = None
            else:
                item["户型"] = item["面积"] = item["朝向"] = item["装修"] = item["楼层"] = item["年限"] = item["种类"] = None
            total_list = li.xpath(".//div[@class='totalPrice totalPrice2']/span/text()")
            item["总价"] = total_list[0].strip() if total_list else None  # guard before indexing
            unit_list = li.xpath(".//div[@class='unitPrice']/span/text()")
            item["单价"] = unit_list[0].strip() if unit_list else None  # guard before indexing
            print(item)

    def run(self):
        """Program entry point."""
        for pg in range(1, 6):  # scrape pages 1-5
            url = self.url.format(pg)         # build the page URL
            self.get_html(url)                # fetch and parse
            time.sleep(random.randint(1, 2))  # polite 1-2 s delay


if __name__ == '__main__':
    spider = LianjiaSpider()  # instantiate the spider
    spider.run()              # call the entry point