Crawler: scraping menu information for every recipe category on Meishi Tianxia (美食天下)



This article is for learning purposes only. Please do not scrape website data indiscriminately or use it for any illegal purpose.

Screenshot of the results


Source code

import csv
import os
import random

import requests
from lxml import etree

# Request headers: pick one User-Agent string at random
head = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
headers = {'user-agent': head[random.randint(0, 2)]}


def makedir(path):
    path = path.strip()
    # Strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.mkdir(path)
        return True
    else:
        return False


def getHtml(url):
    try:
        response = requests.request("GET", url=url, headers=headers)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        else:
            return response.status_code
    except Exception as e:
        return e


def htmlToTree(html):
    return etree.HTML(html)


def elementToString(element):
    return etree.tostring(element, pretty_print=True, encoding='utf-8').decode('utf-8')


def parseHtml(html):
    allcategory = []
    tree = htmlToTree(html)
    path = '//div[@class="wrap"]//div[@class="category_box mt20"]/div[@class="category_sub clear"]'
    data = tree.xpath(path)
    for item in data:
        category_list = []
        # Re-serialize each category block and parse it again, so the
        # absolute XPath expressions below only match inside this block
        div = elementToString(item)
        div_tree = htmlToTree(div)
        category = div_tree.xpath('//h3/text()')[0]
        category_name = div_tree.xpath('//ul/li/a/text()')
        category_url = div_tree.xpath('//ul/li/a/@href')
        category_list.append(category)
        category_list.append(dict(zip(category_name, category_url)))
        allcategory.append(category_list)
    return allcategory


def writerCsv(food_list, category_name, name):
    try:
        title = [key for key in food_list[0].keys()]  # column headers from the first record
        if '/' in name or '/' in category_name:
            name = name.replace('/', '')
            category_name = category_name.replace('/', '')
        path = os.getcwd() + '\\source\\{0}'.format(category_name)
        makedir(path)
        csvdata = open('source/{0}/{1}.csv'.format(category_name, name), 'a', encoding='utf-8')
        dictwriter = csv.DictWriter(csvdata, fieldnames=title)
        dictwriter.writeheader()
        dictwriter.writerows(food_list)
        csvdata.close()
        return True
    except Exception:
        return False


def parseRecipe(html, category_name, name):
    tree = htmlToTree(html)
    path = '//div[@class="wrap"]//div[@id="J_list"]/ul/li'
    data = tree.xpath(path)
    food_list = []
    if data is not None and data != []:
        for item in data[0:1]:  # only the first recipe of each listing page
            food_dict = {}
            li = elementToString(item)
            li_tree = htmlToTree(li)
            food_name = li_tree.xpath('//div[@class="pic"]/a/@title')[0]
            food_detailUrl = li_tree.xpath('//div[@class="pic"]/a/@href')[0]
            food_content = str(li_tree.xpath('//div[@class="detail"]/p[@class="subcontent"]/text()')[0])[3:-1]
            food_dict["菜名"] = food_name           # dish name
            food_dict["详情链接"] = food_detailUrl   # detail-page link
            food_dict["原料"] = food_content         # ingredients
            food_list.append(food_dict)
    res = writerCsv(food_list, category_name, name)
    return res


def getRecipePerCate(data):
    res = bool()
    for items in data[:4]:  # first four categories only
        category_name = items[0]
        for name, url in items[1].items():
            html = getHtml(url)
            res = parseRecipe(html, category_name, name)
    if res:
        return "写入成功"  # write succeeded
    else:
        return "写入失败"  # write failed


if __name__ == '__main__':
    url = '.html'  # the full URL is truncated in the original post
    data = parseHtml(getHtml(url))
    res = getRecipePerCate(data)
    print(res)
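Two standard-library idioms do most of the bookkeeping in the script above: dict(zip(...)) pairs the parallel name/URL lists returned by the XPath queries, and csv.DictWriter turns the list of per-recipe dicts into CSV rows. A minimal offline sketch of both (the sample dish names and links are made up, and an in-memory buffer stands in for the CSV file so the sketch has no filesystem side effects):

```python
import csv
import io

# Parallel lists, as produced by the two XPath queries in parseHtml()
names = ['红烧肉', '鱼香肉丝']
urls = ['/recipe/1/', '/recipe/2/']

# dict(zip(...)) pairs each name with its link
category_map = dict(zip(names, urls))
print(category_map['红烧肉'])  # → /recipe/1/

# writerCsv() writes a list of dicts with csv.DictWriter
food_list = [
    {'菜名': '红烧肉', '详情链接': '/recipe/1/', '原料': '五花肉'},
]
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=list(food_list[0].keys()))
writer.writeheader()
writer.writerows(food_list)
print(buf.getvalue().splitlines()[0])  # → 菜名,详情链接,原料
```

Note that if the two lists are different lengths, zip silently truncates to the shorter one, so a category whose link list and name list get out of sync would lose entries without raising an error.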

Published on 2024-01-27 22:42:53.

Original link: https://www.4u4v.net/it/17063665743090.html
