import urllib.request
from bs4 import BeautifulSoup
import time
import pymongo
import pymysql
# Example list URL: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
def handle_request(keyword, page, url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    url = url.format(keyword, page)
    request = urllib.request.Request(url=url, headers=headers)
    return request
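
# A quick usage sketch (hypothetical call, not part of the original script):
# with the search URL template from main(), a "python" search on page 1
# builds a request like this:
#   req = handle_request('python', 1, url)
#   req.full_url
#   -> https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html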
# Parse the response with bs4
def parse_content(content, db):
    soup = BeautifulSoup(content, 'lxml')
    # The first .el element is the table header row, so skip it
    div_list = soup.select('#resultList > .el')[1:]
    # Parse each result row in turn
    for el in div_list:
        # job title
        jobname = el.select('.t1 > span > a')[0]['title']
        # company name
        company = el.select('.t2 > a')[0]['title']
        # work location
        area = el.select('.t3')[0].string
        # salary
        salary = el.select('.t4')[0].string
        # publish date
        publish_time = el.select('.t5')[0].string
        items = {
            '公司职业': jobname,
            '公司名称': company,
            '工作地点': area,
            '薪资': salary,
            '发布时间': publish_time,
        }
        save_to_mysql(db, items)
        # fp.insert(items)  # use this instead when saving to MongoDB
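
# A self-contained check of the selectors above. The HTML below is an
# assumption reconstructed from the CSS selectors, not real 51job markup;
# it only illustrates the structure parse_content expects.
def demo_selectors():
    sample = '''
    <div id="resultList">
        <div class="el">header row (skipped by the [1:] slice)</div>
        <div class="el">
            <p class="t1"><span><a title="Python工程师">link</a></span></p>
            <span class="t2"><a title="某科技公司">link</a></span>
            <span class="t3">上海</span>
            <span class="t4">1-2万/月</span>
            <span class="t5">01-30</span>
        </div>
    </div>'''
    soup = BeautifulSoup(sample, 'lxml')
    row = soup.select('#resultList > .el')[1]
    print(row.select('.t1 > span > a')[0]['title'])  # Python工程师
    print(row.select('.t3')[0].string)               # 上海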
# Option 1: save to MySQL
def connect_db():
    db = pymysql.Connect(host='localhost', port=3306, user='root', password='123456', database='51job', charset='utf8')
    # MySQL offers two common storage engines: InnoDB and MyISAM
    return db
# Option 2: save to MongoDB
def connect_mongodb():
    # Connect to MongoDB
    client = pymongo.MongoClient(host='localhost', port=27017)
    return client
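
# A minimal sketch of the MongoDB save path hinted at by the commented-out
# lines in main() below; insert_one is the current pymongo API (the older
# insert() is deprecated).
def save_to_mongodb(collection, items):
    # Copy first so the _id that pymongo adds doesn't leak back into items
    collection.insert_one(dict(items))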
# If you use MySQL, you must first create the database yourself and then the
# matching table (see the create_table sketch after save_to_mysql)
def save_to_mysql(db, items):
    # Get a cursor
    cursor = db.cursor()
    # Use a parameterized query so values are escaped safely
    sql = 'insert into job(jobname, company, area, salary, publish_time) values(%s, %s, %s, %s, %s)'
    try:
        cursor.execute(sql, (items['公司职业'], items['公司名称'], items['工作地点'], items['薪资'], items['发布时间']))
        db.commit()
    except Exception as e:
        print(e)
        db.rollback()
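
# As the comment above notes, with MySQL you must create the database and
# table yourself. A minimal schema sketch matching the insert statement in
# save_to_mysql; the column types and lengths are assumptions, adjust as needed.
def create_table(db):
    cursor = db.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS job (
            id INT PRIMARY KEY AUTO_INCREMENT,
            jobname VARCHAR(255),
            company VARCHAR(255),
            area VARCHAR(100),
            salary VARCHAR(100),
            publish_time VARCHAR(50)
        ) DEFAULT CHARSET=utf8
    ''')
    db.commit()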
def main():
    keyword = input('Enter the keyword to search: ')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html'
    # fp = open('', 'w', encoding='utf8')
    db = connect_db()
    # To save to MongoDB instead:
    # client = connect_mongodb()
    # Select the MongoDB database
    # db = client.job51
    # Select the MongoDB collection
    # fp = db.job
    # Crawl every page from start_page to end_page in turn
    for page in range(start_page, end_page + 1):
        print('Crawling page %s ...' % page)
        request = handle_request(keyword, page, url)
        # The page content is gbk-encoded, so decode accordingly
        content = urllib.request.urlopen(request).read().decode('gbk')
        parse_content(content, db)
        print('Finished page %s ...' % page)
        # Pause between pages to avoid hammering the server
        time.sleep(2)
    db.close()
    # fp.close()
    # client.close()
if __name__ == '__main__':
    main()