# -*- coding: utf-8 -*-
import scrapy
from qcwy.items import QcwyItem
from urllib import parse
import re


class A51jobSpider(scrapy.Spider):
    name = '51job'
    # must be the full domain so detail and pagination requests pass Scrapy's offsite filter
    allowed_domains = ['51job.com']
    keyword = "python开发工程师"  # search keyword; swap in any job title here
    # 51job expects the keyword percent-encoded twice
    kw = parse.quote(parse.quote(keyword))
    # search-result URL template (host prefix was stripped from the original post;
    # restored here assuming the standard search.51job.com list URL)
    base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{0},2,{1}.html" \
               "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99" \
               "&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    offset = 1  # current result page
    start_urls = [base_url.format(kw, offset)]

    def parse(self, response):
        if response.body is not None:
            # this span holds the total number of result pages, e.g. "共53页，到第"
            string = response.xpath('//span[@class="td"]').extract()[0]
            pattern = r".*?(\d+).*?"
            count = re.match(pattern, string).groups()[0]
            for data in response.xpath('//div[@class="el"]'):
                item = QcwyItem()
                item['position'] = data.xpath('./p/span/a/text()').extract()
                # rows without a job link (e.g. the list header row) are skipped
                if len(item['position']) != 0:
                    item['position'] = str(item['position'][0]).replace('\r\n', "").strip()
                    item['position_href'] = data.xpath('./p/span/a/@href').extract()[0]
                    item['company'] = data.xpath('./span/a/text()').extract()[0]
                    item['company_href'] = data.xpath('./span/a/@href').extract()[0]
                    item['workplace'] = data.xpath('./span[@class="t3"]/text()').extract()[0]
                    item['pay'] = data.xpath('./span[@class="t4"]/text()').extract()
                    if len(item['pay']) != 0:
                        item['pay'] = item['pay'][0]
                    else:
                        item['pay'] = ""
                    item['release_time'] = data.xpath('./span[@class="t5"]/text()').extract()[0]
                    # fetch the detail page; the partially filled item travels along in meta
                    yield scrapy.Request(url=item['position_href'],
                                         callback=self.parse_info,
                                         meta={"iteminfo": item},
                                         headers={'referer': item['position_href']})
                    # yield item
            self.offset += 1
            if self.offset > int(count):
                return
            yield scrapy.Request(self.base_url.format(self.kw, self.offset),
                                 callback=self.parse,
                                 headers={'referer': self.base_url.format(self.kw, self.offset)})

    def parse_info(self, response):
        """Parse the job-detail page and fill in the remaining fields.

        :param response:
        :return:
        """
        item = response.meta["iteminfo"]
        position_ask = response.xpath('//div[@class="cn"]/p[@class="msg ltype"]/text()').extract()
        item['position_ask'] = " ".join(str(x).strip() for x in position_ask if x is not None)
        position_welfare = response.xpath('//div[@class="cn"]//div[@class="t1"]/span/text()').extract()
        item['position_welfare'] = " ".join(str(x).strip() for x in position_welfare if x is not None)
        position_info = response.xpath('//div[@class="tCompany_main"]//div[@class="bmsg job_msg inbox"]//text()').extract()
        item['position_info'] = " ".join(str(x).strip() for x in position_info if x is not None)
        position_el = response.xpath('//div[@class="tCompany_main"]/div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]//text()').extract()
        item['position_el'] = " ".join(str(x).strip() for x in position_el if x is not None)
        yield item
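The spider is started with scrapy crawl 51job (its name attribute). One detail worth verifying is the double percent-encoding of the keyword: quoting once escapes the non-ASCII bytes, quoting again escapes each resulting '%'. A standalone sketch of what parse.quote(parse.quote(...)) actually produces:

# Standalone check of the double URL-encoding applied to the search keyword.
from urllib import parse

keyword = "python开发工程师"
once = parse.quote(keyword)   # non-ASCII bytes become %XX escapes
twice = parse.quote(once)     # each '%' is escaped again as '%25'
print(once)   # python%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88
print(twice)  # python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588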
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
#     https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QcwyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # job title
    position = scrapy.Field()
    # link to the job posting
    position_href = scrapy.Field()
    # job requirements
    position_ask = scrapy.Field()
    # benefits
    position_welfare = scrapy.Field()
    # job description
    position_info = scrapy.Field()
    # other job information
    position_el = scrapy.Field()
    # company name
    company = scrapy.Field()
    # company link
    company_href = scrapy.Field()
    # work location
    workplace = scrapy.Field()
    # salary
    pay = scrapy.Field()
    # publication date
    release_time = scrapy.Field()
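For reference, scrapy.Item instances are filled like dictionaries and converted back with dict(), which is exactly what the pipeline below does before inserting into MongoDB. A quick illustration with made-up sample values:

# Illustration only: field access on QcwyItem (sample values, not scraped data).
from qcwy.items import QcwyItem

item = QcwyItem()
item['position'] = 'python开发工程师'
item['pay'] = '1-1.5万/月'
print(dict(item))  # {'position': 'python开发工程师', 'pay': '1-1.5万/月'}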
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class QcwyPipeline(object):
    def process_item(self, item, spider):
        # note: this opens a new MongoDB connection for every item
        client = MongoClient(host="127.0.0.1", port=27017)
        db = client['qcwy']
        col = db['python']  # the collection name can be changed as needed
        col.insert_one(dict(item))
        return item
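As written, process_item opens a fresh MongoClient for every scraped item, which is wasteful. A minimal sketch of a variant that reuses one connection for the whole crawl, following the open_spider/close_spider pattern from the Scrapy documentation (the class name QcwyMongoPipeline is made up for illustration):

from pymongo import MongoClient


class QcwyMongoPipeline(object):
    def open_spider(self, spider):
        # connect once when the crawl starts instead of once per item
        self.client = MongoClient(host="127.0.0.1", port=27017)
        self.col = self.client['qcwy']['python']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.col.insert_one(dict(item))
        return item

To use it, point ITEM_PIPELINES at qcwy.pipelines.QcwyMongoPipeline instead of QcwyPipeline.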
# -*- coding: utf-8 -*-

# Scrapy settings for qcwy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'qcwy'

SPIDER_MODULES = ['qcwy.spiders']
NEWSPIDER_MODULE = 'qcwy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'qcwy.middlewares.QcwySpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'qcwy.middlewares.QcwyDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qcwy.pipelines.QcwyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
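Everything rate-related above is left at its default, so the crawl runs at full speed against the site. One possible gentler configuration, using only settings already shown in the comments above (the values are illustrative, not tuned):

# Illustrative politeness settings; adjust to taste.
DOWNLOAD_DELAY = 1                    # pause between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 8    # halve the default per-domain concurrency
AUTOTHROTTLE_ENABLED = True           # adapt the delay to observed latencies
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60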