翻页:
https://careers.tencent.com/search.html?index=1
https://careers.tencent.com/search.html?index=2
1、创建scrapy项目
scrapy startproject tencent
2、创建爬虫程序
scrapy genspider spider tencent.com
3、写一个运行文件
# Launcher script: starts the crawler named "spider" from Python (handy for
# running or debugging inside an IDE) instead of typing the command in a shell.
from scrapy import cmdline

# The argument list mirrors the shell command `scrapy crawl spider`; the last
# element must match the spider's `name` attribute.
# Equivalent explicit form: cmdline.execute(['scrapy', 'crawl', 'spider'])
cmdline.execute('scrapy crawl spider'.split())
4、spider.py文件
import json

import scrapy

from tencent.items import TencentItem


class SpiderSpider(scrapy.Spider):
    """Crawl job postings through the Tencent careers JSON endpoints.

    Listing pages are requested first; for every post found, a second
    request fetches the detail record, and the completed item is yielded
    from ``parse_detail``.
    """

    name = 'spider'
    allowed_domains = ['tencent.com']

    # Listing endpoint; ``{}`` is replaced with the 1-based page index.
    one_url = (
        'https://careers.tencent.com/tencentcareer/api/post/Query'
        '?timestamp=1646136476254&countryId=&cityId=&bgIds=&productId='
        '&categoryId=&parentCategoryId=&attrId=&keyword='
        '&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    )
    # Detail endpoint; ``{}`` is replaced with the post id.
    detail_url = (
        'https://careers.tencent.com/tencentcareer/api/post/ByPostId'
        '?timestamp=1650456224399&postId={}&language=zh-cn'
    )

    def start_requests(self):
        """Yield one listing request per page (pages 1 through 10)."""
        for page in range(1, 11):
            yield scrapy.Request(
                self.one_url.format(page),
                callback=self.parse,
            )

    def parse(self, response):
        """Parse a listing response and schedule one detail request per post.

        The response body is JSON, so it is decoded with ``json`` rather
        than queried with XPath/CSS selectors.
        """
        data = json.loads(response.text)
        # Treat a null/empty post list as "no jobs on this page" instead of
        # failing on iteration (assumes the API may return null here --
        # TODO confirm).
        posts = data['Data']['Posts'] or []
        for job in posts:
            item = TencentItem()
            item['job_name'] = job['RecruitPostName']
            url = self.detail_url.format(job['PostId'])
            # Carry the partially filled item to the detail callback.
            yield scrapy.Request(
                url=url,
                callback=self.parse_detail,
                meta={'item': item},
            )

    def parse_detail(self, response):
        """Fill in the job duty from the detail response and emit the item."""
        item = response.meta.get('item')
        data = json.loads(response.text)
        item['job_duty'] = data['Data']['Responsibility']
        # Yield (rather than print) so the item reaches item pipelines and
        # feed exports.
        yield item
5、items.py文件
class TencentItem(scrapy.Item):
    """Container for one scraped job posting."""

    # Job title
    job_name = scrapy.Field()
    # Job requirements
    job_duty = scrapy.Field()
