翻页:
https://careers.tencent.com/search.html?index=1
https://careers.tencent.com/search.html?index=2
1、创建scrapy项目
scrapy startproject tencent
2、创建爬虫程序
scrapy genspider spider tencent.com
3、写一个运行文件
from scrapy import cmdline

# Run the spider programmatically instead of typing the command in a shell.
# Fixed typo: original said 'scrpay crawl spider', which is not a valid command.
cmdline.execute('scrapy crawl spider'.split())
# Equivalent list form (spider name must match SpiderSpider.name, i.e. 'spider'):
# cmdline.execute(['scrapy', 'crawl', 'spider'])
4、spider.py文件
import scrapy
import json
from tencent.items import TencentItem
class SpiderSpider(scrapy.Spider):
    """Crawl Tencent job postings via the careers JSON API.

    Flow: request list pages 1-10 (``one_url``), extract each posting's
    name and id, then follow the per-posting detail API (``detail_url``)
    to fill in the job duty, and finally yield the completed item.
    """

    name = 'spider'
    allowed_domains = ['tencent.com']
    # List endpoint; {} is filled with the 1-based page index.
    # (Fixed: the original string carried a stray trailing space inside the URL.)
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1646136476254&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    # Detail endpoint; {} is filled with a posting's PostId.
    detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1650456224399&postId={}&language=zh-cn'

    def start_requests(self):
        """Pagination: yield one request per list page (pages 1..10)."""
        for page in range(1, 11):
            url = self.one_url.format(page)
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Parse a list page: one item per posting, then follow its detail URL."""
        # The API returns JSON, so json.loads is used instead of xpath/css.
        data = json.loads(response.text)
        for job in data['Data']['Posts']:
            item = TencentItem()
            item['job_name'] = job['RecruitPostName']
            post_id = job['PostId']
            # Build the detail-page URL and carry the partial item via meta.
            detail_url = self.detail_url.format(post_id)
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Parse a detail page and yield the completed item.

        Fixed: the original only ``print``-ed the item, so pipelines and
        feed exports never received any data; items must be yielded back
        to the Scrapy engine.
        """
        item = response.meta.get('item')
        data = json.loads(response.text)
        item['job_duty'] = data['Data']['Responsibility']
        yield item
5、items.py文件
class TencentItem(scrapy.Item):
    """Container for one Tencent job posting scraped by the spider."""

    # Job title (taken from the list API's RecruitPostName field)
    job_name = scrapy.Field()
    # Job responsibilities (taken from the detail API's Responsibility field)
    job_duty = scrapy.Field()