翻页:
https://careers.tencent.com/search.html?index=1
https://careers.tencent.com/search.html?index=2
1、创建scrapy项目
scrapy startproject tencent
2、创建爬虫程序
scrapy genspider spider tencent.com
3、写一个运行文件
from scrapy import cmdline

# Run the spider programmatically instead of typing the command in a shell.
# Fixed typo: original said 'scrpay crawl spider', which is not a valid command.
cmdline.execute('scrapy crawl spider'.split())
# Equivalent list form (spider name must match SpiderSpider.name, i.e. 'spider'):
# cmdline.execute(['scrapy', 'crawl', 'spider'])
4、spider.py文件
import scrapy
import json
from tencent.items import TencentItem
class SpiderSpider(scrapy.Spider):
    """Crawl Tencent job postings via the careers JSON API.

    Flow: request list pages 1-10 (``one_url``), extract each posting's
    name and id, then follow the per-posting detail API (``detail_url``)
    to fill in the job duty, and finally yield the completed item.
    """

    name = 'spider'
    allowed_domains = ['tencent.com']
    # List endpoint; {} is filled with the 1-based page index.
    # (Fixed: the original string carried a stray trailing space inside the URL.)
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1646136476254&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    # Detail endpoint; {} is filled with a posting's PostId.
    detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1650456224399&postId={}&language=zh-cn'

    def start_requests(self):
        """Pagination: yield one request per list page (pages 1..10)."""
        for page in range(1, 11):
            url = self.one_url.format(page)
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Parse a list page: one item per posting, then follow its detail URL."""
        # The API returns JSON, so json.loads is used instead of xpath/css.
        data = json.loads(response.text)
        for job in data['Data']['Posts']:
            item = TencentItem()
            item['job_name'] = job['RecruitPostName']
            post_id = job['PostId']
            # Build the detail-page URL and carry the partial item via meta.
            detail_url = self.detail_url.format(post_id)
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Parse a detail page and yield the completed item.

        Fixed: the original only ``print``-ed the item, so pipelines and
        feed exports never received any data; items must be yielded back
        to the Scrapy engine.
        """
        item = response.meta.get('item')
        data = json.loads(response.text)
        item['job_duty'] = data['Data']['Responsibility']
        yield item
5、items.py文件
class TencentItem(scrapy.Item):
    """Container for one Tencent job posting scraped by the spider."""

    # Job title (taken from the list API's RecruitPostName field)
    job_name = scrapy.Field()
    # Job responsibilities (taken from the detail API's Responsibility field)
    job_duty = scrapy.Field()