学期:2021-2022学年第一学期
学 院 | 大数据与智能工程学院 | 年 级、 专 业、 班 | 18级数据科学与大数据技术(专升本)一班 | 姓 名 | 学号 | ||
---|---|---|---|---|---|---|---|
实验项目 名称 | 抓取 1000条 51job 网站的大数据职位数据的岗位,地点,薪资 |
实验学时: 3h 同组学生姓名: 王美琴、尤博欣、周青青、李昕辰 实验地点: 9317
实验日期: 实验成绩: 批改教师: 批改时间:
指导教师评阅:
- 实验目的:抓取 1000条 51job 网站的大数据职位数据的岗位,地点,薪资
- 实验原理:requests请求、re正则表达式、 随机函数模块
- 实验环境 :win11、python3.9、vscode、edge
- 实验步骤:
- 携带请求头请求51job网站的主页
- 使用re正则表达式语法解析网页数据
- 返回51job网站中大数据职位数据的岗位、地点、薪资,数据保存为json文件
核心代码:
# coding=utf8
import random
from concurrent.futures.thread import ThreadPoolExecutor
import requests
import re
import json
from lxml import etree
def readeIP():
    """Return a randomly chosen proxy mapping for requests' ``proxies=``.

    Returns:
        dict: ``{"http": "http://host:port"}`` with one proxy picked at random.
    """
    # Fix: keys and URL schemes were uppercase ("HTTP"), which requests never
    # matches when selecting a proxy for an http:// URL — the proxies were
    # silently ignored. NOTE(review): these free proxies may be dead; verify
    # availability before relying on them.
    proxies = [
        {"http": "http://211.65.197.93:80"},
        {"http": "http://218.14.108.53:8060"},
        {"http": "http://124.93.201.59:42672"},
        {"http": "http://47.100.14.22:9006"},
        {"http": "http://121.237.88.63:3000"},
        {"http": "http://47.116.142.11:7890"},
        {"http": "http://111.231.86.149:7890"},
    ]
    return random.choice(proxies)
def ReadeUserAgent():
    """Return a randomly chosen request-header dict.

    Returns:
        dict: ``{"User-Agent": <browser UA string>}`` picked at random, used
        so successive requests look like different browsers.
    """
    # Fix: several entries previously embedded a stray "User-Agent," prefix
    # inside the *value* (and one was missing its closing parenthesis),
    # producing malformed User-Agent strings.
    headers = [
        {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E; rv:11.0) like Gecko"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.0.3.4000 Chrome/47.0.2526.73 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"},
    ]
    return random.choice(headers)
def home_url(i, name, headers, params=None):
    """Fetch one 51job search-result page and hand its job rows to data_url.

    Args:
        i: 1-based page index of the search results.
        name: search keyword (becomes a URL path segment).
        headers: request-header dict (random User-Agent) for this request.
        params: optional proxy mapping for requests' ``proxies=``; ``None``
            sends the request directly.  Fix: this used to be a *required*
            argument that was mistakenly passed as query parameters, and
            main() never supplied it at all.
    """
    url = ("https://search.51job.com/list/000000,000000,0000,00,9,99,"
           + name + ",2," + str(i) + ".html")
    response = requests.get(url, headers=headers, proxies=params, timeout=10)
    # The job list is embedded as a JSON blob assigned to
    # window.__SEARCH_RESULT__ inside a <script> tag; dots are escaped so the
    # pattern matches the literal variable name only.
    ob1 = re.compile(r'window\.__SEARCH_RESULT__ =(.*?)</script>', re.S)
    matches = ob1.findall(response.text)
    if not matches:
        # Layout changed or we were served a captcha page; previously this
        # raised IndexError inside the thread pool and vanished silently.
        return
    engine_jds = json.loads(matches[0])["engine_jds"]
    data_url(engine_jds, headers)
def data_url(engine_jds, headers):
    """Extract the wanted fields from each job row and fetch its detail page.

    Args:
        engine_jds: list of job dicts parsed from the search-result JSON.
        headers: request-header dict forwarded to the detail-page fetch.
    """
    for data in engine_jds:
        # .get() instead of [] so rows with missing fields (e.g. no degree
        # requirement — a failure mode noted in the lab report) don't raise
        # KeyError and kill the whole page.
        item = {
            "job_title": data.get("job_title"),                    # job title
            "providesalary_text": data.get("providesalary_text"),  # salary
            "issuedate": data.get("issuedate"),                    # publish date
            "attribute_text": data.get("attribute_text"),          # place/experience/degree/headcount
            "jobwelf": data.get("jobwelf"),                        # benefits
            "job_href": data.get("job_href"),                      # detail-page URL
        }
        # Fix: a leftover debug ``break`` stopped after the first row, so only
        # one job per page was ever processed — far short of the 1000-row goal.
        xiangxi_url(item["job_href"], headers, item)
def xiangxi_url(url, headers, item):
    """Fetch a job's detail page and print the job-description text.

    Args:
        url: detail-page URL taken from the search-result row.
        headers: request-header dict (random User-Agent).
        item: the already-extracted summary fields; currently only carried
            along for interface compatibility (not used in parsing).
    """
    proxies = readeIP()
    # Fix: the proxy dict was previously passed as ``params=`` (appended to
    # the query string); it belongs in ``proxies=``.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    response.encoding = "utf-8"
    # The site sometimes answers with a slider-captcha page. The original
    # wrapped this check in a bare try/except whose branches were all
    # ``pass`` — flag the page and stop instead of parsing junk HTML.
    if re.search(r'<title>滑动验证页面</title>', response.text):
        # TODO: add real slider-verification handling
        return
    html = etree.HTML(response.text)
    text = html.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div[@class="bmsg job_msg inbox"]//text()')
    print(text)
def main():
    """Crawl the first result pages for the keyword, one thread per page."""
    name = "爬虫"
    with ThreadPoolExecutor(50) as t:
        for i in range(1, 3):
            headers = ReadeUserAgent()  # random browser header per page
            params = readeIP()          # random proxy per page
            # Fix: ``params`` was built but never submitted, so home_url was
            # invoked with a missing required argument and every task failed
            # silently inside the pool (submit() swallows the TypeError).
            t.submit(home_url, i=i, name=name, headers=headers, params=params)


if __name__ == '__main__':
    main()
实验结果及分析:
通过requests库请求网页url,并利用随机函数选取请求头与代理IP,得到网页源代码,再通过re正则表达式进行数据整理提取,最后存入json文件。
实验总结:
通过抓取1000条51job网站的大数据职位数据岗位、地点、薪资实验当中,实验小组成员发现有一些公司招聘并没有写学历要求,直接爬取会进行保存,超出索引范围,要进行判断; 保存txt文本的时候,会出现特殊字符无法保存需要把标题进行正则匹配,替换掉特殊字符。