import requests
from bs4 import BeautifulSoup
import random
import time
# Province/region code -> display name, as used in the site's URL scheme.
_LOCATION = {
    1: "北京", 2: "天津", 3: "辽宁", 4: "吉林", 5: "黑龙江", 6: "上海", 7: "江苏", 8: "浙江", 9: "安徽",
    10: "福建", 11: "山东", 12: "湖北", 13: "湖南", 14: "广东", 15: "重庆", 16: "四川", 17: "陕西", 18: "甘肃",
    19: "河北", 20: "山西", 21: "内蒙古", 22: "河南", 23: "海南", 24: "广西", 25: "贵州", 26: "云南", 27: "西藏", 28: "青海",
    29: "宁夏", 30: "新疆", 31: "江西", 33: "香港", 38: "澳门", 39: "台湾",
}

# Exam category code -> display name.
_CATEGORY = {1: "理科", 2: "文科", 3: "综合", 4: "艺术理", 5: "艺术文"}

# NOTE(review): "Content-Encoding", "Server" and "Vary" are normally response
# headers; they are kept so the outgoing request stays identical to before.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Content-Encoding": "gzip",
    "Server": "nginx",
    "Vary": "Accept-Encoding",
    "Connection": "close",
}

# Proxy tunnel authentication.
# NOTE(review): credentials are hard-coded in source; they should be moved to
# configuration or environment variables and rotated.
_PROXY_URL = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": "http-dyn.abuyun.com",
    "port": "9020",
    "user": "HX7YIG4D7IR9907D",
    "pass": "D9A0153CCBC8081F",
}
_PROXIES = {
    "http": _PROXY_URL,
    "https": _PROXY_URL,
}


def _extract_rows(soup, school_num, l, province, category):
    """Turn one parsed result page into a list of rows.

    Each row starts with ``[school_name, province, category]``. When the
    ``pointbyarea`` table is present, one row is produced per ``<tr>`` that
    carries a ``class`` attribute, followed by the text of each of its
    ``<td>`` cells. Otherwise a single row is produced whose remaining six
    fields all hold the notice text found in the ``ts`` (or ``ts lq``) box.

    Raises:
        AttributeError: if the page lacks the school header, or has neither
            the score table nor a notice box.
    """
    school_name = soup.find("div", class_="bg_sez").find("h2").get_text()
    print("收集 { %d } %d %s %s %s 数据" % (school_num, l, school_name, province, category))
    prefix = [school_name, province, category]

    try:
        score_rows = soup.find("div", id="pointbyarea").find_all("tr", class_=True)
    except AttributeError:
        # No score table: fall back to the notice shown in its place.
        try:
            notice = soup.find("div", class_="ts").find("h3").get_text()
        except AttributeError:
            notice = soup.find("div", class_="ts lq").find("h3").get_text()
        return [prefix + [notice] * 6]

    return [
        prefix + [cell.get_text() for cell in row.find_all("td")]
        for row in score_rows
    ]


def get_data(school_num, l, z, max_attempts=10, timeout=10):
    """Fetch and parse one school / province / category result page.

    Args:
        school_num: numeric school id used in the page URL.
        l: province code; must be a key of the province table.
        z: category code; must be a key of the category table.
        max_attempts: maximum number of fetch-and-parse attempts before
            giving up. Replaces the previous unbounded recursive retry.
        timeout: timeout in seconds passed to the HTTP request.

    Returns:
        A list of rows (lists of strings), as built by ``_extract_rows``.
        An empty list is returned when every attempt failed.

    Raises:
        KeyError: if ``l`` or ``z`` is not a known code (checked before any
            network request is made).
    """
    province = _LOCATION[l]
    category = _CATEGORY[z]
    url = "http://college.gaokao.com/school/tinfo/" + str(school_num) + "/result/" + str(l) + "/" + str(z) + "/"

    for _ in range(max_attempts):
        try:
            # A fresh session per attempt, closed even when the request fails.
            with requests.Session() as session:
                html = session.get(
                    url,
                    headers=_HEADERS,
                    proxies=_PROXIES,
                    allow_redirects=False,
                    timeout=timeout,
                ).text
        except requests.exceptions.ProxyError:
            print("ProxyError!")
            continue
        except requests.exceptions.ChunkedEncodingError:
            print("ChunkedEncodingError!")
            continue
        except requests.exceptions.Timeout:
            print("Timeout!")
            continue

        time.sleep(0.1)
        soup = BeautifulSoup(html, 'lxml')
        try:
            return _extract_rows(soup, school_num, l, province, category)
        except AttributeError:
            # The expected markup is missing; retry the request.
            print("'NoneType' object has no attribute 'find'")

    print("Giving up on %s after %d attempts" % (url, max_attempts))
    return []
# start = time.time()
# test_2 = get_data(school_num=1)
# end = time.time()
# print("用时:%s" % (end - start))
# location = {1: "北京", 2: "天津", 3: "辽宁", 4: "吉林", 5: "黑龙江", 6: "上海", 7: "江苏", 8: "浙江", 9: "安徽",
# 10: "福建", 11: "山东", 12: "湖北", 13: "湖南", 14: "广东", 15: "重庆", 16: "四川", 17: "陕西", 18: "甘肃",
# 19: "河北", 20: "山西", 21: "内蒙古", 22: "河南", 23: "海南", 24: "广西", 25: "贵州", 26: "云南", 27: "西藏", 28: "青海",
# 29: "宁夏", 30: "新疆", 31: "江西", 33: "香港", 38: "澳门", 39: "台湾"}
#
# cat = {1: "理科", 2: "文科", 3: "综合", 4: "艺术理", 5: "艺术文"}
# Collects every row returned by get_data over the whole crawl.
final = []

# Province codes requested from get_data: 1 through 31, then 33, 38 and 39.
province_codes = list(range(1, 32)) + [33, 38, 39]
# Category codes requested from get_data: 1 through 5.
category_codes = range(1, 6)

start = time.time()
# NOTE(review): 2668 looks like the intended upper bound for a full crawl — confirm.
for school_id in range(1021, 1031):  # 2668
    for province_code in province_codes:
        for category_code in category_codes:
            final += get_data(school_id, province_code, category_code)
            # Optional randomized pause between pages, currently disabled:
            # time.sleep(0.5 + random.random())
end = time.time()
print("总共耗时:%f" % (end - start))