import requests
from bs4 import BeautifulSoup
import random
import time


def get_data(school_num, l, z):
    """Scrape the admission-score table for one school (school_num), one source
    province (l) and one subject category (z) from college.gaokao.com."""
    location = {1: "北京", 2: "天津", 3: "辽宁", 4: "吉林", 5: "黑龙江", 6: "上海", 7: "江苏", 8: "浙江", 9: "安徽",
                10: "福建", 11: "山东", 12: "湖北", 13: "湖南", 14: "广东", 15: "重庆", 16: "四川", 17: "陕西", 18: "甘肃",
                19: "河北", 20: "山西", 21: "内蒙古", 22: "河南", 23: "海南", 24: "广西", 25: "贵州", 26: "云南", 27: "西藏",
                28: "青海", 29: "宁夏", 30: "新疆", 31: "江西", 33: "香港", 38: "澳门", 39: "台湾"}
    cat = {1: "理科", 2: "文科", 3: "综合", 4: "艺术理", 5: "艺术文"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Content-Encoding": "gzip",
        "Server": "nginx",
        "Vary": "Accept-Encoding",
        "Connection": "close",
    }
    # Proxy tunnel credentials (Abuyun dynamic HTTP proxy)
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    proxyUser = "HX7YIG4D7IR9907D"
    proxyPass = "D9A0153CCBC8081F"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    result = []
    contain = []
    s = requests.Session()
    s.keep_alive = False
    url = "http://college.gaokao.com/school/tinfo/" + str(school_num) + "/result/" + str(l) + "/" + str(z) + "/"
    try:
        html = s.get(url, headers=headers, proxies=proxies, allow_redirects=False).text
        time.sleep(0.1)
        bs = BeautifulSoup(html, 'lxml')
        try:
            bs_school_name = bs.find("div", class_="bg_sez").find('h2').get_text()
            print("Collecting data { %d } %d %s %s %s" % (school_num, l, bs_school_name, location[l], cat[z]))
            try:
                # Normal case: the page contains a score table (div#pointbyarea).
                bs_parser = bs.find("div", id="pointbyarea").find_all("tr", class_=True)
                for i in bs_parser:
                    res = i.find_all("td")
                    contain.append(bs_school_name)
                    contain.append(location[l])
                    contain.append(cat[z])
                    for m in res:
                        contain.append(m.get_text())
                    result.append(contain.copy())
                    contain.clear()
            except AttributeError:
                try:
                    # No score table: keep the notice text (div.ts) in place of the six table cells.
                    bs_parser = bs.find("div", class_="ts").find("h3").get_text()
                    contain.append(bs_school_name)
                    contain.append(location[l])
                    contain.append(cat[z])
                    contain.extend([bs_parser] * 6)
                    result.append(contain.copy())
                    contain.clear()
                except AttributeError:
                    # Same fallback for the alternate notice block (div.ts.lq).
                    bs_parser = bs.find("div", class_="ts lq").find("h3").get_text()
                    contain.append(bs_school_name)
                    contain.append(location[l])
                    contain.append(cat[z])
                    contain.extend([bs_parser] * 6)
                    result.append(contain.copy())
                    contain.clear()
        except AttributeError:
            # Page did not render as expected (e.g. blocked or empty); retry the same request.
            print("'NoneType' object has no attribute 'find'")
            return get_data(school_num, l, z)
    except requests.exceptions.ProxyError:
        print("ProxyError!")
        return get_data(school_num, l, z)
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError!")
        return get_data(school_num, l, z)
    s.close()
    return result


# start = time.time()
# test_2 = get_data(school_num=1)
# end = time.time()
# print("Elapsed: %s" % (end - start))

final = []
start = time.time()  # wall-clock start for the elapsed-time report below
for q in range(1021, 1031):  # 2668
    for w in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
              21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 38, 39]:
        for ze in [1, 2, 3, 4, 5]:
            res_uid = get_data(q, w, ze)
            final.extend(res_uid)
            # time.sleep(0.5 + random.random())
end = time.time()
print("Total elapsed: %f" % (end - start))
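# Optional, hedged sketch (not part of the original script): persist the collected
# rows to CSV so a run's output survives the process. The filename
# "gaokao_scores.csv" is an assumption; each row in `final` is
# [school name, province, category] followed by the six cells scraped from the
# score table (or a notice string repeated six times when no table exists).
import csv

with open("gaokao_scores.csv", "w", newline="", encoding="utf-8-sig") as out_file:
    csv.writer(out_file).writerows(final)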