import requests
from bs4 import BeautifulSoup
import random
import time
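

# Scrape historical admission score lines from college.gaokao.com: for one
# school id (school_num), province id (l), and subject category (z), fetch
# the "pointbyarea" score table and return one list per table row.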
def get_data(school_num, l, z, retries=5):
    # Province ids used by the site's URLs (note the gaps: 32 and 34-37 are unused).
    location = {1: "北京", 2: "天津", 3: "辽宁", 4: "吉林", 5: "黑龙江", 6: "上海", 7: "江苏", 8: "浙江", 9: "安徽",
                10: "福建", 11: "山东", 12: "湖北", 13: "湖南", 14: "广东", 15: "重庆", 16: "四川", 17: "陕西", 18: "甘肃",
                19: "河北", 20: "山西", 21: "内蒙古", 22: "河南", 23: "海南", 24: "广西", 25: "贵州", 26: "云南", 27: "西藏", 28: "青海",
                29: "宁夏", 30: "新疆", 31: "江西", 33: "香港", 38: "澳门", 39: "台湾"}
    # Subject-category ids used by the site.
    cat = {1: "理科", 2: "文科", 3: "综合", 4: "艺术理", 5: "艺术文"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "close",  # close the connection after every request
    }
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Proxy tunnel authentication credentials
    proxyUser = "HX7YIG4D7IR9907D"
    proxyPass = "D9A0153CCBC8081F"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
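    # Abuyun's "http-dyn" endpoint is their dynamic proxy tunnel, which (as far
    # as I know) rotates the exit IP on every request, so together with the
    # "Connection: close" header each fetch should come from a fresh address.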
    result = []   # all parsed rows for this (school, province, category)
    contain = []  # scratch buffer for the row currently being built
    s = requests.Session()
    s.keep_alive = False  # no-op in requests; the "Connection: close" header does the work
    # for l in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 38, 39]:
    # for z in [1, 2, 3, 4, 5]:
    # URL scheme: /school/tinfo/<school_id>/result/<province_id>/<category_id>/
    url = "http://college.gaokao.com/school/tinfo/" + str(school_num) + "/result/" + str(l) + "/" + str(z) + "/"
    try:
        html = s.get(url, headers=headers, proxies=proxies, allow_redirects=False).text
        time.sleep(0.1)
        bs = BeautifulSoup(html, 'lxml')
        try:
            bs_school_name = bs.find("div", class_="bg_sez").find('h2').get_text()
            print("Collecting { %d } %d %s %s %s data" % (school_num, l, bs_school_name, location[l], cat[z]))
            try:
                # Normal case: the score table is present, one <tr> per data row.
                bs_parser = bs.find("div", id="pointbyarea").find_all("tr", class_=True)
                for i in bs_parser:
                    res = i.find_all("td")
                    contain.append(bs_school_name)
                    contain.append(location[l])
                    contain.append(cat[z])
                    for m in res:
                        data = m.get_text()
                        contain.append(data)
                    # print(contain)
                    lis = contain.copy()
                    result.append(lis)
                    contain.clear()
                    # s.close()
                    # print(result)
            except AttributeError:
                # No score table: the page shows a notice banner instead, so
                # record the notice text in place of the six data columns.
                try:
                    bs_parser = bs.find("div", class_="ts").find("h3").get_text()
                    contain.append(bs_school_name)
                    contain.append(location[l])
                    contain.append(cat[z])
                    contain.extend([bs_parser] * 6)
                    lis_1 = contain.copy()
                    result.append(lis_1)
                    # print(contain)
                    contain.clear()
                    # return result
                except AttributeError:
                    # Same notice, but inside a "ts lq" container.
                    bs_parser = bs.find("div", class_="ts lq").find("h3").get_text()
                    contain.append(bs_school_name)
                    contain.append(location[l])
                    contain.append(cat[z])
                    contain.extend([bs_parser] * 6)
                    lis_1 = contain.copy()
                    result.append(lis_1)
                    # s.close()
                    # print(contain)
                    contain.clear()
        except AttributeError:
            # Page skeleton missing (likely a block or error page): retry,
            # bounded so a persistent failure cannot recurse forever.
            print("'NoneType' object has no attribute 'find'")
            if retries > 0:
                return get_data(school_num, l, z, retries - 1)
    except requests.exceptions.ProxyError:
        print("ProxyError!")
        if retries > 0:
            return get_data(school_num, l, z, retries - 1)
        # contain.append(school_num)
        # contain.append(location[l])
        # contain.append(cat[z])
        # contain.extend(["Proxy error!"] * 6)
        # lis_1 = contain.copy()
        # result.append(lis_1)
        # contain.clear()
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError!")
        if retries > 0:
            return get_data(school_num, l, z, retries - 1)
    s.close()
    return result
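
# Each returned row is [school_name, province, category] followed by the text
# of every <td> in one table row (or by the site's notice text repeated six
# times when no table is shown).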
# start = time.time()
# test_2 = get_data(school_num=1, l=1, z=1)
# end = time.time()
# print("Elapsed: %s" % (end - start))
final = []

start = time.time()
for q in range(1021, 1031):  # full school-id range: up to 2668
    for w in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 38, 39]:
        for ze in [1, 2, 3, 4, 5]:
            res_uid = get_data(q, w, ze)
            final.extend(res_uid)
            # time.sleep(0.5 + random.random())
end = time.time()
print("Total elapsed: %f" % (end - start))
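
# The collected rows are never written out above; a minimal sketch that dumps
# `final` to CSV with the standard library (the filename is arbitrary, and no
# header row is written since the number of <td> columns per row depends on
# the site's table):
import csv

with open("gaokao_scores.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerows(final)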