import requests
from bs4 import BeautifulSoup
import re
import time
import random

import get_auto_id
def crawler_program(url):
    """Crawl one review ("口碑") page on k.autohome.com.cn and return the parsed reviews."""
    res = {}
    results = []
    # rotate between a few desktop/mobile header sets
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
                              " Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
                              " like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
                              "like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=random.choice(headers), allow_redirects=False).text
    if html != '':
        bs = BeautifulSoup(html, "html.parser")
        time.sleep(0.2)
        try:
            # each "mouthcon" div is one review card
            obj = bs.find("div", id="maodian").find_all("div", class_="mouthcon")
            if not obj:
                raise AttributeError
            else:
                for i in obj:
                    date = i.find("div", class_="mouthcon-cont-right commentParentBox").find("div", class_="title-name name-width-01").find("a").get_text()
                    try:
                        name = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', i.find("div", class_="mouthcon-cont-left").find("div", class_="name-text").get_text())[0]
                    except IndexError:
                        name = "用户名含有特殊字符"  # placeholder: "user name contains special characters"
                    res["用户名"] = [name]    # user name
                    res["发帖日期"] = [date]  # post date
                    # every <dl class="choose-dl"> holds one labelled field (model, price, ratings, ...)
                    object_text = i.find("div", class_="choose-con mt-10").find_all("dl", class_="choose-dl")
                    for ii in object_text:
                        try:
                            title = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', ii.find("dt").get_text())[0]
                        except IndexError:
                            title = ""
                        content = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', ii.find("dd").get_text())
                        res[title] = content
                    results.append(res.copy())
                    res.clear()
                s.close()
                return results
        except AttributeError:
            try:
                # no review cards found: check whether the page shows the "no matching reviews" notice
                obj = bs.find("div", id="maodian").find("div", class_="mouth")
                text = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', obj.find("div", class_="text-normal").get_text())[0]
                if text == "暂无符合该列表的口碑":  # "no reviews match this list"
                    results.append(text)
                    print(text)
                    s.close()
                    return results
                else:
                    results.append("未知错误")  # unknown error
                    print("未知错误")
                    return results
            except AttributeError:
                # the page structure did not match at all (likely blocked); wait and retry the same URL
                print("AttributeError")
                time.sleep(5)
                s.close()
                return crawler_program(url)
    elif html == '':
        # an empty body means the request was redirected, i.e. the model has no pages in this sell state
        results.append("该车型不存在在售状态")  # "this model has no on-sale listing"
        print("该车型不存在在售状态")
        s.close()
        return results
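
# Illustrative usage of crawler_program (the series id 4097 in the URL below is an
# assumption, not taken from the original code): it returns a list of dicts, one per
# review, or a single status string when the page has no reviews.
#
#   rows = crawler_program("https://k.autohome.com.cn/4097/#pvareaid=102519")
#   print(rows[0])
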
def gen_url(auto_id, sell_state):
    """Build the list of review-page URLs for one series, depending on its sell state."""
    url_lists = []
    s = requests.Session()
    try:
        if sell_state == "在售":  # on sale
            urls = "https://k.autohome.com.cn/" + auto_id + "/index_2.html#dataList"
            html = s.get(urls, allow_redirects=False).text
            if html == '':
                raise AttributeError
            else:
                bs = BeautifulSoup(html, "html.parser")
                # the pager label carries the total page count between two characters (e.g. "共N页")
                page_num = int(bs.find("span", class_="page-item-info").get_text()[1:-1])
                for ai in range(2, page_num + 1):
                    url_lists.append("https://k.autohome.com.cn/" + auto_id + "/index_" + str(ai) + ".html#dataList")
                url_lists.insert(0, "https://k.autohome.com.cn/" + auto_id + "/#pvareaid=102519")
        elif sell_state == "停售":  # discontinued
            urls = "https://k.autohome.com.cn/" + auto_id + "/StopSelling/index_2.html#dataList"
            html = s.get(urls, allow_redirects=False).text
            if html == '':
                raise AttributeError
            else:
                bs = BeautifulSoup(html, "html.parser")
                page_num = int(bs.find("span", class_="page-item-info").get_text()[1:-1])
                for ai in range(2, page_num + 1):
                    url_lists.append("https://k.autohome.com.cn/" + auto_id + "/StopSelling/index_" + str(ai) + ".html#dataList")
                url_lists.insert(0, "https://k.autohome.com.cn/" + auto_id + "/StopSelling/#pvareaid=102519")
        s.close()
        return url_lists
    except AttributeError:
        # pager not found (or page 2 redirected): fall back to the first page only
        if sell_state == "在售":
            url_lists.append("https://k.autohome.com.cn/" + auto_id + "/#pvareaid=102519")
        elif sell_state == "停售":
            url_lists.append("https://k.autohome.com.cn/" + auto_id + "/StopSelling/#pvareaid=102519")
        return url_lists
def data_clear(res):
    """Flatten each review dict into a fixed-order row of columns."""
    result = []
    contain = []
    for xi in res:
        for xii in ["车型代码", "销售状态", "用户名", "发帖日期", "购买车型", "购买地点", "购车经销商", "购买时间", "裸车购买价", "油耗目前行驶", "耗电量目前行驶", "空间", "动力", "操控", "油耗", "舒适性", "外观", "内饰", "性价比", "购车目的"]:
            if xii == "车型代码":      # series id
                contain.append(xi[xii][0])
            if xii == "销售状态":      # sell state
                contain.append(xi[xii][0])
            if xii in ["用户名", "发帖日期"]:  # user name, post date
                contain.append(xi[xii][0])
            if xii == "购买车型":      # model bought: first token is the model, the rest is the trim
                chexing = xi[xii][0]
                xinghao = " ".join(xi[xii][1:])
                contain.append(chexing)
                contain.append(xinghao)
            try:
                if xii == "购买地点":  # purchase place
                    contain.append(xi[xii][0])
            except (KeyError, IndexError):
                contain.append("")
            try:
                if xii == "购车经销商":  # dealer column is intentionally left blank
                    contain.append("")
            except (KeyError, IndexError):
                contain.append("")
            try:
                if xii == "购买时间":  # purchase time
                    contain.append(xi[xii][0])
            except (KeyError, IndexError):
                contain.append("")
            if xii == "裸车购买价":    # bare-car purchase price
                contain.append("".join(xi[xii]))
            try:
                if xii == "油耗目前行驶":  # fuel consumption / current mileage, split into two columns
                    contain.append("".join(xi[xii][:2]))
                    contain.append("".join(xi[xii][2:])[3:])
            except (KeyError, IndexError):
                contain.append("")
                contain.append("")
            try:
                if xii == "耗电量目前行驶":  # power consumption / current mileage (EVs), split into two columns
                    contain.append("".join(xi[xii][:2]))
                    contain.append("".join(xi[xii][2:])[3:])
            except (KeyError, IndexError):
                contain.append("")
                contain.append("")
            try:
                # the eight rating scores: space, power, handling, fuel use, comfort, exterior, interior, value
                if xii in ["空间", "动力", "操控", "油耗", "舒适性", "外观", "内饰", "性价比"]:
                    contain.append(xi[xii][0])
            except KeyError:
                # electric models report "耗电量" (power use) instead of "油耗" (fuel use)
                if xii == "油耗":
                    contain.append(xi.get("耗电量", [""])[0])
                else:
                    contain.append("")
            if xii == "购车目的":  # purchase purpose
                contain.append(" ".join(xi[xii]))
        print(contain)
        result.append(contain.copy())
        contain.clear()
    return result
def get_auto_series_data(series_id, sell_state):
    """Crawl every review page of one series and return the cleaned rows."""
    url_list = gen_url(series_id, sell_state)
    print(url_list)
    rr = []
    for i in url_list:
        r = crawler_program(i)
        if isinstance(r[0], dict):
            # normal case: tag every review with the series id and sell state, then flatten
            for ii in r:
                ii["车型代码"] = [series_id]   # series id
                ii["销售状态"] = [sell_state]  # sell state
            r_clear = data_clear(r)
            rr.extend(r_clear)
        else:
            # the crawler returned a status string instead of review dicts
            rr.append(series_id)
            rr.append(r[0])
    return rr
auto_id_list = get_auto_id.get_auto_id_data()  # factory ids for every brand, 308 in total
auto_series_list = get_auto_id.get_specific_model_auto(auto_id_list)  # series ids per factory, 2226 in total
final = []
for s in auto_series_list[500:700]:
    print(s)
    tt = get_auto_series_data(str(s[1]), "在售")  # crawl the "on sale" reviews of this series
    final.append(tt)
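
# Optional sketch (not part of the original script): one way to persist the rows collected
# in `final`. The output filename is an assumption for illustration.
import csv

with open("autohome_reviews.csv", "w", newline="", encoding="utf8") as fout:
    writer = csv.writer(fout)
    for series_rows in final:
        if series_rows and isinstance(series_rows[0], list):
            writer.writerows(series_rows)  # one flattened row per review
        else:
            writer.writerow(series_rows)   # [series_id, status string] for series without reviews
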
# get_auto_id.py -- the helper module imported by the crawler above
import requests
import requests.exceptions
import time
import random
import json


def read_csv(name):
    """Read the first column of a CSV file (one brand id per row) from the desktop."""
    import csv
    csv_file = csv.reader(open("C:\\Users\\mengxin\\Desktop\\" + name + ".csv", "r", encoding="utf8"))
    object_website = []
    for i in csv_file:
        object_website.append(i[0])
    return object_website
def get_auto_id(brand_id):
    """Fetch the factory list for one brand from the autohome ajax endpoint."""
    url = "https://k.autohome.com.cn/ajax/factorybybrand?&brandid=" + brand_id + "&state=0X001C&typeid=0"
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
                              " Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
                              " like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
                              "like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    # proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # proxy tunnel credentials
    proxyUser = "H21HT1P7C30F1P5D"
    proxyPass = "8F451A3BD7C939D0"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    # note: this proxies dict is built but not passed to the request below
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    try:
        s = requests.Session()
        # the body is decoded twice: .json() yields a string here, which json.loads turns into a dict
        js = s.get(url, headers=random.choice(headers), allow_redirects=False).json()
        js = json.loads(js)
        time.sleep(0.2)
        result = []
        if js["message"] == "成功":  # "success"
            res = js["result"]["factoryitems"]
            for i in res:
                name = i["name"]
                idn = i["id"]
                first_letter = i["ffirstletter"]
                result.append([name, idn, first_letter])
        s.close()
        return result
    except requests.exceptions.ProxyError:
        print("ProxyError")
        time.sleep(2)
        return get_auto_id(brand_id)
    except json.decoder.JSONDecodeError:
        print("JSONDecodeError")
        time.sleep(2)
        return get_auto_id(brand_id)
def get_auto_id_data():
    """Return one [brand_id, factory_name, factory_id, first_letter] record per factory."""
    brand_id_list = read_csv("brand_id_list")
    obj_dict = {}
    brand_id_list[0] = "117"  # overwrite the first CSV row (presumably a header) with a brand id
    for k in brand_id_list:
        get_auto_id_list = get_auto_id(k)
        print(get_auto_id_list)
        obj_dict[k] = get_auto_id_list
    auto_id_list = []
    for m in obj_dict:
        brand = obj_dict[m]
        for n in brand:
            n.insert(0, m)  # prepend the brand id to each factory record
            auto_id_list.append(n)
    return auto_id_list
def get_specific_model_auto(auto_id_list):
    """Return one [name, series_id, first_letter, state, order] record per series."""
    ress = []
    results = []
    for d in auto_id_list:
        brand_id = d[0]
        factory_id = d[2]
        try:
            url = "https://k.autohome.com.cn/ajax/seriesbyfactory?&brandid=" + brand_id + "&factoryid=" + str(factory_id) + "&state=0X001C&typeid=0"
            s = requests.Session()
            # as in get_auto_id, the body is decoded twice: .json() yields a string, json.loads a dict
            js = s.get(url, allow_redirects=False).json()
            js = json.loads(js)
            if js["message"] == "成功":  # "success"
                res = js["result"]["seriesitems"]
                for ii in res[:]:
                    name = ii["name"]
                    idn = ii["id"]
                    sfirst_letter = ii["sfirstletter"]
                    state = ii["seriesstate"]
                    order = ii["seriesorder"]
                    results.append([name, idn, sfirst_letter, state, order])
                print(results)
                ress.extend(results.copy())
                results.clear()
        except requests.exceptions.ProxyError:
            print("ProxyError")
            time.sleep(3)
            return get_specific_model_auto(auto_id_list)
        except json.decoder.JSONDecodeError:
            print("JSONDecodeError")
            time.sleep(3)
            return get_specific_model_auto(auto_id_list)
        s.close()
    return ress
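
# Illustrative standalone usage of this module (it mirrors the calls made by the crawler
# script above; the final print is just for inspection):
#
#   if __name__ == "__main__":
#       factories = get_auto_id_data()               # [brand_id, name, factory_id, letter]
#       series = get_specific_model_auto(factories)  # [name, series_id, letter, state, order]
#       print(len(series))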