import requests
from bs4 import BeautifulSoup
import re
import time
import random

import get_auto_id


def crawler_program(url):
    '''Scrape one Autohome 口碑 (review) list page and return a list of dicts, one per review.'''
    res = {}
    results = []
    headers = [
        {'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
                       " Chrome/69.0.3497.100 Safari/537.36",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
                       " like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
                       "like Gecko) Version/7.0.3 Safari/7046A194A",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
         'Accept': 'application/json, text/plain, */*',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
         'Accept': 'application/json, text/plain, */*',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
    ]
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=random.choice(headers), allow_redirects=False).text
    if html != '':
        bs = BeautifulSoup(html, "html.parser")
        time.sleep(0.2)
        try:
            # Every review block on the page sits under div#maodian.
            obj = bs.find("div", id="maodian").find_all("div", class_="mouthcon")
            if not obj:
                raise AttributeError
            else:
                for i in obj:
                    date = (i.find("div", class_="mouthcon-cont-right commentParentBox")
                             .find("div", class_="title-name name-width-01").find("a").get_text())
                    try:
                        name = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+',
                                          i.find("div", class_="mouthcon-cont-left")
                                           .find("div", class_="name-text").get_text())[0]
                    except IndexError:
                        name = "用户名含有特殊字符"  # username contains special characters
                    # print(name)
                    res["用户名"] = [name]    # username
                    res["发帖日期"] = [date]  # post date
                    object_text = i.find("div", class_="choose-con mt-10").find_all("dl", class_="choose-dl")
                    for ii in object_text:
                        try:
                            title = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', ii.find("dt").get_text())[0]
                        except IndexError:
                            title = ""
                        # print(title)
                        content = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', ii.find("dd").get_text())
                        res[title] = content
                    # print(res)
                    results.append(res.copy())
                    res.clear()
                s.close()
                return results
        except AttributeError:
            # Either the series has no reviews yet or the page layout is unexpected.
            try:
                obj = bs.find("div", id="maodian").find("div", class_="mouth")
                text = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+',
                                  obj.find("div", class_="text-normal").get_text())[0]
                if text == "暂无符合该列表的口碑":  # "no reviews match this list"
                    results.append(text)
                    print(text)
                    s.close()
                    return results
                else:
                    results.append("未知错误")  # unknown error
                    print("未知错误")
                    return results
            except AttributeError:
                print("AttributeError")
                time.sleep(5)
                s.close()
                return crawler_program(url)  # retry the same page
    else:
        results.append("该车型不存在在售状态")  # this model has no pages in the requested sale state
        print("该车型不存在在售状态")
        s.close()
        return results


def gen_url(auto_id, sell_state):
    '''Build the full list of paginated review-list URLs for one series.'''
    url_lists = []
    s = requests.Session()
    try:
        if sell_state == "在售":  # on sale
            urls = "https://k.autohome.com.cn/" + auto_id + "/index_2.html#dataList"
            html = s.get(urls, allow_redirects=False).text
            if html == '':
                raise AttributeError  # only one page exists
            else:
                bs = BeautifulSoup(html, "html.parser")
                page_num = int(bs.find("span", class_="page-item-info").get_text()[1:-1])
                for ai in range(2, page_num + 1):
                    url_lists.append("https://k.autohome.com.cn/" + auto_id + "/index_" + str(ai) + ".html#dataList")
                url_lists.insert(0, "https://k.autohome.com.cn/" + auto_id + "/#pvareaid=102519")
        elif sell_state == "停售":  # discontinued
            urls = "https://k.autohome.com.cn/" + auto_id + "/StopSelling/index_2.html#dataList"
            html = s.get(urls, allow_redirects=False).text
            if html == '':
                raise AttributeError
            else:
                bs = BeautifulSoup(html, "html.parser")
                page_num = int(bs.find("span", class_="page-item-info").get_text()[1:-1])
                for ai in range(2, page_num + 1):
                    url_lists.append("https://k.autohome.com.cn/" + auto_id + "/StopSelling/index_" + str(ai) + ".html#dataList")
                url_lists.insert(0, "https://k.autohome.com.cn/" + auto_id + "/StopSelling/#pvareaid=102519")
        s.close()
        return url_lists
    except AttributeError:
        # Pagination info missing: fall back to the single first page.
        if sell_state == "在售":
            url_lists.append("https://k.autohome.com.cn/" + auto_id + "/#pvareaid=102519")
        elif sell_state == "停售":
            url_lists.append("https://k.autohome.com.cn/" + auto_id + "/StopSelling/#pvareaid=102519")
        return url_lists


def data_clear(res):
    '''Flatten the review dicts into fixed-order rows, padding missing fields with "".'''
    result = []
    contain = []
    for xi in res:
        for xii in ["车型代码", "销售状态", "用户名", "发帖日期", "购买车型", "购买地点", "购车经销商",
                    "购买时间", "裸车购买价", "油耗目前行驶", "耗电量目前行驶", "空间", "动力", "操控",
                    "油耗", "舒适性", "外观", "内饰", "性价比", "购车目的"]:
            if xii == "车型代码":  # series id
                contain.append(xi[xii][0])
            if xii == "销售状态":  # sale state
                contain.append(xi[xii][0])
            if xii in ["用户名", "发帖日期"]:  # username, post date
                contain.append(xi[xii][0])
            if xii == "购买车型":  # model bought: split into series name and trim
                chexing = xi[xii][0]
                xinghao = " ".join(xi[xii][1:])
                contain.append(chexing)
                contain.append(xinghao)
            try:
                if xii == "购买地点":  # place of purchase
                    contain.append(xi[xii][0])
            except (KeyError, IndexError):
                contain.append("")
            try:
                if xii == "购车经销商":  # dealer: kept as an empty placeholder column
                    contain.append("")
            except (KeyError, IndexError):
                contain.append("")
            try:
                if xii == "购买时间":  # time of purchase
                    contain.append(xi[xii][0])
            except (KeyError, IndexError):
                contain.append("")
            if xii == "裸车购买价":  # bare-car purchase price
                contain.append("".join(xi[xii]))
            try:
                if xii == "油耗目前行驶":  # fuel consumption + current mileage
                    contain.append("".join(xi[xii][:2]))
                    contain.append("".join(xi[xii][2:])[3:])
            except (KeyError, IndexError):
                contain.append("")
                contain.append("")
            try:
                if xii == "耗电量目前行驶":  # power consumption + current mileage
                    contain.append("".join(xi[xii][:2]))
                    contain.append("".join(xi[xii][2:])[3:])
            except (KeyError, IndexError):
                contain.append("")
                contain.append("")
            try:
                if xii in ["空间", "动力", "操控", "油耗", "舒适性", "外观", "内饰", "性价比"]:  # rating scores
                    contain.append(xi[xii][0])
            except KeyError:
                # Electric models report "耗电量" (power consumption) instead of "油耗" (fuel consumption);
                # fall back to it, otherwise keep the column with an empty value.
                if xii == "油耗" and "耗电量" in xi:
                    contain.append(xi["耗电量"][0])
                else:
                    contain.append("")
            if xii == "购车目的":  # purpose of purchase
                contain.append(" ".join(xi[xii]))
        print(contain)
        result.append(contain.copy())
        contain.clear()
    return result


def get_auto_series_data(series_id, sell_state):
    '''Crawl every review page of one series and return the cleaned rows.'''
    url_list = gen_url(series_id, sell_state)
    print(url_list)
    rr = []
    for i in url_list:
        r = crawler_program(i)
        if isinstance(r[0], dict):
            for ii in r:
                ii["车型代码"] = [series_id]
                ii["销售状态"] = [sell_state]
            r_clear = data_clear(r)
            rr.extend(r_clear)
        else:
            # crawler_program returned a status message instead of review dicts.
            rr.append(series_id)
            rr.append(r[0])
    return rr


auto_id_list = get_auto_id.get_auto_id_data()  # fetch the factory ids for every brand (308 in total)
auto_series_list = get_auto_id.get_specific_model_auto(auto_id_list)  # fetch the series ids under each factory (2226 in total)
final = []
for s in auto_series_list[500:700]:
    print(s)
    tt = get_auto_series_data(str(s[1]), "在售")
    final.append(tt)
# get_auto_id.py -- helper module imported by the crawler above.
import requests
import requests.exceptions
import time
import random
import json


def read_csv(name):
    '''Read a CSV file from the desktop and return its first column as a list.'''
    import csv
    csv_file = csv.reader(open("C:\\Users\\mengxin\\Desktop\\" + name + ".csv", "r", encoding="utf8"))
    object_website = []
    for i in csv_file:
        object_website.append(i[0])
        # print(i)
    return object_website


def get_auto_id(brand_id):
    '''Fetch the factory list for one brand id from the Autohome ajax endpoint.'''
    url = "https://k.autohome.com.cn/ajax/factorybybrand?&brandid=" + brand_id + "&state=0X001C&typeid=0"
    headers = [
        {'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
                       " Chrome/69.0.3497.100 Safari/537.36",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
                       " like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
                       "like Gecko) Version/7.0.3 Safari/7046A194A",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
         'Accept': 'application/json, text/plain, */*',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
         'Accept': 'application/json, text/plain, */*',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
    ]
    # proxy server
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # proxy tunnel credentials
    proxyUser = "H21HT1P7C30F1P5D"
    proxyPass = "8F451A3BD7C939D0"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    # NOTE: `proxies` is built but never passed to s.get(), so requests go out directly
    # and the ProxyError handler below is effectively unused.
    try:
        s = requests.Session()
        js = s.get(url, headers=random.choice(headers), allow_redirects=False).json()
        js = json.loads(js)  # the response body is double-encoded JSON, so decode it a second time
        time.sleep(0.2)
        result = []
        if js["message"] == "成功":  # "success"
            res = js["result"]["factoryitems"]
            for i in res:
                name = i["name"]
                idn = i["id"]
                first_letter = i["ffirstletter"]
                result.append([name, idn, first_letter])
        s.close()
        return result
    except requests.exceptions.ProxyError:
        print("ProxyError")
        time.sleep(2)
        return get_auto_id(brand_id)
    except json.decoder.JSONDecodeError:
        print("JSONDecodeError")
        time.sleep(2)
        return get_auto_id(brand_id)


def get_auto_id_data():
    '''Read the brand id list and return [brand_id, factory_name, factory_id, first_letter] rows.'''
    brand_id_list = read_csv("brand_id_list")
    obj_dict = {}
    brand_id_list[0] = "117"  # the first CSV cell (presumably a header) is replaced with a real brand id
    for k in brand_id_list:
        get_auto_id_list = get_auto_id(k)
        print(get_auto_id_list)
        obj_dict[k] = get_auto_id_list
    auto_id_list = []
    for m in obj_dict:
        brand = obj_dict[m]
        for n in brand:
            n.insert(0, m)
            auto_id_list.append(n)
    return auto_id_list


def get_specific_model_auto(auto_id_list):
    '''For every factory, fetch its series and return [name, series_id, first_letter, state, order] rows.'''
    ress = []
    results = []
    for d in auto_id_list:
        brand_id = d[0]
        factory_id = d[2]
        try:
            url = ("https://k.autohome.com.cn/ajax/seriesbyfactory?&brandid=" + brand_id
                   + "&factoryid=" + str(factory_id) + "&state=0X001C&typeid=0")
            s = requests.Session()
            js = s.get(url, allow_redirects=False).json()
            js = json.loads(js)  # double-encoded JSON, as in get_auto_id()
            if js["message"] == "成功":  # "success"
                res = js["result"]["seriesitems"]
                for ii in res[:]:
                    # print(ii)
                    name = ii["name"]
                    idn = ii["id"]
                    sfirst_letter = ii["sfirstletter"]
                    state = ii["seriesstate"]
                    order = ii["seriesorder"]
                    results.append([name, idn, sfirst_letter, state, order])
                print(results)
            ress.extend(results.copy())
            results.clear()
        except requests.exceptions.ProxyError:
            print("ProxyError")
            time.sleep(3)
            return get_specific_model_auto(auto_id_list)
        except json.decoder.JSONDecodeError:
            print("JSONDecodeError")
            time.sleep(3)
            return get_specific_model_auto(auto_id_list)
        s.close()
    return ress
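# Assumed input format (not shown in the original): read_csv("brand_id_list") expects a
# one-column CSV at C:\Users\mengxin\Desktop\brand_id_list.csv with one Autohome brand id
# per row, e.g.
#
#     117
#     ... (one brand id per line)
#
# Only the first cell of each row is kept, so any extra columns would be ignored.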