import requests
from bs4 import BeautifulSoup
import re
import time
import random
import get_auto_id
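# Crawler script: scrape owner reviews ("口碑") from k.autohome.com.cn for a
# batch of car series and flatten each review into a row via data_clear().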
def crawler_program(url):
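    '''Fetch one review-list page and return a list of review dicts;
    when the page has no reviews, return a single status string in a list.'''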
res = {}
results = []
headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
" like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
"like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'}]
s = requests.Session()
    s.keep_alive = False  # no-op: requests.Session has no keep_alive attribute; the 'Connection: close' header above is what prevents reuse
html = s.get(url, headers=random.choice(headers), allow_redirects=False).text
if html != '':
bs = BeautifulSoup(html, "html.parser")
time.sleep(0.2)
try:
            obj = bs.find("div", id="maodian").find_all("div", class_="mouthcon")  # one "mouthcon" block per review
if not obj:
raise AttributeError
else:
for i in obj:
date = i.find("div", class_="mouthcon-cont-right commentParentBox").find("div", class_="title-name name-width-01").find("a").get_text()
try:
name = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', i.find("div", class_="mouthcon-cont-left").find("div", class_="name-text").get_text())[0]
except IndexError:
name = "用户名含有特殊字符"
# print(name)
res["用户名"] = [name]
res["发帖日期"] = [date]
object_text = i.find("div", class_="choose-con mt-10").find_all("dl", class_="choose-dl")
for ii in object_text:
try:
title = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', ii.find("dt").get_text())[0]
except IndexError:
title = ""
# print(title)
content = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', ii.find("dd").get_text())
res[title] = content
# print(res)
results.append(res.copy())
res.clear()
s.close()
return results
except AttributeError:
try:
obj = bs.find("div", id="maodian").find("div", class_="mouth")
text = re.findall('[.a-zA-Z0-9_\u4e00-\u9fa5]+', obj.find("div", class_="text-normal").get_text())[0]
if text == "暂无符合该列表的口碑":
results.append(text)
print(text)
s.close()
return results
else:
results.append("未知错误")
print("未知错误")
return results
except AttributeError:
print("AttributeError")
time.sleep(5)
s.close()
return crawler_program(url)
    elif html == '':  # an empty body (the redirect was not followed) means the series has no on-sale reviews
results.append("该车型不存在在售状态")
print("该车型不存在在售状态")
s.close()
return results
def gen_url(auto_id, sell_state):
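    '''Build the list of paginated review-page URLs for one series,
    for either the on-sale ("在售") or discontinued ("停售") listing.'''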
url_lists = []
s = requests.Session()
try:
if sell_state == "在售":
urls = "https://k.autohome.com.cn/" + auto_id + "/index_2.html#dataList"
html = s.get(urls, allow_redirects=False).text
if html == '':
raise AttributeError
else:
bs = BeautifulSoup(html, "html.parser")
page_num = int(bs.find("span", class_="page-item-info").get_text()[1:-1])
for ai in range(2, page_num + 1):
url_lists.append("https://k.autohome.com.cn/" + auto_id + "/index_" + str(ai) + ".html#dataList")
url_lists.insert(0, "https://k.autohome.com.cn/" + auto_id + "/#pvareaid=102519")
elif sell_state == "停售":
urls = "https://k.autohome.com.cn/" + auto_id + "/StopSelling/index_2.html#dataList"
html = s.get(urls, allow_redirects=False).text
if html == '':
raise AttributeError
else:
bs = BeautifulSoup(html, "html.parser")
page_num = int(bs.find("span", class_="page-item-info").get_text()[1:-1])
for ai in range(2, page_num + 1):
url_lists.append("https://k.autohome.com.cn/" + auto_id + "/StopSelling/index_" + str(ai) + ".html#dataList")
url_lists.insert(0, "https://k.autohome.com.cn/" + auto_id + "/StopSelling/#pvareaid=102519")
s.close()
return url_lists
except AttributeError:
if sell_state == "在售":
url_lists.append("https://k.autohome.com.cn/" + auto_id + "/#pvareaid=102519")
elif sell_state == "停售":
url_lists.append("https://k.autohome.com.cn/" + auto_id + "/StopSelling/#pvareaid=102519")
return url_lists
def data_clear(res):
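    '''Flatten each review dict into a fixed-order row matching the column
    list below, padding missing fields with empty strings.'''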
result = []
contain = []
for xi in res:
for xii in ["车型代码", "销售状态", "用户名", "发帖日期", "购买车型", "购买地点", "购车经销商", "购买时间", "裸车购买价", "油耗目前行驶", "耗电量目前行驶", "空间", "动力", "操控", "油耗", "舒适性", "外观", "内饰", "性价比", "购车目的"]:
if xii == "车型代码":
contain.append(xi[xii][0])
if xii == "销售状态":
contain.append(xi[xii][0])
if xii in ["用户名", "发帖日期"]:
contain.append(xi[xii][0])
if xii == "购买车型":
chexing = xi[xii][0]
xinghao = " ".join(xi[xii][1:])
contain.append(chexing)
contain.append(xinghao)
try:
if xii == "购买地点":
contain.append(xi[xii][0])
            except (KeyError, IndexError):
contain.append("")
try:
if xii == "购车经销商":
contain.append("")
            except (KeyError, IndexError):
contain.append("")
try:
if xii == "购买时间":
contain.append(xi[xii][0])
            except (KeyError, IndexError):
contain.append("")
if xii == "裸车购买价":
contain.append("".join(xi[xii]))
try:
if xii == "油耗目前行驶":
contain.append("".join(xi[xii][:2]))
contain.append("".join(xi[xii][2:])[3:])
            except (KeyError, IndexError):
contain.append("")
contain.append("")
try:
if xii == "耗电量目前行驶":
contain.append("".join(xi[xii][:2]))
contain.append("".join(xi[xii][2:])[3:])
            except (KeyError, IndexError):
contain.append("")
contain.append("")
            try:
                if xii in ["空间", "动力", "操控", "油耗", "舒适性", "外观", "内饰", "性价比"]:
                    contain.append(xi[xii][0])
            except KeyError:
                # reviews without a "油耗" score (e.g. electric cars) provide "耗电量" instead
                if xii == "油耗":
                    contain.append((xi.get("耗电量") or [""])[0])
                else:
                    contain.append("")
if xii == "购车目的":
contain.append(" ".join(xi[xii]))
print(contain)
result.append(contain.copy())
contain.clear()
return result
def get_auto_series_data(series_id, sell_state):
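    '''Crawl every review page of one series and return the cleaned rows,
    or the series id plus a status string when nothing could be parsed.'''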
url_list = gen_url(series_id, sell_state)
print(url_list)
rr = []
for i in url_list:
r = crawler_program(i)
if isinstance(r[0], dict):
for ii in r:
ii["车型代码"] = [series_id]
ii["销售状态"] = [sell_state]
r_clear = data_clear(r)
rr.extend(r_clear)
else:
rr.append(series_id)
rr.append(r[0])
return rr
auto_id_list = get_auto_id.get_auto_id_data()  # fetch the factory ids for every brand (308 in total)
auto_series_list = get_auto_id.get_specific_model_auto(auto_id_list)  # fetch the series ids under each factory (2,226 in total)
final = []
for s in auto_series_list[500:700]:  # process one slice of the series list at a time
print(s)
tt = get_auto_series_data(str(s[1]), "在售")
final.append(tt)
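# ---------------------------------------------------------------------------
# get_auto_id module (imported at the top of the crawler script above):
# helpers that collect brand, factory and series ids from the autohome ajax
# endpoints and from a local CSV of brand ids.
# ---------------------------------------------------------------------------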
import requests
import requests.exceptions
import time
import random
import json
def read_csv(name):
    '''Read the first column of a CSV file into a list.'''
import csv
csv_file = csv.reader(open("C:\\Users\\mengxin\\Desktop\\" + name + ".csv", "r", encoding="utf8"))
object_website = []
for i in csv_file:
object_website.append(i[0])
# print(i)
return object_website
def get_auto_id(brand_id):
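    '''Query the factory-by-brand ajax endpoint and return
    [name, id, first letter] for each factory under the given brand.'''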
url = "https://k.autohome.com.cn/ajax/factorybybrand?&brandid=" + brand_id + "&state=0X001C&typeid=0"
headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML,"
" like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.4",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, "
"like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'}]
    # proxy server
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
    # proxy tunnel credentials
proxyUser = "H21HT1P7C30F1P5D"
proxyPass = "8F451A3BD7C939D0"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host": proxyHost,
"port": proxyPort,
"user": proxyUser,
"pass": proxyPass,
}
proxies = {
"http": proxyMeta,
"https": proxyMeta,
}
try:
s = requests.Session()
        # pass the proxy configured above (the ProxyError handler below expects one);
        # the endpoint returns a JSON-encoded string, hence the second decode
        js = s.get(url, headers=random.choice(headers), proxies=proxies, allow_redirects=False).json()
        js = json.loads(js)
time.sleep(0.2)
result = []
if js["message"] == "成功":
res = js["result"]["factoryitems"]
for i in res:
name = i["name"]
idn = i["id"]
first_letter = i["ffirstletter"]
result.append([name, idn, first_letter])
s.close()
return result
except requests.exceptions.ProxyError:
print("ProxyError")
time.sleep(2)
return get_auto_id(brand_id)
except json.decoder.JSONDecodeError:
print("JSONDecodeError")
time.sleep(2)
return get_auto_id(brand_id)
def get_auto_id_data():
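    '''Read the brand id list from CSV and return one
    [brand_id, factory name, factory id, first letter] entry per factory.'''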
brand_id_list = read_csv("brand_id_list")
obj_dict = {}
    brand_id_list[0] = "117"  # overwrite the first CSV entry (presumably a header row) with a real brand id
for k in brand_id_list:
get_auto_id_list = get_auto_id(k)
print(get_auto_id_list)
obj_dict[k] = get_auto_id_list
auto_id_list = []
for m in obj_dict:
brand = obj_dict[m]
for n in brand:
n.insert(0, m)
auto_id_list.append(n)
return auto_id_list
def get_specific_model_auto(auto_id_list):
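    '''For each [brand_id, name, factory_id, ...] entry, query the
    series-by-factory endpoint and return
    [name, id, first letter, state, order] for every series.'''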
ress = []
results = []
for d in auto_id_list:
brand_id = d[0]
factory_id = d[2]
try:
url = "https://k.autohome.com.cn/ajax/seriesbyfactory?&brandid=" + brand_id + "&factoryid=" + str(factory_id) + "&state=0X001C&typeid=0"
s = requests.Session()
js = s.get(url, allow_redirects=False).json()
            js = json.loads(js)  # the endpoint returns a JSON-encoded string, so decode twice
if js["message"] == "成功":
res = js["result"]["seriesitems"]
for ii in res[:]:
# print(ii)
name = ii["name"]
idn = ii["id"]
sfirst_letter = ii["sfirstletter"]
state = ii["seriesstate"]
order = ii["seriesorder"]
results.append([name, idn, sfirst_letter, state, order])
print(results)
ress.extend(results.copy())
results.clear()
except requests.exceptions.ProxyError:
print("ProxyError")
time.sleep(3)
return get_specific_model_auto(auto_id_list)
except json.decoder.JSONDecodeError:
print("JSONDecodeError")
time.sleep(3)
return get_specific_model_auto(auto_id_list)
s.close()
return ress