# -*- coding: utf-8 -*-
# @Time : 12/6/18 1:11 AM
# @Author : Wai Mengxin
# @Email : weimengxin2012@hotmail.com
# @File : estate.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import re
import time
import random


def get_building_code(url):
    """Fetch the building codes listed on an estate's transaction-history page."""
res = []
head = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
'Connection': 'close'}
s = requests.Session()
s.keep_alive = False
html = s.get(url, headers=head).text
bs = BeautifulSoup(html, 'lxml')
ob = bs.find("div", id="unitTran-left").find_all("a", href=True)
    for k in ob:
        code = k.attrs["href"]
        res.append(code[-10:])  # the building code is the last 10 characters of the href
s.close()
return res
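
# Example (hypothetical values): each returned entry is a 10-character code in
# the same format as the estate code embedded in `urls` at the bottom of this
# file, e.g. ["XSHNIHZXHN", ...].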


def get_house_id(building_acode):
    """Fetch the IDs of the units (houses) in a given building."""
house_id = []
res = []
url = 'http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=1&code=' + building_acode
print(url)
head = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
'Connection': 'close'}
s = requests.Session()
s.keep_alive = False
html = s.get(url, headers=head).text
bs4 = BeautifulSoup(html, 'lxml')
    ob = bs4.find_all("table", class_="unitTran-sub-table")
    for i in ob:
        # rows with class "trHasTrans" are units that have recorded transactions
        house_num = i.find_all("tr", class_="trHasTrans")
        house_id.extend(house_num)
    for it in house_id:
        res.append(it.attrs["id"])  # the row id holds the unit's acode
s.close()
return res


def get_history_tran(house_id, building_code):
    """Fetch a unit's transaction history via the site's Ajax endpoint."""
url = "http://www1.centadata.com/tfs_centadata/Pih2Sln/Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo"
    # a pool of request headers; one is picked at random per request to reduce blocking
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
'Accept': 'text/html;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'},
{'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
'Accept': 'application/json, text/plain, */*',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection': 'close'}]
    # form payload expected by GenTransactionHistoryPinfo
    info = {
"acode": house_id,
"cblgcode": building_code,
"cci_price": 4,
"cultureInfo": "SimplifiedChinese"
}
    # Optional proxy tunnel (disabled). Credentials removed; substitute your own
    # to enable it.
    # proxyHost = "http-dyn.abuyun.com"
    # proxyPort = "9020"
    # proxyUser = "<PROXY_USER>"
    # proxyPass = "<PROXY_PASS>"
    # proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    #     "host": proxyHost,
    #     "port": proxyPort,
    #     "user": proxyUser,
    #     "pass": proxyPass,
    # }
    # proxies = {"http": proxyMeta, "https": proxyMeta}
s = requests.Session()
s.keep_alive = False
try:
html = s.post(url, headers=random.choice(headers), data=info, allow_redirects=False)
time.sleep(0.6)
        # crude tag stripping: grab "td>...<" cell fragments from the Ajax HTML
        res = re.findall(r'\btd>.*?<\b', html.text)
        time.sleep(0.2)
        out_list = []
        finals = []
        need = []
        for i in res:
            # strip the leading "td>" and trailing tag fragments
            result = re.sub("td>|</td><| </td></tr><", "", i)
            out_list.append(result)
        for k in out_list:
            # strip any remaining closing-tag debris
            ret = re.sub("</tr></tr></table>|</tr></table></div><|/tr></table><|/tr><|<", "", k)
            finals.append(ret)
        for ite in finals:
            if ite != "":
                need.append(ite)
    except requests.exceptions.ConnectionError:
        print("ConnectionError....")
        time.sleep(60)
        return get_history_tran(house_id, building_code)
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError....")
        return get_history_tran(house_id, building_code)
s.close()
return need
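
# The recursive retry inside get_history_tran() works, but every failure adds a
# stack frame. A minimal iterative alternative (a sketch; `fetch` is any
# zero-argument callable you supply, and the attempt/delay numbers are
# arbitrary defaults):
def retry_with_backoff(fetch, attempts=5, base_delay=1.0):
    """Call fetch() until it succeeds or the attempts are exhausted."""
    for attempt in range(attempts):
        try:
            return fetch()
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError):
            # back off 1s, 2s, 4s, ... plus jitter before the next attempt
            time.sleep(base_delay * (2 ** attempt) + random.random())
    raise RuntimeError("all retry attempts failed")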


def data_processing(obj):
    """Reshape the scraped strings into per-transaction records."""
final = []
res = []
lis = []
    for k in obj:
        # drop fields whose second-to-last character is "实" (likely
        # saleable-area labels mixed into the scrape)
        if len(k) <= 2 or k[-2] != "实":
            lis.append(k)
    if not lis:
        return final  # nothing usable was scraped
    # price fields start with "@"; if the list does not end with one, pad it
    # so the last record is complete
    if lis[-1][0] != "@":
        lis.append("----")
    # fields at index 5, 8, 11, ... are expected to be "@" prices; insert a
    # "----" placeholder wherever one is missing
    start = 5
    while start < len(lis):
        if lis[start][0] != "@" and lis[start] != "----":
            lis.insert(start, "----")
        start += 3
    # the first three fields describe the unit; each later triple is one transaction
    num = (len(lis) - 3) // 3
    index = 0
    while index < num:
        res.extend(lis[:3])  # unit descriptor
        res.extend(lis[3 + 3 * index:6 + 3 * index])  # one transaction triple
        li = res.copy()
        final.append(li)
        res.clear()
        index += 1
return final
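
# Illustration (hypothetical field values): given scraped fields like
#   ["1座", "10楼", "A室", "2018-11-01", "$800万", "@15,000",
#    "2016-03-15", "$650万", "@13,000"]
# data_processing() pairs the three-field unit header with each transaction
# triple, yielding
#   ["1座", "10楼", "A室", "2018-11-01", "$800万", "@15,000"]
#   ["1座", "10楼", "A室", "2016-03-15", "$650万", "@13,000"]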


def run(urlss):
    result_1 = []
    building_code = get_building_code(urlss)  # building codes for the estate
    for build in building_code:
        house_id = get_house_id(build)  # unit IDs within this building
        for h in house_id:
            raw_data = get_history_tran(h, build)
            data = data_processing(raw_data)
            for item in data:
                print(item)
            result_1.extend(data)
    return result_1


if __name__ == "__main__":
    urls = "http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=3&code=XSHNIHZXHN&info=basicinfo"
    res_new_4 = run(urls)
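
    # To persist the scraped records, a CSV export is one option (a sketch;
    # the output filename "transactions.csv" is arbitrary):
    import csv
    with open("transactions.csv", "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(res_new_4)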