# -*- coding: utf-8 -*-
# @Time    : 12/6/18 1:11 AM
# @Author  : Wai Mengxin
# @Email   : weimengxin2012@hotmail.com
# @File    : estate.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import re
import time
import random


def get_building_code(url):
    '''Fetch the building codes linked from an estate's transaction page.'''
    res = []
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs = BeautifulSoup(html, 'lxml')
    ob = bs.find("div", id="unitTran-left").find_all("a", href=True)
    for k in ob:
        code = k.attrs["href"]
        res.append(code[-10:])  # the building code is the last 10 characters of the href
    s.close()
    return res


def get_house_id(building_acode):
    '''Fetch the house IDs listed under one building code.'''
    house_id = []
    res = []
    url = 'http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=1&code=' + building_acode
    print(url)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs4 = BeautifulSoup(html, 'lxml')
    ob = bs4.find_all("table", class_="unitTran-sub-table")
    for i in ob:
        house_num = i.find_all("tr", class_="trHasTrans")
        house_id.extend(house_num)
    for it in house_id:
        res.append(it.attrs["id"])  # each row's id attribute is the house's acode
    s.close()
    return res


def get_history_tran(house_id, building_code):
    '''Fetch the transaction history of one house via the site's AJAX endpoint.'''
    url = "http://www1.centadata.com/tfs_centadata/Pih2Sln/Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo"
    # Rotate among several realistic header sets to reduce the chance of being blocked.
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    info = {
        "acode": house_id,
        "cblgcode": building_code,
        "cci_price": 4,
        "cultureInfo": "SimplifiedChinese"
    }
    s = requests.Session()
    s.keep_alive = False
    try:
        html = s.post(url, headers=random.choice(headers), data=info, allow_redirects=False)
        time.sleep(0.6)
        res = re.findall(r'\btd>.*?<\b', html.text)
        time.sleep(0.2)
        out_list = []
        finals = []
        need = []
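        # The endpoint returns an HTML fragment rather than JSON; the two
        # re.sub passes below strip the leftover <td>/<tr>/<table> tag
        # fragments so that only the cell text survives.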
        for i in res:
            result = re.sub("td>|</td><| </td></tr><", "", i)
            out_list.append(result)
        for k in out_list:
            ret = re.sub("</tr></tr></table>|</tr></table></div><|/tr></table><|/tr><|<", "", k)
            finals.append(ret)
        for ite in finals:
            if ite != "":
                need.append(ite)
    except requests.exceptions.ConnectionError:
        print("ConnectionError....")
        time.sleep(60)  # back off for a minute, then retry
        return get_history_tran(house_id, building_code)
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError!")
        return get_history_tran(house_id, building_code)
    s.close()
    return need


def data_processing(obj):
    '''Reshape the flat list of scraped fields into one row per transaction.'''
    final = []
    res = []
    lis = []
    for k in obj:
        # Skip fields whose second-to-last character is "实" (saleable-area entries).
        if len(k) <= 2 or k[-2] != "实":
            lis.append(k)
    # Pad with a placeholder wherever the trailing field of a record is missing.
    if lis[-1][0] != "@":
        lis.append("----")
    start = 5
    while start < len(lis):
        if lis[start][0] != "@" and lis[start] != "----":
            lis.insert(start, "----")
        start += 3
    # The first three fields describe the house itself; every following
    # group of three fields is one transaction.
    num = (len(lis) - 3) / 3
    index = 0
    while index < num:
        res.extend(lis[:3])
        res.extend(lis[3 + 3 * index:6 + 3 * index])
        li = res.copy()
        final.append(li)
        res.clear()
        index += 1
    return final


def run(urlss):
    result_1 = []
    building_code = get_building_code(urlss)  # building codes of the estate
    for build in building_code:
        house_id = get_house_id(build)  # house IDs within this building
        for h in house_id:
            raw_data = get_history_tran(h, build)
            data = data_processing(raw_data)
            for item in data:
                print(item)
            result_1.extend(data)
    return result_1


urls = "http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=3&code=XSHNIHZXHN&info=basicinfo"
res_new_4 = run(urls)
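
# A minimal sketch of how the scraped rows could be persisted, assuming each
# entry of res_new_4 is the six-field list produced by data_processing().
# The filename "transactions.csv" is illustrative, not part of the original script.
import csv

with open("transactions.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerows(res_new_4)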