一、项目目的:
该项目的主要目的是获取香港以下4区所有楼盘的历史成交数据:
- 港岛区:http://www1.centadata.com/paddresssearch1.aspx?type=district16&code=HK
- 九龙区:http://www1.centadata.com/paddresssearch1.aspx?type=district16&code=KL
- 新界东:http://www1.centadata.com/paddresssearch1.aspx?type=district16&code=NE
- 新界西:http://www1.centadata.com/paddresssearch1.aspx?type=district16&code=NW
二、项目思路:
- 根据以上网址获取需要爬取的楼盘的id号(楼盘id)
- 结合楼盘网址,获取该楼盘每个房间的id号(房间id)
- 根据楼盘id和房间id,构造新的URL地址,发送Post请求,获取Ajax提供的源代码数据
- 通过BeautifulSoup、正则表达式,将获取的html源代码数据解析为关系型数据
- 最后,构造二维list将所有结果输出成CSV文件
三、项目源代码:
# -*- coding: utf-8 -*-
# @Time    : 2/23/19 4:35 PM
# @Author  : Wai Mengxin
# @Email   : weimengxin2012@hotmail.com
# @File    : HK_xinjiedong.py
# @Software: PyCharm
"""Scrape historical property-transaction records from centadata.com.

Pipeline:
  1. ``get_obj_set``       - building codes from the district listing pages
  2. ``get_building_code`` - block codes of one building
  3. ``get_house_id``      - unit (room) ids inside one block
  4. ``get_history_tran``  - raw transaction cells via the Ajax endpoint
  5. ``data_processing``   - reshape cells into rows; ``output_data`` - CSV

``requests``/``bs4``/``pandas`` are imported lazily inside the network/IO
functions (matching the original file's local-import style) so the pure
parsing helpers stay importable without those dependencies installed.
"""
import random
import re
import time


def get_building_code(url):
    """Return the 10-character block codes linked from a building's page.

    Parses the anchors inside the ``unitTran-left`` div; the code is the
    last 10 characters of each href.
    """
    import requests
    from bs4 import BeautifulSoup

    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close',
    }
    session = requests.Session()
    session.keep_alive = False
    try:
        html = session.get(url, headers=head).text
    finally:
        session.close()  # close even when the GET raises (original leaked here)
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.find("div", id="unitTran-left").find_all("a", href=True)
    return [a.attrs["href"][-10:] for a in anchors]


def get_house_id(building_acode):
    """Return the ids of every unit row with transactions in *building_acode*."""
    import requests
    from bs4 import BeautifulSoup

    url = ('http://www1.centadata.com/tfs_centadata/Pih2Sln/'
           'TransactionHistory.aspx?type=1&code=') + building_acode
    print(url)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close',
    }
    session = requests.Session()
    session.keep_alive = False
    try:
        html = session.get(url, headers=head).text
    finally:
        session.close()
    soup = BeautifulSoup(html, 'lxml')
    rows = []
    for table in soup.find_all("table", class_="unitTran-sub-table"):
        rows.extend(table.find_all("tr", class_="trHasTrans"))
    return [row.attrs["id"] for row in rows]


# Rotating header pool for the Ajax POST (hoisted out of get_history_tran so
# it is built once, not on every call).
_HEADER_POOL = [
    {'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
     'Accept': 'text/html;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Connection': 'close'},
    {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
     'Accept': 'text/html;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Connection': 'close'},
    {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
     'Accept': 'text/html;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Connection': 'close'},
    {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
     'Accept': 'application/json, text/plain, */*',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Connection': 'close'},
    {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
     'Accept': 'application/json, text/plain, */*',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Connection': 'close'},
]

_AJAX_URL = ("http://www1.centadata.com/tfs_centadata/Pih2Sln/Ajax/"
             "AjaxServices.asmx/GenTransactionHistoryPinfo")


def get_history_tran(house_id, building_code):
    """POST the transaction-history Ajax endpoint and return cleaned cells.

    Retries recursively on connection / chunked-encoding / timeout errors
    (sleeping first), exactly like the original.  NOTE(review): the retry has
    no attempt cap, so a permanently dead endpoint recurses forever.
    """
    import requests

    info = {"acode": house_id,
            "cblgcode": building_code,
            "cci_price": 4,
            "cultureInfo": "SimplifiedChinese"}
    session = requests.Session()
    session.keep_alive = False
    try:
        resp = session.post(_AJAX_URL, headers=random.choice(_HEADER_POOL),
                            data=info, allow_redirects=False)
        time.sleep(0.15)
        cells = re.findall(r'\btd>.*?<\b', resp.text)
        time.sleep(0.1)
        need = []
        for raw in cells:
            # Two sequential scrubs strip the td/tr/table markup fragments.
            stripped = re.sub("td>|</td><| </td></tr><", "", raw)
            cleaned = re.sub("</tr></tr></table>|</tr></table></div><|/tr></table><|/tr><|<", "", stripped)
            if cleaned != "":
                need.append(cleaned)
    except requests.exceptions.ConnectionError:
        print("ConnectionError....")
        time.sleep(21)
        return get_history_tran(house_id, building_code)
    except requests.exceptions.ChunkedEncodingError:
        print("ProxyError!")
        return get_history_tran(house_id, building_code)
    except TimeoutError:
        print("TimeoutError!")
        time.sleep(5)
        return get_history_tran(house_id, building_code)
    finally:
        session.close()  # original leaked the session on every retry path
    return need


def data_processing(obj):
    """Reshape raw cell strings into rows of [3 header cells + 3 txn cells].

    Returns ``[]`` for empty/unusable input — the original raised IndexError
    on ``lis[-1]`` here (its commented-out ``except IndexError`` shows the
    author hit exactly that crash).
    """
    # Drop cells whose second-to-last char is "实" (actual-area summary cells).
    lis = [k for k in obj if len(k) <= 2 or k[-2] != "实"]
    if not lis:
        return []
    # Ensure the trailing record is terminated.
    if lis[-1][0] != "@":
        lis.append("----")
    # Pad every third slot (from index 5) so each record is 3 cells wide.
    start = 5
    while start < len(lis):
        if lis[start][0] != "@" and lis[start] != "----":
            lis.insert(start, "----")
        start += 3
    # Emit one row per 3-cell group after the 3-cell header; the float
    # division is deliberate — a trailing partial group still emits a row.
    num = (len(lis) - 3) / 3
    final = []
    index = 0
    while index < num:
        final.append(lis[:3] + lis[3 + 3 * index:6 + 3 * index])
        index += 1
    return final


def run(urlss):
    """Collect processed transaction rows for every unit of every block at *urlss*."""
    result_1 = []
    for build in get_building_code(urlss):        # block codes of the building
        for house in get_house_id(build):         # unit ids of the block
            raw_data = get_history_tran(house, build)
            data = data_processing(raw_data)
            for item in data:
                print(item)
            result_1.extend(data)
    return result_1


def output_data(all_data, output_file_name):
    """Write *all_data* to <Downloads>/<output_file_name>.csv; return the DataFrame.

    The original defined this function twice; the first (shadowed) copy also
    concatenated a directory without a path separator — both removed.
    """
    import pandas as pd

    table = pd.DataFrame(data=all_data)
    table.to_csv("/Users/viemaxwei/Downloads/" + output_file_name + ".csv")
    return table


def get_obj_set(url):
    """Return the building codes listed on one district search page.

    The code is the 10 characters after the first quote in each anchor's
    href; cells without an anchor raise TypeError and are skipped.
    """
    import requests
    from bs4 import BeautifulSoup

    header = {
        'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        'Connection': 'close',
    }
    session = requests.Session()
    session.keep_alive = False
    try:
        page = session.get(url, headers=header).text
    finally:
        session.close()  # original never closed this session
    sets = []
    html = BeautifulSoup(page, 'lxml')
    for cell in html.find_all("td", class_="tdscp1tr"):
        try:
            href = cell.find("a")["href"]
            start = href.find("'")
            code = href[start + 1:start + 11]
            sets.append(code)
            print(code)
        except TypeError:
            print("TypeError & PASS")
    return sets


def read_csv(name):
    """Read <Downloads>/<name>.csv into a list of row lists."""
    import csv

    # with-block closes the handle; the original leaked the open file.
    with open("/Users/viemaxwei/Downloads/" + name + ".csv", "r",
              encoding="utf8") as fh:
        return [row for row in csv.reader(fh)]


# Building codes known to be bad/unwanted for the 新界东 run — skipped below.
except_code_xinjie = [
    "XSHNIHSXHT", "DBPPWPPJPW", "XSHNIHZXHN", "GYWKPPKYPS", "GYSEBPYEPK",
    "BYSPWPDOPA", "DBSPWWPXWD", "DFDJURSYRV", "GEPPWPPJPY", "GYGGGPGXPB",
    "GYYYPPGOPS", "VJOVQRSERQ", "XSHMTHNOHT", "BDAAGPYXPA", "XSHSTHZHHT",
    "DBPPWPPSPD", "DEPPWPPSPE", "XSSZZHNSHT", "BDBAGPEHPA", "GWPGGPAEPS",
    "VOOFFRFARR", "LIDHTHXXHT", "WBPPWPPEPP", "WDPPWPPEPB", "AEWPPPSOPW",
    "BSPPGPSEPP", "BEPPWPPAPK", "WBPPWPPRPP", "VDORQRVYRU", "NTHHIHZEHH",
    "KKPPWPPEPS", "DMXSZHLXHD", "WBPPWPPHPP", "BSPPWPPJPP", "BEPPWPPHPW",
    "BGSSBPAXPS", "AEPPWPPAPG", "AEBKPPSJPK", "AEPPWPPRPK", "BEPPWWPOWD",
]

# (district code, number of listing pages); expanded into url_set below.
# The original spelled out all 59 URLs as literals.
_DISTRICT_PAGES = [
    ("208", 2), ("301", 2), ("302", 2), ("303", 2), ("304", 2), ("306", 2),
    ("307", 4), ("308", 4), ("103", 1), ("309", 1), ("401", 11), ("402", 7),
    ("403", 1), ("404", 5), ("405", 4), ("406", 2), ("407", 1), ("408", 2),
    ("409", 3), ("410", 1),
]


def _listing_url(code, page):
    """Build one district listing URL; the code2 payload depends on the page."""
    if page == 0:
        code2 = ""
    elif page == 1:
        code2 = "lord:~lordtype:~tabIdx:0"
    else:
        code2 = "lord:namec~lordtype:desc~tabIdx:0"
    return ("http://www1.centadata.com/paddresssearch1.aspx?type=district17"
            "&code=" + code + "&info=&code2=" + code2 + "&page=" + str(page))


url_set = [_listing_url(code, page)
           for code, pages in _DISTRICT_PAGES
           for page in range(pages)]


def main():
    """Crawl every listing page, skip excluded codes, and dump one CSV."""
    build_code = []
    for listing in url_set:
        codes = get_obj_set(listing)
        time.sleep(0.3)  # be polite to the server
        build_code.extend(codes)

    final_0 = []
    for code in build_code[145:]:  # [145:] = resume point of a partial run
        if code in except_code_xinjie:
            continue
        print(code)
        try:
            urls = ("http://www1.centadata.com/tfs_centadata/Pih2Sln/"
                    "TransactionHistory.aspx?type=3&code=" + code
                    + "&info=basicinfo")
            final_0.append(run(urls))
        except AttributeError:
            print("AttributeError!")
            final_0.append(code)  # keep the failing code as a marker entry

    data = []
    # Fix: the original iterated the undefined name `final_600` (NameError);
    # the list actually populated above is final_0.
    for item in final_0:
        if isinstance(item, list):
            data.extend(item)
    return output_data(data, "港岛")


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for its parsers) does not scrape.
    results = main()
