I. Project Goal:

The main goal of this project is to collect the historical transaction records of every housing estate in four districts of Hong Kong; the district listing pages are enumerated in the url_set list in the source code in section III.

II. Approach:

  1. From the district listing pages (the url_set list in section III), collect the ID of each estate to crawl (the estate code).
  2. From each estate's page, collect the ID of every unit in that estate (the unit ID).
  3. From the estate code and the unit ID, build a new URL and send a POST request to fetch the raw HTML served by the Ajax endpoint (see the sketch after this list).
  4. Parse the returned HTML into relational records with BeautifulSoup and regular expressions.
  5. Finally, assemble all results into a two-dimensional list and write them out as a CSV file.
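
The heart of steps 3 and 4 is one POST to the site's Ajax service followed by a regex pass over the HTML fragment it returns. Below is a minimal, self-contained sketch of that request: the endpoint and the payload fields (acode, cblgcode, cci_price, cultureInfo) are the ones used in the full source in section III, while the function name, the simplified capture regex, and the sample IDs are illustrative only.

import re
import requests

AJAX_URL = ("http://www1.centadata.com/tfs_centadata/Pih2Sln/"
            "Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo")

def fetch_unit_history(house_id, building_code):
    # Payload fields as used by get_history_tran() in section III.
    payload = {
        "acode": house_id,          # unit ID
        "cblgcode": building_code,  # estate/block code
        "cci_price": 4,
        "cultureInfo": "SimplifiedChinese",
    }
    resp = requests.post(AJAX_URL, data=payload, timeout=10)
    # The response embeds an HTML table; pull the text of each <td> cell.
    return re.findall(r"<td[^>]*>(.*?)</td>", resp.text)

# Hypothetical IDs, for illustration only:
# cells = fetch_unit_history("SOME_UNIT_ID", "SOMEESTCODE")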

III. Project Source Code:

# -*- coding: utf-8 -*-
# @Time : 2/23/19 4:35 PM
# @Author : Wai Mengxin
# @Email : weimengxin2012@hotmail.com
# @File : HK_xinjiedong.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import re
import time
import random
def get_building_code(url):
    '''Collect the block (building) codes from an estate's transaction-history page.'''
    res = []
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs = BeautifulSoup(html, 'lxml')
    ob = bs.find("div", id="unitTran-left").find_all("a", href=True)
    for k in ob:
        code = k.attrs["href"]
        res.append(code[-10:])  # the block code is the last 10 characters of the href
    s.close()
    return res
def get_house_id(building_acode):
    '''Collect the unit (house) IDs under one block code.'''
    house_id = []
    res = []
    url = 'http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=1&code=' + building_acode
    print(url)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs4 = BeautifulSoup(html, 'lxml')
    ob = bs4.find_all("table", class_="unitTran-sub-table")
    for i in ob:
        house_num = i.find_all("tr", class_="trHasTrans")  # keep only rows that have transactions
        house_id.extend(house_num)
    for it in house_id:
        res.append(it.attrs["id"])
    s.close()
    return res
def get_history_tran(house_id, building_code):
    '''Main routine: fetch the transaction history of one unit.'''
    url = "http://www1.centadata.com/tfs_centadata/Pih2Sln/Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo"
    # Pool of request headers; one is chosen at random per request to make blocking less likely.
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    info = {
        "acode": house_id,
        "cblgcode": building_code,
        "cci_price": 4,
        "cultureInfo": "SimplifiedChinese"
    }
    s = requests.Session()
    s.keep_alive = False
    try:
        html = s.post(url, headers=random.choice(headers), data=info, allow_redirects=False)
        time.sleep(0.15)
        res = re.findall(r'\btd>.*?<\b', html.text)  # grab the <td>...</td> cell fragments
        time.sleep(0.1)
        out_list = []
        finals = []
        need = []
        # Strip the remaining tag fragments from each cell, then drop empties.
        for i in res:
            result = re.sub("td>|</td><| </td></tr><", "", i)
            out_list.append(result)
        for k in out_list:
            ret = re.sub("</tr></tr></table>|</tr></table></div><|/tr></table><|/tr><|<", "", k)
            finals.append(ret)
        for ite in finals:
            if ite != "":
                need.append(ite)
    except requests.exceptions.ConnectionError:
        print("ConnectionError....")
        time.sleep(21)  # back off, then retry
        return get_history_tran(house_id, building_code)
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError!")
        return get_history_tran(house_id, building_code)
    except TimeoutError:
        print("TimeoutError!")
        time.sleep(5)
        return get_history_tran(house_id, building_code)
    s.close()
    return need
def data_processing(obj):
    '''Reshape the flat cell list into one row per transaction.'''
    final = []
    res = []
    lis = []
    for k in obj:
        # Skip label cells (those whose second-to-last character is "实",
        # e.g. saleable-area captions); keep the data cells.
        if len(k) <= 2 or k[-2] != "实":
            lis.append(k)
    # Cells beginning with "@" appear as the third cell of each transaction
    # triple; where one is missing, insert a "----" placeholder so the
    # triples stay aligned.
    if lis and lis[-1][0] != "@":  # guard against an empty cell list
        lis.append("----")
    start = 5
    t = 0
    while start < len(lis):
        if lis[start][0] != "@" and lis[start] != "----":
            lis.insert(start, "----")
            t += 1
        start += 3
    # The first three cells describe the unit itself; every following group
    # of three cells is one transaction.
    num = (len(lis) - 3) / 3
    index = 0
    while index < num:
        res.extend(lis[:3])                           # unit header
        res.extend(lis[3 + 3 * index:6 + 3 * index])  # one transaction
        li = res.copy()
        final.append(li)
        res.clear()
        index += 1
    return final
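
# Shape note (inferred from the loop above): each output row is the unit's
# three header cells followed by one transaction triple, i.e.
#     [header1, header2, header3, tran1, tran2, tran3]
# so a unit with N transactions yields N rows sharing the same header.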
def run(urlss):
    '''Crawl one estate page end-to-end and return its transaction rows.'''
    result_1 = []
    building_code = get_building_code(urlss)  # block codes on this page
    for build in building_code:
        house_id = get_house_id(build)  # unit IDs under this block
        for h in house_id:
            raw_data = get_history_tran(h, build)
            data = data_processing(raw_data)
            for item in data:
                print(item)
            result_1.extend(data)
    return result_1
def output_data(all_data, output_file_name):
    '''Write the final data out as a CSV file.
    Note: redefined further below with a different output directory.'''
    import pandas as pd
    # name = ["building_name", "net_area", "floor_area", "time", "price", "area_price"]
    table = pd.DataFrame(data=all_data)
    table.to_csv("/usr/local/python_files/" + output_file_name + ".csv")
    return table
def get_obj_set(url):
    '''Collect the estate codes linked from one district listing page.'''
    sets = []
    # url = "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=101&info=&code2=&page=0"
    header = {'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
              'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    e = s.get(url, headers=header).text
    html = BeautifulSoup(e, 'lxml')
    obj = html.find_all("td", class_="tdscp1tr")
    for i in obj:
        try:
            h = i.find("a")["href"]
            start = h.find("'")
            res = h[start + 1:start + 11]  # the 10-character estate code inside the href
            sets.append(res)
            print(res)
        except TypeError:
            print("TypeError & PASS")  # cell without a link; skip it
    return sets
# Estate codes to exclude from the crawl.
except_code_xinjie = ["XSHNIHSXHT", "DBPPWPPJPW", "XSHNIHZXHN", "GYWKPPKYPS", "GYSEBPYEPK",
                      "BYSPWPDOPA", "DBSPWWPXWD", "DFDJURSYRV", "GEPPWPPJPY", "GYGGGPGXPB",
                      "GYYYPPGOPS", "VJOVQRSERQ", "XSHMTHNOHT", "BDAAGPYXPA", "XSHSTHZHHT",
                      "DBPPWPPSPD", "DEPPWPPSPE", "XSSZZHNSHT", "BDBAGPEHPA", "GWPGGPAEPS",
                      "VOOFFRFARR", "LIDHTHXXHT", "WBPPWPPEPP", "WDPPWPPEPB", "AEWPPPSOPW",
                      "BSPPGPSEPP", "BEPPWPPAPK", "WBPPWPPRPP", "VDORQRVYRU", "NTHHIHZEHH",
                      "KKPPWPPEPS", "DMXSZHLXHD", "WBPPWPPHPP", "BSPPWPPJPP", "BEPPWPPHPW",
                      "BGSSBPAXPS", "AEPPWPPAPG", "AEBKPPSJPK", "AEPPWPPRPK", "BEPPWWPOWD"]
# District listing pages to crawl, one entry per result page.
url_set = ["http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=208&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=208&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=301&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=301&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=302&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=302&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=303&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=303&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=304&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=304&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=306&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=306&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=103&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=309&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=4",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=5",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=6",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=7",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=8",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=9",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=10",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=4",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=5",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=6",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=403&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=4",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=406&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=406&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=407&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=408&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=408&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=409&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=409&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=409&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=410&info=&code2=&page=0"]
# Crawl every listing page, collecting estate codes.
build_code = []
for i in url_set:
    res = get_obj_set(i)
    time.sleep(0.3)
    build_code.extend(res)

# Crawl each estate (resuming partway through the code list here).
final_0 = []
for i in build_code[145:]:
    if i not in except_code_xinjie:
        print(i)
        try:
            urls = "http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=3&code=" + i + "&info=basicinfo"
            # urls = "http://hk.centadata.com/transactionhistory.aspx?type=1&code=" + i + "&ci=zh-hk"
            res_1 = run(urls)
            final_0.append(res_1)
        except AttributeError:
            print("AttributeError!")
            final_0.append(i)  # keep the failed estate code itself so it can be retried
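
# The failed estate codes sit in final_0 as bare strings, so a retry pass
# can be run afterwards. A minimal sketch (illustrative, not part of the
# original run):
#   for code in [c for c in final_0 if not isinstance(c, list)]:
#       urls = ("http://www1.centadata.com/tfs_centadata/Pih2Sln/"
#               "TransactionHistory.aspx?type=3&code=" + code + "&info=basicinfo")
#       final_0.append(run(urls))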
def read_csv(name):
    '''Read rows back from a CSV file.'''
    import csv
    csv_file = csv.reader(open("/Users/viemaxwei/Downloads/" + name + ".csv", "r", encoding="utf8"))
    object_website = []
    for i in csv_file:
        object_website.append(i)
    return object_website
# Keep only the per-estate result lists (failed codes were stored as strings).
data = []
for i in final_0:
    if isinstance(i, list):
        data.extend(i)
def output_data(all_data, output_file_name):
    '''Write the final data out as a CSV file.'''
    import pandas as pd
    table = pd.DataFrame(data=all_data)
    table.to_csv("/Users/viemaxwei/Downloads/" + output_file_name + ".csv")
    return table
results = output_data(data, "港岛")  # write out the final results