I. Project Goal:

The main goal of this project is to collect the historical transaction records of every housing estate in four districts of Hong Kong; the district listing pages are enumerated in the url_set list in the source code in section III.

II. Approach:

  1. From the district listing pages (the url_set list in section III), collect the ID of each estate to crawl (the estate code).
  2. From each estate's page, collect the ID of every unit in that estate (the unit ID).
  3. From the estate code and the unit ID, build a new URL and send a POST request to fetch the raw HTML served by the Ajax endpoint (see the sketch after this list).
  4. Parse the returned HTML into relational records with BeautifulSoup and regular expressions.
  5. Finally, assemble all results into a two-dimensional list and write them out as a CSV file.
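
The heart of steps 3 and 4 is one POST to the site's Ajax service followed by a regex pass over the HTML fragment it returns. Below is a minimal, self-contained sketch of that request: the endpoint and the payload fields (acode, cblgcode, cci_price, cultureInfo) are the ones used in the full source in section III, while the function name, the simplified capture regex, and the sample IDs are illustrative only.

import re
import requests

AJAX_URL = ("http://www1.centadata.com/tfs_centadata/Pih2Sln/"
            "Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo")

def fetch_unit_history(house_id, building_code):
    # Payload fields as used by get_history_tran() in section III.
    payload = {
        "acode": house_id,          # unit ID
        "cblgcode": building_code,  # estate/block code
        "cci_price": 4,
        "cultureInfo": "SimplifiedChinese",
    }
    resp = requests.post(AJAX_URL, data=payload, timeout=10)
    # The response embeds an HTML table; pull the text of each <td> cell.
    return re.findall(r"<td[^>]*>(.*?)</td>", resp.text)

# Hypothetical IDs, for illustration only:
# cells = fetch_unit_history("SOME_UNIT_ID", "SOMEESTCODE")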

III. Project Source Code:

# -*- coding: utf-8 -*-
# @Time : 2/23/19 4:35 PM
# @Author : Wai Mengxin
# @Email : weimengxin2012@hotmail.com
# @File : HK_xinjiedong.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import re
import time
import random
def get_building_code(url):
    '''Collect the block (building) codes from an estate's transaction-history page.'''
    res = []
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs = BeautifulSoup(html, 'lxml')
    ob = bs.find("div", id="unitTran-left").find_all("a", href=True)
    for k in ob:
        code = k.attrs["href"]
        res.append(code[-10:])  # the block code is the last 10 characters of the href
    s.close()
    return res
def get_house_id(building_acode):
    '''Collect the unit (house) IDs under one block code.'''
    house_id = []
    res = []
    url = 'http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=1&code=' + building_acode
    print(url)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs4 = BeautifulSoup(html, 'lxml')
    ob = bs4.find_all("table", class_="unitTran-sub-table")
    for i in ob:
        house_num = i.find_all("tr", class_="trHasTrans")  # keep only rows that have transactions
        house_id.extend(house_num)
    for it in house_id:
        res.append(it.attrs["id"])
    s.close()
    return res
def get_history_tran(house_id, building_code):
    '''Main routine: fetch the transaction history of one unit.'''
    url = "http://www1.centadata.com/tfs_centadata/Pih2Sln/Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo"
    # Pool of request headers; one is chosen at random per request to make blocking less likely.
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    info = {
        "acode": house_id,
        "cblgcode": building_code,
        "cci_price": 4,
        "cultureInfo": "SimplifiedChinese"
    }
    s = requests.Session()
    s.keep_alive = False
    try:
        html = s.post(url, headers=random.choice(headers), data=info, allow_redirects=False)
        time.sleep(0.15)
        res = re.findall(r'\btd>.*?<\b', html.text)  # grab the <td>...</td> cell fragments
        time.sleep(0.1)
        out_list = []
        finals = []
        need = []
        # Strip the remaining tag fragments from each cell, then drop empties.
        for i in res:
            result = re.sub("td>|</td><| </td></tr><", "", i)
            out_list.append(result)
        for k in out_list:
            ret = re.sub("</tr></tr></table>|</tr></table></div><|/tr></table><|/tr><|<", "", k)
            finals.append(ret)
        for ite in finals:
            if ite != "":
                need.append(ite)
    except requests.exceptions.ConnectionError:
        print("ConnectionError....")
        time.sleep(21)  # back off, then retry
        return get_history_tran(house_id, building_code)
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError!")
        return get_history_tran(house_id, building_code)
    except TimeoutError:
        print("TimeoutError!")
        time.sleep(5)
        return get_history_tran(house_id, building_code)
    s.close()
    return need
def data_processing(obj):
    '''Reshape the flat cell list into one row per transaction.'''
    final = []
    res = []
    lis = []
    for k in obj:
        # Skip label cells (those whose second-to-last character is "实",
        # e.g. saleable-area captions); keep the data cells.
        if len(k) <= 2 or k[-2] != "实":
            lis.append(k)
    # Cells beginning with "@" appear as the third cell of each transaction
    # triple; where one is missing, insert a "----" placeholder so the
    # triples stay aligned.
    if lis and lis[-1][0] != "@":  # guard against an empty cell list
        lis.append("----")
    start = 5
    t = 0
    while start < len(lis):
        if lis[start][0] != "@" and lis[start] != "----":
            lis.insert(start, "----")
            t += 1
        start += 3
    # The first three cells describe the unit itself; every following group
    # of three cells is one transaction.
    num = (len(lis) - 3) / 3
    index = 0
    while index < num:
        res.extend(lis[:3])                           # unit header
        res.extend(lis[3 + 3 * index:6 + 3 * index])  # one transaction
        li = res.copy()
        final.append(li)
        res.clear()
        index += 1
    return final
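
# Shape note (inferred from the loop above): each output row is the unit's
# three header cells followed by one transaction triple, i.e.
#     [header1, header2, header3, tran1, tran2, tran3]
# so a unit with N transactions yields N rows sharing the same header.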
def run(urlss):
    '''Crawl one estate page end-to-end and return its transaction rows.'''
    result_1 = []
    building_code = get_building_code(urlss)  # block codes on this page
    for build in building_code:
        house_id = get_house_id(build)  # unit IDs under this block
        for h in house_id:
            raw_data = get_history_tran(h, build)
            data = data_processing(raw_data)
            for item in data:
                print(item)
            result_1.extend(data)
    return result_1
def output_data(all_data, output_file_name):
    '''Write the final data out as a CSV file.
    Note: redefined further below with a different output directory.'''
    import pandas as pd
    # name = ["building_name", "net_area", "floor_area", "time", "price", "area_price"]
    table = pd.DataFrame(data=all_data)
    table.to_csv("/usr/local/python_files/" + output_file_name + ".csv")
    return table
def get_obj_set(url):
    '''Collect the estate codes linked from one district listing page.'''
    sets = []
    # url = "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=101&info=&code2=&page=0"
    header = {'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
              'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    e = s.get(url, headers=header).text
    html = BeautifulSoup(e, 'lxml')
    obj = html.find_all("td", class_="tdscp1tr")
    for i in obj:
        try:
            h = i.find("a")["href"]
            start = h.find("'")
            res = h[start + 1:start + 11]  # the 10-character estate code inside the href
            sets.append(res)
            print(res)
        except TypeError:
            print("TypeError & PASS")  # cell without a link; skip it
    return sets
# Estate codes to exclude from the crawl.
except_code_xinjie = ["XSHNIHSXHT", "DBPPWPPJPW", "XSHNIHZXHN", "GYWKPPKYPS", "GYSEBPYEPK",
                      "BYSPWPDOPA", "DBSPWWPXWD", "DFDJURSYRV", "GEPPWPPJPY", "GYGGGPGXPB",
                      "GYYYPPGOPS", "VJOVQRSERQ", "XSHMTHNOHT", "BDAAGPYXPA", "XSHSTHZHHT",
                      "DBPPWPPSPD", "DEPPWPPSPE", "XSSZZHNSHT", "BDBAGPEHPA", "GWPGGPAEPS",
                      "VOOFFRFARR", "LIDHTHXXHT", "WBPPWPPEPP", "WDPPWPPEPB", "AEWPPPSOPW",
                      "BSPPGPSEPP", "BEPPWPPAPK", "WBPPWPPRPP", "VDORQRVYRU", "NTHHIHZEHH",
                      "KKPPWPPEPS", "DMXSZHLXHD", "WBPPWPPHPP", "BSPPWPPJPP", "BEPPWPPHPW",
                      "BGSSBPAXPS", "AEPPWPPAPG", "AEBKPPSJPK", "AEPPWPPRPK", "BEPPWWPOWD"]
# District listing pages to crawl, one entry per result page.
url_set = ["http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=208&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=208&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=301&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=301&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=302&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=302&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=303&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=303&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=304&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=304&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=306&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=306&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=307&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=308&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=103&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=309&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=4",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=5",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=6",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=7",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=8",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=9",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=401&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=10",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=4",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=5",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=402&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=6",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=403&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=404&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=4",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=405&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=3",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=406&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=406&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=407&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=408&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=408&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=409&info=&code2=&page=0",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=409&info=&code2=lord:~lordtype:~tabIdx:0&page=1",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=409&info=&code2=lord:namec~lordtype:desc~tabIdx:0&page=2",
           "http://www1.centadata.com/paddresssearch1.aspx?type=district17&code=410&info=&code2=&page=0"]
# Crawl every listing page, collecting estate codes.
build_code = []
for i in url_set:
    res = get_obj_set(i)
    time.sleep(0.3)
    build_code.extend(res)

# Crawl each estate (resuming partway through the code list here).
final_0 = []
for i in build_code[145:]:
    if i not in except_code_xinjie:
        print(i)
        try:
            urls = "http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=3&code=" + i + "&info=basicinfo"
            # urls = "http://hk.centadata.com/transactionhistory.aspx?type=1&code=" + i + "&ci=zh-hk"
            res_1 = run(urls)
            final_0.append(res_1)
        except AttributeError:
            print("AttributeError!")
            final_0.append(i)  # keep the failed estate code itself so it can be retried
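
# The failed estate codes sit in final_0 as bare strings, so a retry pass
# can be run afterwards. A minimal sketch (illustrative, not part of the
# original run):
#   for code in [c for c in final_0 if not isinstance(c, list)]:
#       urls = ("http://www1.centadata.com/tfs_centadata/Pih2Sln/"
#               "TransactionHistory.aspx?type=3&code=" + code + "&info=basicinfo")
#       final_0.append(run(urls))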
def read_csv(name):
    '''Read rows back from a CSV file.'''
    import csv
    csv_file = csv.reader(open("/Users/viemaxwei/Downloads/" + name + ".csv", "r", encoding="utf8"))
    object_website = []
    for i in csv_file:
        object_website.append(i)
    return object_website
# Keep only the per-estate result lists (failed codes were stored as strings).
data = []
for i in final_0:
    if isinstance(i, list):
        data.extend(i)
def output_data(all_data, output_file_name):
    '''Write the final data out as a CSV file.'''
    import pandas as pd
    table = pd.DataFrame(data=all_data)
    table.to_csv("/Users/viemaxwei/Downloads/" + output_file_name + ".csv")
    return table
results = output_data(data, "港岛")  # write out the final results