# -*- coding: utf-8 -*-
# @Time    : 12/6/18 1:11 AM
# @Author  : Wai Mengxin
# @Email   : weimengxin2012@hotmail.com
# @File    : estate.py
# @Software: PyCharm
import requests
from bs4 import BeautifulSoup
import re
import time
import random


def get_building_code(url):
    '''Get the building codes listed on an estate page.'''
    res = []
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs = BeautifulSoup(html, 'lxml')
    ob = bs.find("div", id="unitTran-left").find_all("a", href=True)
    for k in ob:
        code = k.attrs["href"]
        res.append(code[-10:])  # the building code is the last 10 characters of the link
    s.close()
    return res
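
# Minimal usage sketch (the URL is the same estate page used at the bottom of this
# script; the value shown for codes is illustrative, not real data):
#
#   codes = get_building_code("http://www1.centadata.com/tfs_centadata/Pih2Sln/"
#                             "TransactionHistory.aspx?type=3&code=XSHNIHZXHN&info=basicinfo")
#   # codes is a list of 10-character building codes taken from the unitTran-left links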


def get_house_id(building_acode):
    '''Get the house IDs of one building.'''
    house_id = []
    res = []
    url = 'http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=1&code=' + building_acode
    print(url)
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        'Connection': 'close'}
    s = requests.Session()
    s.keep_alive = False
    html = s.get(url, headers=head).text
    bs4 = BeautifulSoup(html, 'lxml')
    ob = bs4.find_all("table", class_="unitTran-sub-table")
    for i in ob:
        house_num = i.find_all("tr", class_="trHasTrans")
        house_id.extend(house_num)
    for it in house_id:
        res.append(it.attrs["id"])
    s.close()
    return res
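
# Note: each id returned here is the HTML id attribute of a <tr class="trHasTrans">
# row on the building page; get_history_tran() posts it as "acode" to fetch that
# unit's transaction records.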


def get_history_tran(house_id, building_code):
    '''Get the transaction history of one house.'''
    url = "http://www1.centadata.com/tfs_centadata/Pih2Sln/Ajax/AjaxServices.asmx/GenTransactionHistoryPinfo"
    # rotate between a few desktop/mobile header sets to look less like a bot
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    info = {
        "acode": house_id,
        "cblgcode": building_code,
        "cci_price": 4,
        "cultureInfo": "SimplifiedChinese"
    }
    # proxyHost = "http-dyn.abuyun.com"
    # proxyPort = "9020"
    #
    # # proxy tunnel credentials
    # proxyUser = "H8J5995E1K5NF88D"
    # proxyPass = "2E09DAB9F476C071"
    #
    # proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    #     "host": proxyHost,
    #     "port": proxyPort,
    #     "user": proxyUser,
    #     "pass": proxyPass,
    # }
    #
    # proxies = {
    #     "http": proxyMeta,
    #     "https": proxyMeta,
    # }
    s = requests.Session()
    s.keep_alive = False
    try:
        html = s.post(url, headers=random.choice(headers), data=info, allow_redirects=False)
        time.sleep(0.6)
        # bs = BeautifulSoup(html.content, "lxml")
        # pull the raw <td> cell contents out of the returned HTML fragment
        res = re.findall(r'\btd>.*?<\b', html.text)
        time.sleep(0.2)
        out_list = []
        finals = []
        need = []
        for i in res:
            result = re.sub("td>|</td><| </td></tr><", "", i)
            out_list.append(result)
        for k in out_list:
            ret = re.sub("</tr></tr></table>|</tr></table></div><|/tr></table><|/tr><|<", "", k)
            finals.append(ret)
        for ite in finals:
            if ite != "":
                need.append(ite)
    except requests.exceptions.ConnectionError:
        print("ConnectionError....")
        time.sleep(60)
        return get_history_tran(house_id, building_code)
    # except requests.exceptions.ProxyError:
    #     print("ProxyError!")
    #     return get_history_tran(house_id, building_code)
    except requests.exceptions.ChunkedEncodingError:
        print("ChunkedEncodingError, retrying...")
        return get_history_tran(house_id, building_code)
    s.close()
    return need
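
# get_history_tran() hands back a flat list of cleaned cell strings (tag residue
# stripped, empty cells dropped); data_processing() below groups them into rows.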


def data_processing(obj):
    '''Clean up and group the raw transaction strings into rows.'''
    final = []
    res = []
    lis = []
    for k in obj:
        # skip the "actual area" entries (second-to-last character is 实)
        if len(k) <= 2 or k[-2] != "实":
            lis.append(k)
    # try:
    # price fields start with "@"; insert a "----" placeholder wherever one is missing
    if lis[-1][0] != "@":
        lis.append("----")
    else:
        pass
    start = 5
    t = 0
    while start < len(lis):
        if lis[start][0] != "@" and lis[start] != "----":
            lis.insert(start, "----")
            t += 1
        else:
            pass
        start += 3
    # print(lis)
    # except IndexError:
    #     print("IndexError")
    #     return data_processing(obj)
    num = (len(lis) - 3) / 3
    index = 0
    while index < num:
        res.extend(lis[:3])                           # unit-level fields
        res.extend(lis[3 + 3 * index:6 + 3 * index])  # one transaction group
        li = res.copy()
        final.append(li)
        res.clear()
        index += 1
    return final
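
# Each row produced above is the first three cleaned fields (unit-level info)
# followed by one three-field transaction group, with "----" standing in for a
# missing price field; this reading is inferred from the grouping logic above.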


def run(urlss):
    result_1 = []
    building_code = get_building_code(urlss)  # building codes of the estate
    for build in building_code:
        house_id = get_house_id(build)  # house ids of this building
        for h in house_id:
            raw_data = get_history_tran(h, build)
            # print(raw_data)
            # try:
            data = data_processing(raw_data)
            for item in data:
                print(item)
            result_1.extend(data)
    return result_1


urls = "http://www1.centadata.com/tfs_centadata/Pih2Sln/TransactionHistory.aspx?type=3&code=XSHNIHZXHN&info=basicinfo"
res_new_4 = run(urls)
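
# A minimal persistence sketch (assumption: saving the scraped rows is wanted; the
# filename and the csv approach are illustrative, not part of the original script):
#
# import csv
#
# with open("transactions.csv", "w", newline="", encoding="utf-8") as f:
#     writer = csv.writer(f)
#     for row in res_new_4:  # each row is a six-element list built by data_processing()
#         writer.writerow(row)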