"""Scrape second-hand housing listings from Lianjia (Hohhot site).

Requirement: starting from https://hhht.lianjia.com/ershoufang, collect the
first 5 result pages and save each listing's name, address, area and price.
Any parsing approach is acceptable.
"""
import csv

import requests
from fake_useragent import UserAgent
from lxml import etree


class house:
    """Fetch listing pages, parse them with XPath and append rows to a CSV.

    The public surface is ``readpage`` (download), ``jiexi`` (parse),
    ``save`` (write CSV) and ``run`` (drive the whole scrape).
    """

    # CSV column names; they are also the keys of every parsed listing dict.
    _FIELDS = ['房源', '位置', '户型信息', '价格']

    # Output file, created in the current working directory.
    _CSV_PATH = '二手房信息.csv'

    # Seconds to wait for the server before giving up on a request.
    _TIMEOUT = 10

    # NOTE(review): this is a browser session cookie captured at development
    # time; it presumably expires, so refresh it if requests start failing.
    _COOKIE = (
        'select_city=150100; '
        'lianjia_uuid=a45bb30e-c7c4-4afd-a5ae-efecea09527c; '
        'lianjia_ssid=bee89378-3a22-411f-8772-7157dce354cd; '
        'Hm_lvt_9152f8221cb6243a53c83b956842be8a=1649515501; '
        'UM_distinctid=1800ec9196d54-00a9907a23f12f-3a67551f-100200-1800ec9196e344; '
        'CNZZDATA1254525948=1381701028-1649509297-%7C1649509297; '
        'CNZZDATA1255633284=1386718929-1649513264-%7C1649513264; '
        'CNZZDATA1255604082=1695179702-1649512840-%7C1649512840; '
        '_smt_uid=62519bee.69f8de8; '
        '_jzqa=1.2671819174863935000.1649515504.1649515504.1649515504.1; '
        '_jzqc=1; '
        '_jzqckmp=1; '
        '_qzjc=1; '
        'sajssdk_2015_cross_new_user=1; '
        'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221800ec925604f-01ee9a279f0ad9-3a67551f-1049088-1800ec925613b6%22%2C%22%24device_id%22%3A%221800ec925604f-01ee9a279f0ad9-3a67551f-1049088-1800ec925613b6%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; '
        '_ga=GA1.2.2000272087.1649515507; '
        '_gid=GA1.2.1687020338.1649515507; '
        'Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1649517375; '
        '_qzja=1.2016957562.1649515504617.1649515504617.1649515504617.1649515871282.1649517375512.0.0.0.8.1; '
        '_qzjb=1.1649515504617.8.0.0.0; '
        '_qzjto=8.1.0; '
        '_jzqb=1.8.10.1649515504.1; '
        'srcid=eyJ0Ijoie1wiZGF0YVwiOlwiZjYwMGVkMDlkNjg3NmNhZDBlZTI0NGZmYmMwYzRkOTQ3ZjgxMWI5YTE4YzQ0ZWRiMGU1YmQxZTFlZmZjNTg0MmM1ZjM3MGQyMTY5ZmVlYzk3MjhhM2E5MjFlNTQ2YTUyNWZkNTgzYTczYTMzNDkxZDdkZmU3MmRjNTBjNjhjYTdmNjI5ZDI1ZjQ1OTkxZDcwMTBjMjNmNjhjZjRkNDI5ZjhkNWMwOTMyZDk3MDY3NDI3N2Y4NTg0YjE2YjRkZjU4OGQ3Y2ExMTllMjkyZjVjYWQwNGI0NWI2NGYxYzgzZGI1ZGU5NGM2OTQ3Y2Q2NDU0MjI0MmM2NGFkMjI3YTk1MFwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCIyYmEwYzY4OVwifSIsInIiOiJodHRwczovL2hoaHQubGlhbmppYS5jb20vZXJzaG91ZmFuZy9wZzEvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0='
    )

    def __init__(self):
        """Build the request headers (Chrome User-Agent plus session cookie).

        NOTE: an earlier draft experimented with routing requests through an
        HTTP proxy (checked against http://httpbin.org/ip); that code was
        disabled and no proxy is used.
        """
        self.headers = {
            'User-Agent': UserAgent().chrome,
            'Cookie': self._COOKIE,
        }

    def readpage(self, url):
        """Download ``url`` and return its body decoded as UTF-8.

        Raises:
            requests.RequestException: on connection failure, on timeout, or
                when the server answers with an HTTP error status.
        """
        # A timeout keeps a stalled connection from hanging the scrape forever.
        res = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page
        # into zero listings.
        res.raise_for_status()
        res.encoding = 'utf-8'
        return res.text

    @staticmethod
    def _parse_listing(li):
        """Turn one ``<li>`` element into a listing dict, or None if incomplete."""
        titles = li.xpath('.//div[@class="title"]/a/text()')
        # Each text query is evaluated once and the result reused.
        flood = li.xpath('.//div[@class="flood"]//text()')
        house_info = li.xpath('.//div[@class="houseInfo"]//text()')
        price_parts = li.xpath('.//div[@class="priceInfo"]//text()')

        # Items missing any expected node used to raise IndexError and abort
        # the whole run; skip them instead.
        if not titles or len(flood) < 3 or not house_info or not price_parts:
            return None

        title = titles[0].replace(' ', '')
        # Text nodes 0 and 2 of the "flood" block are joined; node 1 is skipped
        # (presumably the separator between the two links - confirm on the page).
        position = flood[0].strip() + '-' + flood[2].strip()
        layout = house_info[0].replace(' | ', ',')
        total_price = ''.join(price_parts[1:3])
        unit_price = price_parts[-1]
        price = '总价' + total_price + ',单价' + unit_price

        return {'房源': title, '位置': position, '户型信息': layout, '价格': price}

    def jiexi(self, html):
        """Parse one result page and return a list of listing dicts.

        Args:
            html: page source as returned by ``readpage``.

        Returns:
            A list of dicts keyed by '房源', '位置', '户型信息' and '价格'.
            Empty when the page has no parsable listings.
        """
        # etree.HTML may return None or raise for an empty document,
        # so bail out early.
        if not html or not html.strip():
            return []
        tree = etree.HTML(html)
        if tree is None:
            return []

        listings = []
        for li in tree.xpath('//ul[@class="sellListContent"]/li'):
            row = self._parse_listing(li)
            if row is not None:
                listings.append(row)
        return listings

    def save(self, data):
        """Append ``data`` (a list of listing dicts) to the CSV file.

        The header row is written only when the file is empty, so repeated
        runs no longer inject duplicate header lines into the middle of it.
        """
        with open(self._CSV_PATH, 'a', encoding='utf-8', newline='') as f:
            # Move explicitly to the end (whence=2 is SEEK_END) so tell()
            # reports the current file size on every platform.
            f.seek(0, 2)
            is_empty = f.tell() == 0

            writer = csv.DictWriter(f, fieldnames=self._FIELDS)
            if is_empty:
                writer.writeheader()
            writer.writerows(data)

    def run(self, pages=5):
        """Scrape result pages 1..``pages`` and save every listing found.

        Args:
            pages: number of result pages to fetch, starting at page 1.
                Defaults to 5.
        """
        datalist = []
        for page in range(1, pages + 1):
            url = f'https://hhht.lianjia.com/ershoufang/pg{page}/'
            datalist.extend(self.jiexi(self.readpage(url)))
        self.save(datalist)


if __name__ == '__main__':
    house().run()