爬房天下网站上的信息

url:https://jn.newhouse.fang.com/house/s/
通过对网站返回的 HTML 源码进行解析,使用 XPath 语句提取所需信息
xpath:https://www.w3school.com.cn/xpath/xpath_syntax.asp

  1. import time
  2. import requests,json
  3. from lxml import etree
  4. headers = {
  5. "Cookie":'''global_cookie=n0fzyms7fj8a1nuguwz5x0w2110l0j4w0wk; g_sourcepage=xf_lp%5Elb_pc'; csrfToken=paUPFPaTTTMJDqfRO-x1i7B2; unique_cookie=U_n0fzyms7fj8a1nuguwz5x0w2110l0j4w0wk*1; __utma=147393320.1775981245.1646804518.1646804518.1646804518.1; __utmc=147393320; __utmz=147393320.1646804518.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmt_t3=1; __utmt_t4=1; __utmb=147393320.5.10.1646804518''',
  6. "User-Agent":'''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'''
  7. }
  8. url = "https://jn.newhouse.fang.com/house/s/"
  9. for page in range(1,10):
  10. response =requests.get(url=url,headers=headers)
  11. response.encoding= response.apparent_encoding
  12. res = response.text
  13. html = etree.HTML(res)
  14. content =html.xpath('//div[@class="nlc_details"]')
  15. for i in content:
  16. time.sleep(4)
  17. name = i.xpath('.//div[@class="nlcd_name"]/a/text()')[0].strip()#分割函数,去除首尾的换行符和空格
  18. address = i.xpath('.//div[@class="address"]/a/@title')
  19. price = i.xpath('.//div[@class="nhouse_price"]/span/text()')
  20. print(name,address,price)