# Scrape new-house listing information from the Fang (fang.com) website.
# Listing index URL: https://jn.newhouse.fang.com/house/s/
# Fields are extracted from the page HTML source using XPath expressions.
# XPath syntax reference: https://www.w3school.com.cn/xpath/xpath_syntax.asp
import time
import json  # kept from the original import list (currently unused)

import requests
from lxml import etree

# Headers captured from a real browser session; fang.com tends to reject
# requests that arrive without a plausible User-Agent / Cookie.
HEADERS = {
    "Cookie": '''global_cookie=n0fzyms7fj8a1nuguwz5x0w2110l0j4w0wk; g_sourcepage=xf_lp%5Elb_pc'; csrfToken=paUPFPaTTTMJDqfRO-x1i7B2; unique_cookie=U_n0fzyms7fj8a1nuguwz5x0w2110l0j4w0wk*1; __utma=147393320.1775981245.1646804518.1646804518.1646804518.1; __utmc=147393320; __utmz=147393320.1646804518.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmt_t3=1; __utmt_t4=1; __utmb=147393320.5.10.1646804518''',
    "User-Agent": '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36''',
}

BASE_URL = "https://jn.newhouse.fang.com/house/s/"


def scrape() -> None:
    """Fetch listing-index pages 1-9 and print each listing's name, address and price.

    Output goes to stdout, one listing per line: ``name [address...] [price...]``.
    """
    for page in range(1, 10):
        # Bug fix: the original loop never used `page`, so it fetched the
        # same first page nine times. fang.com paginates the listing index
        # with a /b9{page}/ path suffix (page 1 is the bare index URL).
        # NOTE(review): suffix format assumed from fang.com convention — confirm.
        page_url = BASE_URL if page == 1 else f"{BASE_URL}b9{page}/"
        response = requests.get(url=page_url, headers=HEADERS)
        # The site does not always declare its charset; trust the detected one.
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        # One <div class="nlc_details"> per listing card on the index page.
        for card in html.xpath('//div[@class="nlc_details"]'):
            # Throttle between items to avoid being rate-limited/banned.
            time.sleep(4)
            # Guard against an empty match: the original indexed [0] blindly
            # and would raise IndexError on a card without a name anchor.
            names = card.xpath('.//div[@class="nlcd_name"]/a/text()')
            name = names[0].strip() if names else ""  # strip surrounding newlines/spaces
            address = card.xpath('.//div[@class="address"]/a/@title')
            price = card.xpath('.//div[@class="nhouse_price"]/span/text()')
            print(name, address, price)


if __name__ == "__main__":
    scrape()
