爬房天下网站上的信息

url:https://jn.newhouse.fang.com/house/s/
通过对网站返回的 HTML 源码进行解析,使用 XPath 语句提取所需信息
xpath:https://www.w3school.com.cn/xpath/xpath_syntax.asp

  1. import time
  2. import requests,json
  3. from lxml import etree
  4. headers = {
  5. "Cookie":'''global_cookie=n0fzyms7fj8a1nuguwz5x0w2110l0j4w0wk; g_sourcepage=xf_lp%5Elb_pc'; csrfToken=paUPFPaTTTMJDqfRO-x1i7B2; unique_cookie=U_n0fzyms7fj8a1nuguwz5x0w2110l0j4w0wk*1; __utma=147393320.1775981245.1646804518.1646804518.1646804518.1; __utmc=147393320; __utmz=147393320.1646804518.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; __utmt_t3=1; __utmt_t4=1; __utmb=147393320.5.10.1646804518''',
  6. "User-Agent":'''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'''
  7. }
  8. url = "https://jn.newhouse.fang.com/house/s/"
  9. for page in range(1,10):
  10. response =requests.get(url=url,headers=headers)
  11. response.encoding= response.apparent_encoding
  12. res = response.text
  13. html = etree.HTML(res)
  14. content =html.xpath('//div[@class="nlc_details"]')
  15. for i in content:
  16. time.sleep(4)
  17. name = i.xpath('.//div[@class="nlcd_name"]/a/text()')[0].strip()#分割函数,去除首尾的换行符和空格
  18. address = i.xpath('.//div[@class="address"]/a/@title')
  19. price = i.xpath('.//div[@class="nhouse_price"]/span/text()')
  20. print(name,address,price)