import requests
from lxml import etree
import csv
def read_page(url, headers):
    """Scrape one listing page and collect the download link of each entry.

    The listing page at ``url`` is fetched and its entry nodes are selected
    with ``//div[@id="main"]/div/div``. For every entry the first
    ``./a/@href`` link is followed (prefixed with ``https:``) and the first
    href under ``//ul[@class="clearfix"]/li[1]/a`` of that detail page is
    recorded.

    Args:
        url: Address of the listing page.
        headers: HTTP headers sent with every request.

    Returns:
        A list of dicts with the keys ``'url'`` (link taken from the detail
        page) and ``'name'`` (entry title with the "free download" marker
        text stripped). Entries lacking any of the required nodes are
        skipped instead of raising ``IndexError``.
    """
    # Without an explicit timeout requests may block forever on a stalled server.
    timeout = 10
    data_list = []
    res1 = requests.get(url, headers=headers, timeout=timeout)
    html1 = etree.HTML(res1.text)
    for div_tag in html1.xpath('//div[@id="main"]/div/div'):
        hrefs = div_tag.xpath('./a/@href')
        names = div_tag.xpath('./p/a/text()')
        if not hrefs or not names:
            # Not a complete entry node; skip it before issuing a request.
            continue
        url2 = "https:" + hrefs[0]
        res2 = requests.get(url2, headers=headers, timeout=timeout)
        html2 = etree.HTML(res2.text)
        links = html2.xpath('//ul[@class="clearfix"]/li[1]/a/@href')
        if not links:
            # Detail page has no link list; nothing to record for this entry.
            continue
        data_list.append({
            'url': links[0],
            'name': names[0].replace('免费下载', ''),
        })
    return data_list
def write(dig):
    """Write the scraped records to ``jianlimuban.csv``.

    The file is opened in ``'w'`` mode, so any existing content is replaced
    on every call.

    Args:
        dig: Iterable of dicts, each with the keys ``'url'`` and ``'name'``.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open('jianlimuban.csv', 'w', encoding='utf-8-sig', newline='') as f:
        wt = csv.DictWriter(f, fieldnames=['url', 'name'])
        wt.writeheader()
        # Write the rows that were passed in, not a same-named global.
        wt.writerows(dig)
# Headers sent with every request (a desktop Chrome User-Agent string).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
}

# Gather the records of listing pages 1-5 into one list and write the CSV a
# single time: write() reopens the file in 'w' mode, so writing per page
# would leave only the last page's rows in the file.
data_list = []
for i in range(1, 6):
    url = f'https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page={i}'
    data_list.extend(read_page(url, headers))
write(data_list)