# Target site: https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page=1
# Scraping requirements:
#   1. Page through the listing and fetch each page's HTML source.
#   2. Parse it with XPath to extract every template name and download link.
#   3. Save the data to a CSV file.
import csv
from threading import Lock, Thread

import requests
from fake_useragent import UserAgent
from lxml import etree
# Shared request headers: a random desktop Chrome User-Agent from fake_useragent
# helps avoid trivial bot blocking. (Original used typographic quotes ‘…’,
# which are a SyntaxError in Python — replaced with ASCII quotes.)
headers = {
    'User-Agent': UserAgent().chrome,
}
# Serializes CSV appends across the worker threads: concurrent unsynchronized
# writes to one file would interleave and corrupt rows.
_CSV_LOCK = Lock()

def pa_page(n):
    """Scrape page *n* of the chinaz template listing and append rows to 模板.csv.

    Each row holds a template name ("模板名") and its download link ("链接").
    The CSV header is written only when the file is empty, so repeated calls
    (one per page, across threads) no longer duplicate the header row.

    :param n: 1-based page number to fetch.
    """
    url = f"https://aspx.sc.chinaz.com/query.aspx?keyword=%E5%85%8D%E8%B4%B9&issale=&classID=864&page={n}"
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # Anchor elements whose text is the template name and whose href is the link.
    muban = html.xpath('//*[@id="container"]/div/p/a')
    data = [{"模板名": a.text, "链接": a.get('href')} for a in muban]
    with _CSV_LOCK:
        with open('模板.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['模板名', '链接'])
            if f.tell() == 0:  # file is empty -> write the header exactly once
                writer.writeheader()
            writer.writerows(data)
    print("数据已经写入成功!!!")
# Entry point: fetch listing pages 1..99, one worker thread per page.
# (Original `if name == ‘main‘` lost the dunder underscores and used
# typographic quotes — it would raise NameError/SyntaxError.)
if __name__ == '__main__':
    for page in range(1, 100):
        Thread(target=pa_page, args=(page,)).start()