# http://www.17989.com/xiaohua/5.htm
# 标题和正文
from lxml import etree
import requests
from fake_useragent import UserAgent
import csv
# 获取网页源代码
def download(url):
    """Fetch *url* and return the decoded HTML text.

    Sends a random User-Agent header (via fake_useragent) to reduce the
    chance of being blocked, and forces UTF-8 decoding of the body.

    :param url: absolute URL of the page to fetch
    :return: page body as a ``str``
    """
    headers = {
        'User-Agent': UserAgent().random,
    }
    # timeout so a stalled server cannot hang the scraper forever
    response = requests.get(url=url, headers=headers, timeout=10)
    # BUG FIX: original wrote 'utf-=8' (typo) — an unknown codec name,
    # so requests silently fell back to its guessed encoding and the
    # explicit UTF-8 override never took effect.
    response.encoding = 'utf-8'
    return response.text
# 获取内容
def get_data(html):
    """Parse joke titles and bodies from one listing page.

    Appends one ``{'title', 'content'}`` dict per entry to the
    module-level ``data_list`` accumulator and returns it.

    :param html: HTML source of a listing page
    :return: the shared ``data_list`` (also mutated in place)
    """
    xml = etree.HTML(html)
    for li in xml.xpath('//div[@class="module articlelist"]/ul/li'):
        title_nodes = li.xpath('.//div[@class="hd"]/text()')
        if not title_nodes:
            # Skip malformed entries instead of raising IndexError
            # (original indexed [0] unconditionally).
            continue
        content = li.xpath('.//pre/text()')
        data_list.append({
            'title': title_nodes[0],
            # body text is split across several text nodes; join with spaces
            'content': ' '.join(part.strip() for part in content),
        })
    return data_list
def saveData():
    """Write the accumulated ``data_list`` to ``xiaohua.csv``.

    The ``utf-8-sig`` encoding prepends a BOM so the Chinese text opens
    correctly in Excel; ``newline=''`` is the csv-module requirement.
    """
    fieldnames = ['title', 'content']
    with open('xiaohua.csv', 'w', encoding='utf-8-sig', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data_list)
# 获取翻页
def get_next_page(html):
    """Return the absolute URL of the next listing page, or ``None``.

    BUG FIX: the original indexed ``[0]`` unconditionally, so on the
    last page (no "下一页" link) it raised IndexError and the caller's
    ``if not url: break`` guard could never fire. Returning ``None``
    lets the caller stop cleanly.

    :param html: HTML source of the current listing page
    :return: next-page URL string, or ``None`` when there is no link
    """
    xml = etree.HTML(html)
    hrefs = xml.xpath('//div[@class="page"]/a[text()="下一页"]/@href')
    if not hrefs:
        return None
    return "http://www.17989.com" + hrefs[0]
if __name__ == '__main__':
    # Start from the first listing page.
    url = 'http://www.17989.com/xiaohua/1.htm'
    # Shared accumulator that get_data() appends to and saveData() writes.
    data_list = []
    # Scrape at most 99 pages (same bound as the original counter loop).
    for _ in range(99):
        html = download(url)
        get_data(html)
        url = get_next_page(html)
        # Stop when there is no next page to follow.
        if not url:
            break
    saveData()