# http://www.17989.com/xiaohua/5.htm
# 标题和正文
from lxml import etree
import requests
from fake_useragent import UserAgent
# Fetch the page source.
def download(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    A User-Agent string picked by ``fake_useragent`` is sent with the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper forever.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {'User-Agent': UserAgent().random}
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 decoding instead of relying on the encoding guessed from
    # the response headers.
    response.encoding = 'utf-8'
    return response.text
# Extract the entries.
def get_data(html):
    """Parse one listing page and collect the entries it contains.

    Every ``<li>`` under the ``<div class="module articlelist">`` list is
    treated as one entry. Its title is the first text node of the nested
    ``<div class="hd">`` and its content is the list of text nodes of the
    nested ``<pre>`` element(s).

    This function depends on two module-level names: ``i`` (the current page
    number, used only in the progress message) and ``data_list`` (the list
    that entries are appended to).

    Args:
        html: Page source as a string.

    Returns:
        The module-level ``data_list`` after this page's entries have been
        appended, each as ``{'title': ..., 'content': ...}``.
    """
    tree = etree.HTML(html)
    items = tree.xpath('//div[@class="module articlelist"]/ul/li')
    for j, li in enumerate(items, start=1):
        title_nodes = li.xpath('.//div[@class="hd"]/text()')
        # An entry without a title text node must not abort the whole
        # crawl with an IndexError; keep it with an empty title instead.
        title = title_nodes[0] if title_nodes else ''
        content = li.xpath('.//pre/text()')
        print(f'第{i}页、第{j}条——标题【{title}】 内容为:{content}')
        data_list.append({'title': title, 'content': content})
    return data_list
# Find the next page.
def get_next_page(html):
    """Return the absolute URL of the next listing page.

    The link is located by its visible text "下一页" inside the
    ``<div class="page">`` pager.

    Args:
        html: Page source as a string.

    Returns:
        The next page's URL, or ``None`` when the page has no such link
        (for example on the last page), so callers can stop on a falsy
        result instead of hitting an IndexError.
    """
    tree = etree.HTML(html)
    hrefs = tree.xpath('//div[@class="page"]/a[text()="下一页"]/@href')
    if not hrefs:
        return None
    # The href is concatenated onto the site root.
    return "http://www.17989.com" + hrefs[0]
if __name__ == '__main__':
    # `i` and `data_list` must stay module-level names: get_data() reads the
    # page number from `i` and appends its results to `data_list`.
    data_list = []
    url = 'http://www.17989.com/xiaohua/1.htm'
    # Crawl at most nine pages, following the "next page" link each time.
    for i in range(1, 10):
        page_source = download(url)
        get_data(page_source)
        url = get_next_page(page_source)
        if not url:
            break
    print(data_list)