# http://www.17989.com/xiaohua/5.htm
# Scrape the title and body of each joke
from lxml import etree
import requests
from fake_useragent import UserAgent

# Fetch the page source
def download(url):
    headers = {
        'User-Agent': UserAgent().random
    }
    html = requests.get(url=url, headers=headers)
    html.encoding = 'utf-8'
    return html.text

# Extract the title and body of every joke on the page
# (i and data_list are module-level globals set in the __main__ block below)
def get_data(html):
    xml = etree.HTML(html)
    li_list = xml.xpath('//div[@class="module articlelist"]/ul/li')
    for j, li in enumerate(li_list, start=1):
        title = li.xpath('.//div[@class="hd"]/text()')[0]
        content = li.xpath('.//pre/text()')
        print(f'Page {i}, item {j}: title [{title}], content: {content}')
        data_list.append({'title': title, 'content': content})
    return data_list

# Find the link to the next page; return None when there is no "下一页" (next page) link
def get_next_page(html):
    xml = etree.HTML(html)
    href = xml.xpath('//div[@class="page"]/a[text()="下一页"]/@href')
    if not href:
        return None
    return "http://www.17989.com" + href[0]

if __name__ == '__main__':
    url = 'http://www.17989.com/xiaohua/1.htm'
    i = 1            # current page number, read by get_data
    data_list = []   # accumulated results, appended to by get_data
    while i < 10:
        html = download(url)
        get_data(html)
        url = get_next_page(html)
        i += 1
        if not url:
            break
    print(data_list)