# http://www.17989.com/xiaohua/5.htm
# Title and body text
    3. from lxml import etree
    4. import requests
    5. from fake_useragent import UserAgent
    6. import csv
    7. # 获取网页源代码
    8. def download(url):
    9. headers = {
    10. 'User-Agent':UserAgent().random
    11. }
    12. html = requests.get(url=url,headers=headers)
    13. html.encoding='utf-=8'
    14. return html.text
    15. # 获取内容
    16. def get_data(html):
    17. xml = etree.HTML(html)
    18. li_list = xml.xpath('//div[@class="module articlelist"]/ul/li')
    19. j = 1
    20. for li in li_list:
    21. item = {}
    22. title = li.xpath('.//div[@class="hd"]/text()')[0]
    23. content = li.xpath('.//pre/text()')
    24. contentData = ' '.join([i.strip() for i in content])
    25. item['title'] = title
    26. item['content'] = contentData
    27. data_list.append(item)
    28. j += 1
    29. return data_list
    30. def saveData():
    31. headers = ['title','content']
    32. with open('xiaohua.csv', 'w', encoding='utf-8-sig', newline='') as file:
    33. dwriter = csv.DictWriter(file,headers)
    34. dwriter.writeheader()
    35. dwriter.writerows(data_list)
    36. # 获取翻页
    37. def get_next_page(html):
    38. xml = etree.HTML(html)
    39. href = xml.xpath('//div[@class="page"]/a[text()="下一页"]/@href')[0]
    40. return "http://www.17989.com" + href
    41. if __name__ == '__main__':
    42. url = 'http://www.17989.com/xiaohua/1.htm'
    43. i = 1
    44. data_list = []
    45. while i < 100:
    46. html = download(url)
    47. get_data(html)
    48. url = get_next_page(html)
    49. i += 1
    50. if not url:
    51. break
    52. saveData()