Scraping a Joke Website

Code

from lxml import etree
import requests
import re
import csv


class XiaoHua:
    # Initialization: request headers, start URL, result list and CSV column names
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
        }
        self.url = 'http://www.17989.com/xiaohua/1.htm'
        self.li = []
        self.header = ['title', 'section']

    # Send the request and return the decoded page source
    def read_url(self, url):
        res = requests.get(url, headers=self.headers)
        html = res.content.decode('utf-8')
        # print(html)
        return html

    # Parse the page with XPath
    def parse_html(self, html):
        tree = etree.HTML(html)
        li_tags = tree.xpath('//div[@class="module articlelist"]//li')
        # print(li_tags)
        for li_tag in li_tags:
            pat = '\r\n'
            item = {}
            title = li_tag.xpath('./div[@class="hd"]/text()')[0].strip()
            item['title'] = title
            section = li_tag.xpath('./pre/text()')[0].strip()
            item['section'] = re.sub(pat, '', section)
            # print(item)
            self.li.append(item)

    # Save the collected data to a CSV file
    def write_data(self):
        with open('笑话.csv', 'w', encoding='utf-8', newline='') as f:
            w = csv.DictWriter(f, self.header)
            w.writeheader()
            w.writerows(self.li)
            print("保存成功")

    # Main routine
    def main(self):
        # Request the first page
        html = self.read_url(self.url)
        # Parse it with XPath
        self.parse_html(html)
        # print(self.li)
        # Pagination
        # # Option 1: crawl every page
        # while True:
        #     tree2 = etree.HTML(html)
        #     next_url = 'http://www.17989.com/' + tree2.xpath('//a[text()="下一页"]/@href')[0]
        #     if next_url:
        #         # Request the next page
        #         html = self.read_url(next_url)
        #         # Parse it
        #         self.parse_html(html)
        #         print(next_url)
        #     else:
        #         break
        # Option 2: crawl a chosen number of pages, since crawling everything is too much
        num = int(input("请输入你要爬取的页数:"))
        for i in range(num - 1):
            tree2 = etree.HTML(html)
            next_url = 'http://www.17989.com/' + tree2.xpath('//a[text()="下一页"]/@href')[0]
            html = self.read_url(next_url)
            self.parse_html(html)
            print(next_url)
        # Save the results
        self.write_data()


# Entry point
if __name__ == '__main__':
    xh = XiaoHua()
    xh.main()
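
The core of the scraper is the XPath extraction in parse_html. The snippet below is a minimal, standalone sketch of that step so it can be tested without hitting the site; the sample_html string is only an assumption about the page markup, inferred from the XPath expressions used above, and the real page may differ.

# Standalone sketch of the XPath extraction; sample_html is assumed markup,
# reconstructed from the selectors '//div[@class="module articlelist"]//li',
# './div[@class="hd"]/text()' and './pre/text()'.
from lxml import etree

sample_html = """
<div class="module articlelist">
  <ul>
    <li>
      <div class="hd">Joke title one</div>
      <pre>Joke body one</pre>
    </li>
    <li>
      <div class="hd">Joke title two</div>
      <pre>Joke body two</pre>
    </li>
  </ul>
</div>
"""

tree = etree.HTML(sample_html)
for li_tag in tree.xpath('//div[@class="module articlelist"]//li'):
    # Title sits in the div.hd child, the joke text in the pre child
    title = li_tag.xpath('./div[@class="hd"]/text()')[0].strip()
    section = li_tag.xpath('./pre/text()')[0].strip()
    print(title, '->', section)

Running this prints one "title -> body" line per li element, which is exactly the dict structure (title, section) that parse_html appends to self.li before it is written to 笑话.csv.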

Results
