"""
Target site:
    https://www.shicimingju.com/book/sanguoyanyi.html
Task:
    Scrape every chapter title and chapter body of "三国演义" (Romance of the Three Kingdoms) with bs4.
Modules:
    requests, bs4
"""
import requests
from bs4 import BeautifulSoup
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
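# The User-Agent header makes the request look like an ordinary Chrome browser;
# without it some sites serve an error page to obvious script clients.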
# 01. Fetch the chapter table of contents from the index page
toc_text = requests.get(url, headers=headers).content.decode()
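# .content.decode() decodes the raw response bytes as UTF-8 directly,
# instead of relying on requests' charset guessing via response.text.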
# Instantiate a BeautifulSoup object and load the index page
soup = BeautifulSoup(toc_text, 'lxml')
a_list = soup.select('.book-mulu ul li a')  # a list of all the <a> tag objects in the table of contents
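# Each <a> carries the chapter title as its text and a relative link in its href,
# which is joined with the site root below to form the full chapter URL.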
f = open('三国演义.txt', 'w', encoding='utf-8')
for a in a_list:
    novel_title = a.text
    chapter_url = 'https://www.shicimingju.com' + a['href']
    # 02. Fetch the chapter page and extract the chapter body
    chapter_html = requests.get(chapter_url, headers=headers).content.decode()
    soup1 = BeautifulSoup(chapter_html, 'lxml')
    novel = soup1.find(class_='chapter_content').text
    f.write(novel_title + '\n' + novel + '\n')
    print(novel_title, 'downloaded')
print('All chapters have been downloaded.')
f.close()
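
# Running the script prints one line per chapter as it is saved and leaves the
# full text in 三国演义.txt next to the script.
# Optional (not part of the original script, just a politeness suggestion):
# adding `import time` at the top and `time.sleep(0.5)` at the end of the loop
# body would space out the chapter requests instead of firing them back to back.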