"""
目标网站:
https://www.shicimingju.com/book/sanguoyanyi.html
需求:
bs4爬取三国演义所有章节标题和内容
模块:
requests, bs4
"""
import requests, os
from bs4 import BeautifulSoup
# Index page of the book; every chapter link is discovered from here.
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
# A browser-like User-Agent is sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'}
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# One session is shared by all requests so the underlying connection is
# reused instead of being re-established for every chapter.
with requests.Session() as session:
    session.headers.update(headers)

    # 01. Fetch the table of contents from the index page.
    toc_response = session.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    toc_response.raise_for_status()
    toc_text = toc_response.content.decode()

    # Parse the page and collect every chapter link (a list of <a> tags).
    soup = BeautifulSoup(toc_text, 'lxml')
    a_list = soup.select('.book-mulu ul li a')
    if not a_list:
        # An empty result means the selector no longer matches the page;
        # stop here rather than writing an empty file and reporting success.
        raise RuntimeError('no chapter links found at ' + url)

    # 02. Download each chapter and append it to the output file. The
    # `with` block guarantees the file is flushed and closed even if a
    # request or a parse step raises part-way through.
    with open('三国演义.txt', 'w', encoding='utf-8') as f:
        for a in a_list:
            novel_title = a.text
            toc_url = 'https://www.shicimingju.com' + a['href']

            chapter_response = session.get(toc_url, timeout=REQUEST_TIMEOUT)
            chapter_response.raise_for_status()
            novel_text = chapter_response.content.decode()

            soup1 = BeautifulSoup(novel_text, 'lxml')
            content_tag = soup1.find(class_='chapter_content')
            if content_tag is None:
                # find() returns None when nothing matches; report which
                # chapter is affected instead of an opaque AttributeError.
                raise RuntimeError('chapter content not found at ' + toc_url)
            novel = content_tag.text

            # The trailing newline keeps the next chapter title from being
            # glued onto the last line of this chapter's body.
            f.write(novel_title + '\n' + novel + '\n')
            print(novel_title, '已下载完成')

print('全部章节已经下载完成~')