# Goal: scrape every chapter title and chapter body of the novel
# "Romance of the Three Kingdoms" from
# http://www.shicimingju.com/book/sanguoyanyi.html
#
# NOTE(review): `requests` and `BeautifulSoup` are used below but no import
# is visible in this part of the file -- confirm they are imported above.
if __name__ == "__main__":
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'

    # Fetch the table-of-contents page and hand the raw response bytes to
    # BeautifulSoup so the parser performs charset detection itself.
    # Decoding via `.text` and re-encoding as ISO-8859-1 only round-trips
    # when requests happened to decode the body as ISO-8859-1; with any other
    # decoding the Chinese text cannot be encoded and the call fails.
    page_bytes = requests.get(url=url, headers=headers).content
    soup = BeautifulSoup(page_bytes, 'lxml')

    # Every <li> matched here is expected to wrap one chapter link (<a>).
    li_list = soup.select('.book-mulu > ul > li')

    # `with` guarantees the output file is flushed and closed even if a
    # request or a parsing step raises part-way through the loop.
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            # Chapter title and absolute URL of the chapter's detail page.
            title = li.a.string
            detail_url = 'http://www.shicimingju.com' + li.a['href']

            # Request the detail page and parse the chapter text out of it.
            detail_bytes = requests.get(url=detail_url, headers=headers).content
            detail_soup = BeautifulSoup(detail_bytes, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            content = div_tag.text

            # One record per chapter: "<title>:<content>\n".
            fp.write(title + ':' + content + '\n')
            print(title, '爬取成功!!!')