I. Page Analysis

1. On the first-level page, grab each volume's title and the link to that volume (the XPath is sanity-checked in the shell sketch below):

```
a_list = //li[contains(@id, "menu-item-")]/a
for a in a_list:
    a.xpath('./text()')   # volume title
    a.xpath('./@href')    # volume link
```
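Before committing these XPaths to the spider, they can be tried interactively; a minimal check in `scrapy shell` (assuming the site's markup still matches this walkthrough) looks like:

```
$ scrapy shell 'http://daomubiji.com/'
>>> a_list = response.xpath('//li[contains(@id, "menu-item-")]/a')
>>> a_list[0].xpath('./text()').get()   # first volume title
>>> a_list[0].xpath('./@href').get()    # first volume link
```

The same session works for the XPaths in steps 2 and 3 below.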

2. On the second-level page, grab each chapter title and the link to the chapter's content page:

```
a_list = //article/a
for a in a_list:
    a.xpath('./text()')   # chapter title
    a.xpath('./@href')    # content-page link
```

3. On the third-level page, extract the chapter text itself:

```
//article[@class="article-content"]/p/text()
'\n'.join(['paragraph 1', 'paragraph 2'])   # join the paragraphs with newlines
```
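The same extraction can be sketched outside Scrapy with `requests` and `lxml` (both assumed installed; the URL below is a placeholder, not a real content page):

```python
import requests
from lxml import etree

chapter_url = 'http://daomubiji.com/'  # placeholder: substitute a real third-level page URL

html = etree.HTML(requests.get(chapter_url).text)
paragraphs = html.xpath('//article[@class="article-content"]/p/text()')
content = '\n'.join(paragraphs)  # one paragraph per line
print(content[:200])
```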

II. Writing the Code

1. Write the spider

```python
import os
import re
from copy import deepcopy

import scrapy
from daomubiji.items import DaomubijiItem


class DaomubijispiderSpider(scrapy.Spider):
    name = 'daomubijiSpider'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://daomubiji.com/']

    def parse(self, response):
        # Parse the first-level page: one <a> per volume
        a_list = response.xpath('//li[contains(@id,"menu-item-")]/a')
        for a in a_list:
            item = DaomubijiItem()
            # Volume title
            item['first_title'] = a.xpath('./text()').get()
            first_url = a.xpath('./@href').get()
            # Replace characters that are illegal in file names
            new_first_title = re.sub(r'[\\/:*?"<>|]', '_', item['first_title'])
            dir_path = './novel/{}/'.format(new_first_title)
            # Create the volume's directory if it does not exist yet
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # deepcopy: requests are handled asynchronously, so each callback
            # must get its own copy of the item rather than a shared one
            yield scrapy.Request(url=first_url, meta={'item': deepcopy(item)},
                                 callback=self.parse_second)

    def parse_second(self, response):
        # Parse the second-level page: one <a> per chapter
        item = response.meta.get('item')
        a_list = response.xpath('//article/a')
        for a in a_list:
            # Chapter title
            item['second_title'] = a.xpath('./text()').get()
            # URL of the third-level (content) page
            second_url = a.xpath('./@href').get()
            yield scrapy.Request(url=second_url, meta={'item': deepcopy(item)},
                                 callback=self.parse_content)

    def parse_content(self, response):
        # Parse the third-level page: the chapter text itself
        item = response.meta.get('item')
        content_list = response.xpath('//article[@class="article-content"]/p/text()').getall()
        item['content'] = '\n'.join(content_list)
        yield item
```
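With the spider in place, the crawl is started by the spider's `name` (assuming the usual layout from `scrapy startproject daomubiji`):

```
$ scrapy crawl daomubijiSpider
```

Note the `deepcopy(item)` in both `yield scrapy.Request(...)` lines: Scrapy schedules requests asynchronously, so without a copy every pending callback would share the single item object, and titles set in earlier iterations would be overwritten before they were used.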

2. Write items.py

```python
import scrapy


class DaomubijiItem(scrapy.Item):
    first_title = scrapy.Field()   # volume title (level-1 page)
    second_title = scrapy.Field()  # chapter title (level-2 page)
    content = scrapy.Field()       # chapter text (level-3 page)
```

3. Write the pipeline file (pipelines.py)

```python
import re


class DaomubijiPipeline:
    def process_item(self, item, spider):
        # Sanitize both titles the same way the spider did when it
        # created the directory, so the target path is sure to exist
        filename = './novel/{}/{}.txt'.format(
            re.sub(r'[\\/:*?"<>|]', '_', item['first_title']),
            re.sub(r'[\\/:*?"<>|]', '_', item['second_title'])
        )
        print('Writing:', filename)
        with open(filename, 'w', encoding='utf-8') as file_obj:
            file_obj.write(item['content'])
        return item
```
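The illegal-character substitution now lives in two places (the spider and the pipeline). A small helper, say a hypothetical `sanitize()` in a shared module, would keep the two in sync:

```python
import re

def sanitize(name: str) -> str:
    """Replace characters that are illegal in Windows file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)
```

Both `dir_path` in the spider and `filename` in the pipeline could then be built from `sanitize(item['first_title'])` and `sanitize(item['second_title'])`.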

4. Write settings.py

```python
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36',
}
ITEM_PIPELINES = {
    'daomubiji.pipelines.DaomubijiPipeline': 300,
}
LOG_LEVEL = 'WARNING'     # only show warnings and above
ROBOTSTXT_OBEY = False    # whether to obey the site's robots.txt
```
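If the site starts throttling the crawl, Scrapy's built-in `DOWNLOAD_DELAY` setting (not part of the original walkthrough) can space the requests out:

```python
DOWNLOAD_DELAY = 0.5  # optional: wait 0.5 s between requests
```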