1. Inherit from the scrapy_redis class and modify the spider

    import scrapy
    from daomu.items import DaomuItem
    import os
    import re
    from copy import deepcopy
    from scrapy_redis.spiders import RedisSpider  # import the scrapy_redis spider base class


    class DmSpider(RedisSpider):  # inherit from RedisSpider instead of scrapy.Spider
        name = 'dm'
        allowed_domains = ['daomubiji.com']
        # start_urls = ['http://daomubiji.com/']  # start_urls is removed
        redis_key = 'dmbj'  # the Redis list this spider reads its start URLs from

        def parse(self, response):
            # parse the first-level page
            a_list = response.xpath('//li[contains(@id, "menu-item-")]/a')
            for a in a_list:
                item = DaomuItem()
                # book title
                item['first_title'] = a.xpath('./text()').get()
                # URL of the second-level page
                first_url = a.xpath('./@href').get()
                # print(item, first_url)
                # build a directory name that is safe for the file system
                new_first_title = re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', item['first_title'])
                dir_path = './novel/{}/'.format(new_first_title)
                # create the directory if it does not exist yet
                if not os.path.exists(dir_path):
                    os.makedirs(dir_path)
                yield scrapy.Request(
                    url=first_url,
                    # deep-copy so later loop iterations do not overwrite this request's item
                    meta={'item': deepcopy(item)},
                    callback=self.parse_second
                )

        # parse the second-level page
        def parse_second(self, response):
            item = response.meta.get('item')
            a_list = response.xpath('//article/a')
            for a in a_list:
                # chapter title
                item['second_title'] = a.xpath('./text()').get()
                # URL of the third-level page
                second_url = a.xpath('./@href').get()
                # print(item, second_url)
                yield scrapy.Request(
                    url=second_url,
                    meta={'item': deepcopy(item)},
                    callback=self.parse_content
                )

        # parse the third-level page (chapter content)
        def parse_content(self, response):
            item = response.meta.get('item')
            content_list = response.xpath('//article[@class="article-content"]/p/text()').getall()
            item['content'] = '\n'.join(content_list)
            yield item
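
Because start_urls has been removed, every worker started with scrapy crawl dm sits idle until a start URL is pushed onto the dmbj list in Redis, for example with redis-cli: lpush dmbj http://daomubiji.com/. The snippet below is a minimal sketch of doing the same seeding from Python with the redis-py client; the localhost host and port are assumptions about where your Redis server runs.

    import redis

    # Assumes Redis is reachable on localhost:6379 -- adjust for your environment.
    r = redis.Redis(host='127.0.0.1', port=6379)
    # Push the seed URL onto the list named by redis_key ('dmbj');
    # any idle worker will pop it and start crawling.
    r.lpush('dmbj', 'http://daomubiji.com/')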

2. The items.py file

    import scrapy


    class DaomuItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        first_title = scrapy.Field()
        second_title = scrapy.Field()
        content = scrapy.Field()

3. The pipeline file pipelines.py

    import re


    class DaomuPipeline:
        def process_item(self, item, spider):
            # strip characters that are illegal in file names from both titles
            filename = './novel/{}/{}.txt'.format(
                re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', item['first_title']),
                re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', item['second_title'])
            )
            print('Writing:', filename)
            with open(filename, 'w', encoding='utf-8') as file_obj:
                file_obj.write(item['content'])
            return item
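
The pipeline above assumes the ./novel/<first_title>/ directory was already created in parse(). In a distributed run a worker can receive chapter items without ever having handled a first-level page, in which case open() would fail. The sketch below is one possible hardening of the same pipeline (safe_name is just an illustrative helper introduced here); the only behavioral change is creating the directory with os.makedirs(..., exist_ok=True) before writing.

    import os
    import re


    def safe_name(name):
        # replace characters that are illegal in Windows file names
        return re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', name)


    class DaomuPipeline:
        def process_item(self, item, spider):
            dir_path = './novel/{}'.format(safe_name(item['first_title']))
            # create the directory here too, in case this worker never ran parse()
            os.makedirs(dir_path, exist_ok=True)
            filename = '{}/{}.txt'.format(dir_path, safe_name(item['second_title']))
            with open(filename, 'w', encoding='utf-8') as file_obj:
                file_obj.write(item['content'])
            return item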

4. Modify the configuration file settings.py

    # Use a real browser User-Agent
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'

    # Deduplicate request fingerprints through the scrapy_redis dupefilter
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

    # Use the scrapy_redis scheduler
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"

    # Whether the request queue and fingerprints stored in Redis persist:
    # True  - the data stays in Redis after the spider stops
    # False - it is cleared
    SCHEDULER_PERSIST = True

    ITEM_PIPELINES = {
        'daomu.pipelines.DaomuPipeline': 300,
        # also store scraped items in Redis
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }

    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
    }
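
These settings tell Scrapy to schedule and deduplicate through Redis, but not where Redis lives; scrapy_redis falls back to localhost:6379 when nothing is configured. If the shared Redis server runs on another machine, its address can also go into settings.py. The host and port below are placeholders for your own environment.

    # Where scrapy_redis should connect (defaults to localhost:6379 when omitted).
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
    # or, equivalently, a single connection URL:
    # REDIS_URL = 'redis://127.0.0.1:6379'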