1. Inherit from the scrapy_redis spider class and modify the spider
import scrapy
from daomu.items import DaomuItem
import os
import re
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider  # import the scrapy_redis spider class


class DmSpider(RedisSpider):  # inherit from RedisSpider instead of scrapy.Spider
    name = 'dm'
    allowed_domains = ['daomubiji.com']
    # start_urls = ['http://daomubiji.com/']  # delete start_urls
    redis_key = 'dmbj'  # add redis_key: the spider pulls its start URLs from this Redis list

    def parse(self, response):
        # parse the first-level page
        a_list = response.xpath('//li[contains(@id, "menu-item-")]/a')
        for a in a_list:
            item = DaomuItem()
            # book title
            item['first_title'] = a.xpath('./text()').get()
            # URL of the second-level page
            first_url = a.xpath('./@href').get()
            # print(item, first_url)
            # build the directory name, replacing characters that are illegal in file names
            new_first_title = re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', item['first_title'])
            dir_path = './novel/{}/'.format(new_first_title)
            # create the directory if it does not exist yet
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            yield scrapy.Request(
                url=first_url,
                # deep-copy the item so each request carries its own snapshot
                meta={'item': deepcopy(item)},
                callback=self.parse_second
            )

    # parse the second-level page
    def parse_second(self, response):
        item = response.meta.get('item')
        a_list = response.xpath('//article/a')
        for a in a_list:
            # chapter title
            item['second_title'] = a.xpath('./text()').get()
            # URL of the third-level page
            second_url = a.xpath('./@href').get()
            # print(item, second_url)
            yield scrapy.Request(
                url=second_url,
                # the same item object is reused across the loop, so deep-copy
                # it to keep one chapter from overwriting another
                meta={'item': deepcopy(item)},
                callback=self.parse_content
            )

    # parse the third-level page
    def parse_content(self, response):
        item = response.meta.get('item')
        content_list = response.xpath('//article[@class="article-content"]/p/text()').getall()
        item['content'] = '\n'.join(content_list)
        yield item
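Unlike a regular spider, a RedisSpider does not start crawling on its own: after you launch it with scrapy crawl dm (on one machine or several, all pointing at the same Redis server), it blocks and waits until a start URL is pushed onto the list named by redis_key. A minimal sketch of seeding that list with redis-py, assuming the redis package is installed and Redis runs locally on the default port:

import redis

# connect to the shared Redis instance (adjust host/port for your setup)
r = redis.Redis(host='localhost', port=6379)

# push the seed URL onto the 'dmbj' list; every waiting spider
# instance then pulls requests from this shared queue
r.lpush('dmbj', 'http://daomubiji.com/')

The same thing can be done from the command line with redis-cli: lpush dmbj http://daomubiji.com/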
2. The items.py file
import scrapy


class DaomuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_title = scrapy.Field()
    second_title = scrapy.Field()
    content = scrapy.Field()
3. The pipeline file pipelines.py
import re


class DaomuPipeline:
    def process_item(self, item, spider):
        # sanitize both titles so they are valid file-name components
        filename = './novel/{}/{}.txt'.format(
            re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', item['first_title']),
            re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', item['second_title'])
        )
        print('Writing:', filename)
        with open(filename, 'w', encoding='utf-8') as file_obj:
            file_obj.write(item['content'])
        return item
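One caveat for distributed runs: the directory under ./novel/ is created in the spider's parse method, so a node that starts mid-crawl on another machine may try to write into a directory that does not exist locally. A hedged variant that creates the directory in the pipeline itself (the safe_name helper is ours, not part of the original project):

import os
import re


def safe_name(title):
    # replace characters that are illegal in Windows file names
    return re.sub(r'[\\\/\:\*\?\"\<\>\|]', '_', title)


class DaomuPipeline:
    def process_item(self, item, spider):
        dir_path = './novel/{}/'.format(safe_name(item['first_title']))
        os.makedirs(dir_path, exist_ok=True)  # tolerate concurrent workers
        filename = dir_path + '{}.txt'.format(safe_name(item['second_title']))
        with open(filename, 'w', encoding='utf-8') as file_obj:
            file_obj.write(item['content'])
        return item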
4. Modify the configuration file settings.py
# set a real User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'

# deduplicate requests with the scrapy_redis fingerprint filter (stored in Redis)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# use the scrapy_redis scheduler, which keeps the request queue in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# whether the queue and fingerprint set persist in Redis after the spider closes:
# True keeps the data, False clears it
SCHEDULER_PERSIST = True

ITEM_PIPELINES = {
    'daomu.pipelines.DaomuPipeline': 300,
    # also save the scraped items to Redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
}
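The settings above do not say where Redis lives; scrapy_redis defaults to localhost:6379. For an actual multi-machine deployment, every node must point at the same shared server. A short sketch of the connection settings (the address below is a placeholder for your own server):

# address of the shared Redis server (placeholder; defaults to localhost:6379 if omitted)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# or equivalently as a single URL:
# REDIS_URL = 'redis://127.0.0.1:6379'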