1、继承scrapy_redis类并修改
import scrapy
from daomu.items import DaomuItem
import os
import re
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider # 导入类包
class DmSpider(RedisSpider):  # RedisSpider reads start requests from Redis instead of start_urls
    """Distributed spider for daomubiji.com.

    Crawl flow:
        parse         -> level-1 index page: one item per book (first_title)
        parse_second  -> level-2 page: chapter links (second_title)
        parse_content -> level-3 page: chapter text (content)

    Start the crawl by pushing a seed URL onto the Redis list named by
    ``redis_key`` (e.g. ``lpush dmbj http://daomubiji.com/``).
    """

    name = 'dm'
    allowed_domains = ['daomubiji.com']
    # No start_urls: scrapy-redis feeds requests from this Redis key instead.
    redis_key = 'dmbj'

    # Characters forbidden in Windows filenames; compiled once, shared by all
    # sanitization calls (the original rebuilt the pattern on every page).
    _ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')

    @classmethod
    def _sanitize(cls, title):
        """Return *title* with filesystem-unsafe characters replaced by '_'."""
        return cls._ILLEGAL_CHARS.sub('_', title)

    def parse(self, response):
        """Parse the level-1 index page (site menu) and follow each book link."""
        for a in response.xpath('//li[contains(@id, "menu-item-")]/a'):
            item = DaomuItem()
            # Book title shown in the menu.
            item['first_title'] = a.xpath('./text()').get()
            first_url = a.xpath('./@href').get()
            # Pre-create the per-book output directory the pipeline writes into.
            # (Fixed: the sanitized name was mis-named "new_second_title" even
            # though it derives from first_title.)
            safe_first_title = self._sanitize(item['first_title'])
            os.makedirs('./novel/{}/'.format(safe_first_title), exist_ok=True)
            yield scrapy.Request(
                url=first_url,
                # deepcopy: requests are handled asynchronously, so each one
                # needs its own independent snapshot of the item.
                meta={'item': deepcopy(item)},
                callback=self.parse_second,
            )

    def parse_second(self, response):
        """Parse a level-2 book page and follow each chapter link."""
        item = response.meta.get('item')
        for a in response.xpath('//article/a'):
            # Chapter title.
            item['second_title'] = a.xpath('./text()').get()
            second_url = a.xpath('./@href').get()
            yield scrapy.Request(
                url=second_url,
                # deepcopy: the same item object is mutated each iteration.
                meta={'item': deepcopy(item)},
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Parse a level-3 chapter page and yield the completed item."""
        item = response.meta.get('item')
        paragraphs = response.xpath(
            '//article[@class="article-content"]/p/text()').getall()
        item['content'] = '\n'.join(paragraphs)
        yield item
2、items.py文件
class DaomuItem(scrapy.Item):
    """Container for one scraped chapter of daomubiji.com.

    Fields are filled in progressively as the item travels through the
    spider's three parse levels.
    """

    # Book title (from the level-1 menu page).
    first_title = scrapy.Field()
    # Chapter title (from the level-2 book page).
    second_title = scrapy.Field()
    # Full chapter text, paragraphs joined with newlines (level-3 page).
    content = scrapy.Field()
    # (Removed the dead trailing `pass` — it was unreachable boilerplate.)
3、管道文件 pipeline.py
class DaomuPipeline:
    """Write each chapter item to ``./novel/<book>/<chapter>.txt``."""

    # Characters forbidden in Windows filenames; compiled once at class
    # creation instead of twice per item as in the original.
    _ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')

    def process_item(self, item, spider):
        """Persist *item* to a UTF-8 text file and pass it on unchanged.

        The target directory is created if missing, so this pipeline no
        longer depends on the spider's ``parse`` having run on the same
        machine (in a scrapy-redis cluster it may not have).
        """
        filename = './novel/{}/{}.txt'.format(
            self._ILLEGAL_CHARS.sub('_', item['first_title']),
            self._ILLEGAL_CHARS.sub('_', item['second_title']),
        )
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print('正在写入:', filename)
        with open(filename, 'w', encoding='utf-8') as file_obj:
            file_obj.write(item['content'])
        return item
4、修改配置文件settings.py
# 需要改
# Identify as a regular browser; many sites reject Scrapy's default UA.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
# Deduplicate request fingerprints through Redis so all spider nodes
# share a single dupe filter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the Redis-backed scheduler: one shared request queue for the cluster.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether the Redis queue/dupefilter survive a shutdown:
#   True  -> data is kept when Redis/the spider stops (crawl is resumable)
#   False -> data is flushed
SCHEDULER_PERSIST = True
ITEM_PIPELINES = {
    'daomu.pipelines.DaomuPipeline': 300,
    # Also push every scraped item into Redis (scrapy-redis built-in pipeline).
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
}