I. Page analysis:
1. On the first-level page, extract the section titles and the section links:

a_list = //li[contains(@id, "menu-item-")]/a
for a in a_list:
    a.xpath('./text()')   # section title
    a.xpath('./@href')    # section link
2. On the second-level page, extract the chapter subtitles and the links to the novel content:
a_list = //article/a
for a in a_list:
    a.xpath('./text()')
    a.xpath('./@href')
3. On the third-level page, extract the actual novel text (a quick interactive check of all three XPaths is sketched below):

//article[@class="article-content"]/p/text()
'\n'.join(['paragraph 1', 'paragraph 2'])   # join the paragraph texts with newlines
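Before wiring these XPaths into a spider, they can be verified against a live page. Below is a minimal sketch using requests plus parsel (the selector library that ships with Scrapy); the level-1 XPath is taken from step 1 above:

import requests
from parsel import Selector

# Fetch the front page and try the level-1 XPath from the analysis.
html = requests.get('http://daomubiji.com/').text
sel = Selector(text=html)
for a in sel.xpath('//li[contains(@id,"menu-item-")]/a'):
    print(a.xpath('./text()').get(), a.xpath('./@href').get())

The same check works for the level-2 and level-3 XPaths; alternatively, scrapy shell <url> offers the same Selector interactively.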
II. Writing the code
1. Write the Scrapy spider
import scrapy
import os
import re
from copy import deepcopy

from daomubiji.items import DaomubijiItem


class DaomubijispiderSpider(scrapy.Spider):
    name = 'daomubijiSpider'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://daomubiji.com/']

    def parse(self, response):
        # Parse the first-level page
        a_list = response.xpath('//li[contains(@id,"menu-item-")]/a')
        for a in a_list:
            item = DaomubijiItem()
            # Section title
            item['first_title'] = a.xpath('./text()').get()
            first_url = a.xpath('./@href').get()
            # Replace characters that are illegal in file names
            new_first_title = re.sub(r'[\\/:*?"<>|]', '_', item['first_title'])
            dir_path = './novel/{}/'.format(new_first_title)
            # Create the directory if it does not exist yet
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            yield scrapy.Request(url=first_url, meta={'item': deepcopy(item)}, callback=self.parse_second)

    def parse_second(self, response):
        # Parse the second-level page
        item = response.meta.get('item')
        a_list = response.xpath('//article/a')
        for a in a_list:
            # Chapter subtitle
            item['second_title'] = a.xpath('./text()').get()
            # URL of the third-level (content) page
            second_url = a.xpath('./@href').get()
            yield scrapy.Request(url=second_url, meta={'item': deepcopy(item)}, callback=self.parse_content)

    def parse_content(self, response):
        # Parse the third-level page: collect all paragraphs and join them
        item = response.meta.get('item')
        content_list = response.xpath('//article[@class="article-content"]/p/text()').getall()
        item['content'] = '\n'.join(content_list)
        yield item
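One point worth calling out is the deepcopy(item) in both Request calls: meta passes the item by reference, so without a copy every pending request would mutate and yield the same object. A minimal, runnable illustration with a plain dict (the titles are made-up values):

from copy import deepcopy

# Dicts (and scrapy Items) are passed by reference, so each queued
# request needs its own snapshot of the item's current state.
item = {'first_title': 'Volume 1'}
queued = []
for title in ['Chapter 1', 'Chapter 2']:
    item['second_title'] = title
    queued.append(deepcopy(item))           # snapshot per request
print([q['second_title'] for q in queued])  # ['Chapter 1', 'Chapter 2']

Without the deepcopy, both entries would reference one dict and print ['Chapter 2', 'Chapter 2'].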
2. Write items.py
import scrapy


class DaomubijiItem(scrapy.Item):
    first_title = scrapy.Field()   # section title (level 1)
    second_title = scrapy.Field()  # chapter subtitle (level 2)
    content = scrapy.Field()       # novel text (level 3)
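A scrapy.Item behaves like a dict restricted to its declared fields, which catches typos early. A small illustration (the assigned values are hypothetical):

from daomubiji.items import DaomubijiItem

item = DaomubijiItem()
item['first_title'] = 'Volume 1'    # hypothetical value
item['second_title'] = 'Chapter 1'  # hypothetical value
# item['frist_title'] = '...'  would raise KeyError: unknown field
print(dict(item))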
3. Write the pipeline file
import re


class DaomubijiPipeline:
    def process_item(self, item, spider):
        # Sanitize both titles so they form a valid file path
        filename = './novel/{}/{}.txt'.format(
            re.sub(r'[\\/:*?"<>|]', '_', item['first_title']),
            re.sub(r'[\\/:*?"<>|]', '_', item['second_title'])
        )
        print('Writing:', filename)
        with open(filename, 'w', encoding='utf-8') as file_obj:
            file_obj.write(item['content'])
        return item
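The illegal-character regex now appears in both the spider and the pipeline; a small shared helper keeps the two in sync. The module and function names below are hypothetical, not part of the original project:

# daomubiji/utils.py  (hypothetical helper module)
import re

_ILLEGAL = re.compile(r'[\\/:*?"<>|]')

def sanitize(name):
    # Replace characters that are invalid in Windows file names.
    return _ILLEGAL.sub('_', name)

Both files could then use from daomubiji.utils import sanitize instead of repeating the pattern.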
4. Write the settings.py file
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36',
}

ITEM_PIPELINES = {
    'daomubiji.pipelines.DaomubijiPipeline': 300,
}

LOG_LEVEL = 'WARNING'
ROBOTSTXT_OBEY = False  # do not obey robots.txt for this crawl
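With ROBOTSTXT_OBEY turned off, it is polite to throttle the crawl. These are standard Scrapy settings that could optionally be added to settings.py; the exact values are only suggestions:

# Optional throttling (standard Scrapy settings; values are suggestions)
DOWNLOAD_DELAY = 1                    # pause between requests, in seconds
CONCURRENT_REQUESTS_PER_DOMAIN = 4    # limit parallelism per domain
AUTOTHROTTLE_ENABLED = True           # adapt the delay to server latency

The spider is then started from the project root with scrapy crawl daomubijiSpider, the name declared in the spider class.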