一、页面分析:
1、需要在一级页面获取:章节大标题和大章节链接
a_list = //li[contains(@id,"menu-item-")]/a
for a in a_list:
a.xpath('./text()')
a.xpath('./@href')
2、在二级页面获取 小标题以及跳转到小说内容链接。
a_list = //article/a
for a in a_list:
a.xpath('./text()')
a.xpath('./@href')
3、在三级页面获取到小说具体内容。
//article[@class="article-content"]/p/text()
'\n'.join(['段落1','段落2'])拼接段落内容
二、编写代码
1、编写 scrapy 爬虫文件(spider)
import scrapy
from daomubiji.items import DaomubijiItem
import os
import re
from copy import deepcopy
class DaomubijispiderSpider(scrapy.Spider):
    """Three-level crawler for daomubiji.com.

    Level 1 (menu)    -> section title + section URL
    Level 2 (section) -> chapter title + chapter URL
    Level 3 (chapter) -> chapter text, yielded as a DaomubijiItem
    """
    name = 'daomubijiSpider'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://daomubiji.com/']

    # Characters that are illegal in Windows file/directory names,
    # replaced with '_' before the title is used as a path component.
    _illegal_chars = re.compile(r'[\\/:*?"<>|]')

    def parse(self, response):
        """Parse the first-level page: one item per menu section."""
        for a in response.xpath('//li[contains(@id,"menu-item-")]/a'):
            item = DaomubijiItem()
            item['first_title'] = a.xpath('./text()').get()
            first_url = a.xpath('./@href').get()
            # Guard: re.sub / Request would raise on a None title or href.
            if not item['first_title'] or not first_url:
                continue
            safe_first_title = self._illegal_chars.sub('_', item['first_title'])
            dir_path = './novel/{}/'.format(safe_first_title)
            # exist_ok avoids the check-then-create race of os.path.exists.
            os.makedirs(dir_path, exist_ok=True)
            yield scrapy.Request(
                url=response.urljoin(first_url),
                meta={'item': deepcopy(item)},
                callback=self.parse_second,
            )

    def parse_second(self, response):
        """Parse a section page: one request per chapter link."""
        item = response.meta.get('item')
        for a in response.xpath('//article/a'):
            item['second_title'] = a.xpath('./text()').get()
            second_url = a.xpath('./@href').get()
            if not item['second_title'] or not second_url:
                continue
            # deepcopy: the same item object is mutated on every iteration,
            # so each scheduled request must carry its own snapshot.
            yield scrapy.Request(
                url=response.urljoin(second_url),
                meta={'item': deepcopy(item)},
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Parse a chapter page: join paragraph texts into the chapter body."""
        item = response.meta.get('item')
        paragraphs = response.xpath(
            '//article[@class="article-content"]/p/text()').getall()
        item['content'] = '\n'.join(paragraphs)
        yield item
2、编写items.py
class DaomubijiItem(scrapy.Item):
    """Container for one crawled chapter of the novel."""
    # Section (book) title, taken from the first-level menu page.
    first_title = scrapy.Field()
    # Chapter title, taken from the second-level section page.
    second_title = scrapy.Field()
    # Full chapter text: paragraphs joined with newlines.
    content = scrapy.Field()
3、编写管道文件
import os
import re
class DaomubijiPipeline:
    """Write each crawled chapter to ./novel/<section>/<chapter>.txt."""

    # Characters illegal in Windows file names; compiled once, reused per item.
    _ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')

    def _safe_name(self, name):
        """Replace file-system-unsafe characters with underscores."""
        return self._ILLEGAL_CHARS.sub('_', name)

    def process_item(self, item, spider):
        """Persist one chapter to disk; return the item for later pipelines."""
        dir_path = './novel/{}'.format(self._safe_name(item['first_title']))
        # Create the directory here too, so the pipeline does not silently
        # depend on the spider having created it first (FileNotFoundError).
        os.makedirs(dir_path, exist_ok=True)
        filename = '{}/{}.txt'.format(dir_path, self._safe_name(item['second_title']))
        print('正在写入:', filename)
        with open(filename, 'w', encoding='utf-8') as file_obj:
            file_obj.write(item['content'])
        return item
4、编写settings.py文件
# Default headers sent with every request; a real desktop User-Agent
# avoids the most basic bot blocking.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
}
# Enable the file-writing pipeline (lower number = earlier in the chain).
ITEM_PIPELINES = {
    'daomubiji.pipelines.DaomubijiPipeline': 300,
}
# Suppress INFO-level crawl noise; only warnings and errors are logged.
LOG_LEVEL = 'WARNING'
ROBOTSTXT_OBEY = False  # whether to obey the site's robots.txt