URL: https://www.shicimingju.com/paiming

1. Analyze the request pages

Page 1: https://www.shicimingju.com/paiming
Page 2: https://www.shicimingju.com/paiming?p=2
Page 3: https://www.shicimingju.com/paiming?p=3
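
The pattern is simple: page 1 has no query string, and every later page appends ?p=N. A minimal sketch of building the full URL list (the 100-page limit is an assumption that matches the spider below):

base = 'https://www.shicimingju.com/paiming'
urls = [base] + [f'{base}?p={n}' for n in range(2, 101)]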

2. Create the project

  1. scrapy startproject poetry
  2. cd poetry
  3. scrapy genspider poetrySpider www.shicimingju.com
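
After these commands, Scrapy generates the standard project skeleton (shown here for orientation; the spider file name follows the genspider argument):

poetry/
    scrapy.cfg
    poetry/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            poetrySpider.py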

3. Code

1. Create a middleware that sets random request headers (middlewares.py)

from fake_useragent import UserAgent

class RandomHeaderMiddleware(object):

    def __init__(self, crawler):
        self.ua = UserAgent()
        # Read the UA type from settings; fall back to 'chrome' if it is not set
        self.type = crawler.settings.get('RANDOM_UA_TYPE', 'chrome')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    # Called for every outgoing request
    def process_request(self, request, spider):
        # setdefault() only sets the header if it is not already present
        request.headers.setdefault('User-Agent', getattr(self.ua, self.type))
  Note:
  Enable the middleware in settings.py and set the RANDOM_UA_TYPE value:

  RANDOM_UA_TYPE = 'random'
  DOWNLOADER_MIDDLEWARES = {
      # 'poetry.middlewares.PoetryDownloaderMiddleware': 543,
      'poetry.middlewares.RandomHeaderMiddleware': 300,
  }
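
To see what the middleware will inject, fake_useragent can be exercised on its own; a minimal sketch (the 'random' attribute corresponds to the RANDOM_UA_TYPE value above):

from fake_useragent import UserAgent

ua = UserAgent()
# getattr(ua, 'random') is what process_request() resolves when RANDOM_UA_TYPE = 'random'
print(getattr(ua, 'random'))  # prints a different browser User-Agent string on each call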

2. Write the items.py file

In this file, define an Item class that holds the poem title, author, and content.

import scrapy

class PoetryItem(scrapy.Item):
    title = scrapy.Field()    # poem title
    author = scrapy.Field()   # poem author
    content = scrapy.Field()  # poem text
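
A scrapy.Item behaves like a dict, which is what the pipeline below relies on when it calls dict(item); a quick sketch with made-up values:

from poetry.items import PoetryItem

item = PoetryItem(title='静夜思', author='李白', content='床前明月光...')
print(dict(item))  # {'title': '静夜思', 'author': '李白', 'content': '床前明月光...'}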

3. Write the pipeline file pipelines.py

Implement the MySQL database operations in the item pipeline and override the process_item() method.

import pymysql

class PoetryPipeline:

    def __init__(self, host, user, password, database, port):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Return a cls() instance built from the database parameters in settings.py
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT')
        )

    def open_spider(self, spider):
        # Open the MySQL connection when the spider starts
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # Close the connection when the spider finishes
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)  # convert the item to a dict
        # SQL insert statement
        sql = 'insert into poetry (title,author,content) values(%s,%s,%s)'
        # Insert one row per item
        self.cursor.execute(sql, (data['title'], data['author'], data['content']))
        self.db.commit()  # commit the transaction
        return item  # pass the item on
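
The pipeline assumes a poetry table already exists in the news_data database. A minimal sketch that creates it with pymysql, using the same connection parameters as settings.py below (the column sizes are an assumption):

import pymysql

db = pymysql.connect(host='192.168.0.113', user='root', password='root',
                     database='news_data', port=3306)
with db.cursor() as cursor:
    cursor.execute(
        'CREATE TABLE IF NOT EXISTS poetry ('
        'id INT PRIMARY KEY AUTO_INCREMENT, '
        'title VARCHAR(255), '
        'author VARCHAR(255), '
        'content TEXT)'
    )
db.commit()
db.close()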

4. Write the spider file

import scrapy
from redis import Redis
from poetry.items import PoetryItem
import re

class PoetryspiderSpider(scrapy.Spider):
    name = 'poetrySpider'
    allowed_domains = ['www.shicimingju.com']
    start_urls = ['https://www.shicimingju.com/paiming/']

    def start_requests(self):
        conn = Redis(host='192.168.0.113', port=6379)
        for i in range(1, 101):
            url = self.start_urls[0] + "?p={page}".format(page=i)
            add = conn.sadd('poetry_url', url)  # add the URL to a Redis set
            if add == 1:  # set members are unique: sadd returns 1 if the URL is new, 0 otherwise
                yield scrapy.Request(url=url, callback=self.parse)
            else:
                print(f'Page {i} has already been requested, skipping it')

    def parse(self, response):
        item = PoetryItem()  # create the item object
        shici_all = response.css('.card.shici_card')  # all poem cards on the page
        for shici in shici_all:  # iterate over every poem on the page
            title = shici.css('h3 a::text').get()  # poem title
            author = shici.xpath('./div[@class="list_num_info"]') \
                .xpath('string()').get()  # poem author
            author = author.strip()  # strip surrounding whitespace
            content = shici.css('.shici_content').xpath('string()').getall()[0]
            if '展开全文' in content:  # content rendered with the "expand full text" toggle
                content = re.sub(' |展开全文|收起|\n', '', content)
            else:
                content = re.sub(' |\n', '', content)
            item['title'] = title  # store the title in the item
            item['author'] = author  # store the author in the item
            item['content'] = content  # store the content in the item
            yield item  # hand the item to the pipeline
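
The deduplication in start_requests() relies on Redis set semantics: SADD returns 1 when the member is new and 0 when it is already in the set. A standalone sketch against the same Redis instance:

from redis import Redis

conn = Redis(host='192.168.0.113', port=6379)
print(conn.sadd('poetry_url', 'https://www.shicimingju.com/paiming/?p=1'))  # 1 the first time
print(conn.sadd('poetry_url', 'https://www.shicimingju.com/paiming/?p=1'))  # 0 on every repeat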

# Import the CrawlerProcess class
from scrapy.crawler import CrawlerProcess
# Import the helper that loads the project settings
from scrapy.utils.project import get_project_settings

# Program entry point
if __name__ == '__main__':
    # Create a CrawlerProcess object with the project settings
    process = CrawlerProcess(get_project_settings())
    # Name of the spider to start
    process.crawl('poetrySpider')
    # Start crawling
    process.start()
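
Save this entry point as a separate script in the project root (the file name, e.g. run.py, is not fixed by Scrapy) and run it with the Python interpreter; equivalently, the spider can be started from the project directory with scrapy crawl poetrySpider.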

5. Write the settings.py file

Configure the whole distributed crawler project in this file.

BOT_NAME = 'poetry'

SPIDER_MODULES = ['poetry.spiders']
NEWSPIDER_MODULE = 'poetry.spiders'
RANDOM_UA_TYPE = 'random'

SQL_HOST = '192.168.0.113'
SQL_USER = 'root'
SQL_PASSWORD = 'root'
SQL_DATABASE = 'news_data'
SQL_PORT = 3306

DOWNLOADER_MIDDLEWARES = {
   # 'poetry.middlewares.PoetryDownloaderMiddleware': 543,
   'poetry.middlewares.RandomHeaderMiddleware': 300,
}
ITEM_PIPELINES = {
   'poetry.pipelines.PoetryPipeline': 300,
}
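
Once a crawl has finished, a quick way to confirm that rows were written (same connection parameters as above):

import pymysql

db = pymysql.connect(host='192.168.0.113', user='root', password='root',
                     database='news_data', port=3306)
with db.cursor() as cursor:
    cursor.execute('SELECT COUNT(*) FROM poetry')
    print(cursor.fetchone()[0])  # number of poems stored so far
db.close()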