网址:https://www.shicimingju.com/paiming
1、分析请求网页
第1页:https://www.shicimingju.com/paiming
第2页:https://www.shicimingju.com/paiming?p=2
第3页:https://www.shicimingju.com/paiming?p=3
2、新建项目
scrapy startproject poetry
cd poetry
scrapy genspider proetrySpider www.shicimingju.com
3、程序
1、创建随机请求头的中间件
from fake_useragent import UserAgent
class RandomHeaderMiddleware(object):
    """Downloader middleware that attaches a randomized User-Agent header
    to every outgoing request."""

    def __init__(self, crawler):
        # UA generator plus the flavour configured in settings
        # (RANDOM_UA_TYPE, defaulting to 'chrome').
        self.ua = UserAgent()
        self.type = crawler.settings.get('RANDOM_UA_TYPE', 'chrome')

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy hook: build the middleware from the running crawler."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Called for each request before it is sent downstream.

        setdefault only fills the header when none is present, so an
        explicitly set User-Agent is never overwritten.
        """
        request.headers.setdefault('User-Agent', getattr(self.ua, self.type))
说明:
在settings.py中开启中间件,并设置RANDOM_UA_TYPE的值
RANDOM_UA_TYPE ='random'
DOWNLOADER_MIDDLEWARES = {
# 'poetry.middlewares.PoetryDownloaderMiddleware': 543,
'poetry.middlewares.RandomHeaderMiddleware': 300,
}
2、编写items.py文件
在该文件中创建用于保存诗词标题,作者以及内容的Item对象
class PoetryItem(scrapy.Item):
    """Container for one scraped poem: title, author and body text."""

    title = scrapy.Field()    # poem title
    author = scrapy.Field()   # poet name (rank text stripped by the spider)
    content = scrapy.Field()  # full poem text
3、编写管道文件pipelines.py
创建项目管道中mysql数据库的操作,并重写process_item()方法
import pymysql
class PoetryPipeline:
    """Item pipeline that persists each scraped poem into a MySQL table.

    Connection parameters come from the SQL_* keys in settings.py via
    from_crawler(); the connection itself is opened/closed with the spider.
    """

    def __init__(self, host, user, password, database, port):
        # Store parameters only — the connection is opened lazily in
        # open_spider so constructing the pipeline has no side effects.
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy hook: read the database parameters from project settings."""
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT'),
        )

    def open_spider(self, spider):
        # One connection and cursor for the lifetime of the crawl.
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database, port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        # BUGFIX: close the cursor as well as the connection so both
        # resources are released when the spider finishes.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        """Insert one poem row and hand the item on to later pipelines."""
        data = dict(item)  # Item -> plain dict for key access
        sql = 'insert into poetry (title,author,content) values(%s,%s,%s)'
        # BUGFIX: a single row is inserted per item, so execute() is the
        # right call — the original wrapped one tuple in executemany().
        self.cursor.execute(sql, (data['title'], data['author'], data['content']))
        self.db.commit()
        return item
4、编写spider文件
import scrapy
from redis import Redis
from poetry.items import PoetryItem
import re
class ProetryspiderSpider(scrapy.Spider):
    """Spider that walks the 100 poem-ranking pages of shicimingju.com."""

    name = 'proetrySpider'
    allowed_domains = ['www.shicimingju.com']
    start_urls = ['https://www.shicimingju.com/paiming/']

    def start_requests(self):
        # A shared Redis set de-duplicates page URLs across crawl runs.
        conn = Redis(host='192.168.0.113', port=6379)
        for i in range(1, 101):
            url = self.start_urls[0] + "?p={page}".format(page=i)
            # sadd returns 1 when the member is new, 0 when it already exists.
            add = conn.sadd('poetry_url', url)
            if add == 1:
                yield scrapy.Request(url=url, callback=self.parse)
            else:
                print(f'第{i}页请求地址已存在无需请求')

    def parse(self, response):
        # One .card.shici_card element per poem on the page.
        shici_all = response.css('.card.shici_card')
        for shici in shici_all:
            # BUGFIX: create a fresh item per poem. The original built a
            # single item before the loop and mutated/re-yielded it, so any
            # consumer holding references saw N copies of the last poem.
            item = PoetryItem()
            title = shici.css('h3 a::text').get()
            # string() flattens the info <div> (rank number + author) to text.
            author = shici.xpath('./div[@class= "list_num_info"]') \
                .xpath('string()').get()
            author = author.strip()
            content = shici.css('.shici_content').xpath('string()').getall()[0]
            if '展开全文' in content:
                # Long poems carry expand/collapse widgets — strip their labels.
                content = re.sub(' |展开全文|收起|\n', '', content)
            else:
                content = re.sub(' |\n', '', content)
            item['title'] = title
            item['author'] = author
            item['content'] = content
            yield item
# 导入CrawlerProcess类
from scrapy.crawler import CrawlerProcess
# 导入获取项目配置信息
from scrapy.utils.project import get_project_settings
# 程序入口
if __name__ == '__main__':
    # Load the project's settings.py and hand it to a crawler process.
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # Register the spider by its `name` attribute, then block until done.
    process.crawl('proetrySpider')
    process.start()
5、编写settings.py文件
在该文件中对整个分布式爬虫项目进行配置
# Core project identity and spider discovery.
BOT_NAME = 'poetry'
SPIDER_MODULES = ['poetry.spiders']
NEWSPIDER_MODULE = 'poetry.spiders'
# Consumed by RandomHeaderMiddleware: 'random' picks a random browser UA
# for every request.
RANDOM_UA_TYPE ='random'
# MySQL connection parameters read by PoetryPipeline.from_crawler().
SQL_HOST = '192.168.0.113'
SQL_USER = 'root'
SQL_PASSWORD = 'root'
# NOTE(review): database name 'news_data' looks carried over from another
# project — confirm it is the intended database for the poetry table.
SQL_DATABASE = 'news_data'
SQL_PORT = 3306
# Enable the custom UA middleware; the generated default stays disabled.
DOWNLOADER_MIDDLEWARES = {
# 'poetry.middlewares.PoetryDownloaderMiddleware': 543,
'poetry.middlewares.RandomHeaderMiddleware': 300,
}
# Route every scraped item through the MySQL pipeline.
ITEM_PIPELINES = {
'poetry.pipelines.PoetryPipeline': 300,
}