1、创建数据表
2、创建Scrapy项目
scrapy startproject distribute
cd distribute
scrapy genspider distributedSpider china.chinadaily.com.cn
1、创建随机请求头
from fake_useragent import UserAgent # 导入请求头类
# 自定义随机请求头的中间件
class RandomHeaderMiddleware(object):
def __init__(self, crawler):
self.ua = UserAgent() # 随机请求头对象
# 如果配置文件中不存在就使用默认的Google Chrome请求头
self.type = crawler.settings.get("RANDOM_UA_TYPE", "chrome")
@classmethod
def from_crawler(cls, crawler):
# 返回cls()实例对象
return cls(crawler)
# 发送网络请求时调用该方法
def process_request(self, request, spider):
# 设置随机生成的请求头
request.headers.setdefault('User-Agent', getattr(self.ua, self.type))
2、编写items文件
class DistributeItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
news_title = scrapy.Field()
news_synopsis = scrapy.Field()
news_url = scrapy.Field()
news_time = scrapy.Field()
3、编写pipelines文件

import pymysql
class DistributePipeline:
# 初始化数据库参数
def __init__(self,host,database,user,password,port):
self.host = host
self.database = database
self.user = user
self.password = password
self.port = port
@classmethod
def from_crawler(cls,crawler):
return cls(
host = crawler.settings.get('SQL_HOST'),
user = crawler.settings.get('SQL_USER'),
password = crawler.settings.get('SQL_PASSWORD'),
database = crawler.settings.get('SQL_DATABASE'),
port = crawler.settings.get('SQL_PORT'),
)
# 打开爬虫调用
def open_spider(self,spider):
self.db = pymysql.connect(host=self.host,user=self.user,password=self.password,database=self.database,port=self.port,charset='utf-8')
self.cursor = self.db.cursor()
# 关闭爬虫时嗲用
def close_spider(self,spider):
self.db.close()
def process_item(self, item, spider):
data = dict(item)
sql = 'insert into news (title,synopsis,url,time) values (%s,%s,%s,%s)'
# 插入多条数据
self.cursor.executemany(sql,[(data['news_title'],data['news_synopsis'],data['news_url'],data['news_time'])])
4、编写spider文件
from distributed.items import DistributedItem # 导入Item对象 class DistributedspiderSpider(scrapy.Spider): name = ‘distributedSpider’ allowed_domains = [‘china.chinadaily.com.cn’] start_urls = [‘http://china.chinadaily.com.cn/‘]
# 发送网络请求
def start_requests(self):
for i in range(1,101): # 由于新闻网页共计100页,所以循环执行100次
# 拼接请求地址
url = self.start_urls[0] + '5bd5639ca3101a87ca8ff636/page_{page}.html'.format(page=i)
# 执行请求
yield scrapy.Request(url=url,callback=self.parse)
# 处理请求结果
def parse(self, response):
item = DistributedItem() # 创建item对象
all = response.css('.busBox3') # 获取每页所有新闻内容
for i in all: # 循环遍历每页中每条新闻
title = i.css('h3 a::text').get() # 获取每条新闻标题
synopsis = i.css('p::text').get() # 获取每条新闻简介
url = 'http:'+i.css('h3 a::attr(href)').get() # 获取每条新闻详情页地址
time_ = i.css('p b::text').get() # 获取新闻发布时间
item['news_title'] = title # 将新闻标题添加至item
item['news_synopsis'] = synopsis # 将新闻简介内容添加至item
item['news_url'] = url # 将新闻详情页地址添加至item
item['news_time'] = time_ # 将新闻发布时间添加至item
yield item # 打印item信息
pass
导入CrawlerProcess类
from scrapy.crawler import CrawlerProcess
导入获取项目配置信息
from scrapy.utils.project import get_project_settings
程序入口
if name==’main‘:
# 创建CrawlerProcess类对象并传入项目设置信息参数
process = CrawlerProcess(get_project_settings())
# 设置需要启动的爬虫名称
process.crawl('distributedSpider')
# 启动爬虫
process.start()
5、编写配置文件
启用redis调度存储请求队列
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
确保所有爬虫通过redis共享相同的重复筛选器。
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
不清理redis队列,允许暂停/恢复爬虫
SCHEDULER_PERSIST = True
使用默认的优先级队列调度请求
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

REDIS_URL = 'redis://192.168.0.113:6379'  # 本地ip

DOWNLOADER_MIDDLEWARES = {
# 启动自定义随机请求头中间件
'distributed.middlewares.RandomHeaderMiddleware': 200,
# 'distributed.middlewares.DistributedDownloaderMiddleware': 543,
}
配置请求头类型为随机,此处还可以设置为ie、firefox以及chrome
RANDOM_UA_TYPE = "random"

ITEM_PIPELINES = {
    'distributed.pipelines.DistributedPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
配置数据库连接信息
SQL_HOST = '192.168.0.113'  # 数据库地址(本地ip)
SQL_USER = 'root'           # 用户名
SQL_PASSWORD = 'root'       # 密码
SQL_DATABASE = 'news_data'  # 数据库名称
SQL_PORT = 3306             # 端口