1. Create the data table
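Below is a minimal sketch, assuming MySQL, of a `news` table that matches the INSERT statement used by the pipeline later in this post (column types and sizes are assumptions):

```
import pymysql

# Sketch: create the `news` table the pipeline writes to
# (connection values mirror the SQL_* settings shown later; column sizes are assumptions)
db = pymysql.connect(host='192.168.0.113', user='root', password='root',
                     database='news_data', port=3306, charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS news (
            id       INT AUTO_INCREMENT PRIMARY KEY,
            title    VARCHAR(255),
            synopsis VARCHAR(1000),
            url      VARCHAR(500),
            time     VARCHAR(50)
        ) DEFAULT CHARSET = utf8mb4
    """)
db.commit()
db.close()
```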
2. Create the Scrapy project
```
scrapy startproject distributed
cd distributed
scrapy genspider distributedSpider china.chinadaily.com.cn
```
1. Write the random request-header middleware (middlewares.py)
```
from fake_useragent import UserAgent  # Import the user-agent generator


# Custom downloader middleware that sets a random request header
class RandomHeaderMiddleware(object):
    def __init__(self, crawler):
        self.ua = UserAgent()  # Random user-agent object
        # Fall back to a Chrome user agent if the setting is missing
        self.type = crawler.settings.get("RANDOM_UA_TYPE", "chrome")

    @classmethod
    def from_crawler(cls, crawler):
        # Return an instance of the middleware
        return cls(crawler)

    # Called for every outgoing request
    def process_request(self, request, spider):
        # Set the randomly generated request header
        request.headers.setdefault('User-Agent', getattr(self.ua, self.type))
```
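For reference, the values that `getattr(self.ua, self.type)` returns can be inspected by using fake_useragent directly (the output differs on every call):

```
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)   # a random Chrome User-Agent string
print(ua.firefox)  # a random Firefox User-Agent string
print(ua.random)   # a random User-Agent from any browser, matching RANDOM_UA_TYPE = "random"
```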
2. Write the items file
```
import scrapy


class DistributedItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    news_title = scrapy.Field()     # News title
    news_synopsis = scrapy.Field()  # News synopsis
    news_url = scrapy.Field()       # Detail-page URL
    news_time = scrapy.Field()      # Publication time
```
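An Item behaves like a dict restricted to its declared fields, which is what lets the pipeline below call `dict(item)`; a small illustrative snippet (not part of the project files):

```
from distributed.items import DistributedItem

item = DistributedItem()
item['news_title'] = 'Example headline'  # only declared fields may be assigned
print(dict(item))                        # {'news_title': 'Example headline'}
```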
3. Write the pipelines file

```
import pymysql


class DistributedPipeline:
    # Store the database connection parameters
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT'),
        )

    # Called when the spider is opened
    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  port=self.port, charset='utf8mb4')
        self.cursor = self.db.cursor()

    # Called when the spider is closed
    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        sql = 'insert into news (title,synopsis,url,time) values (%s,%s,%s,%s)'
        # Insert one row per item and commit the transaction
        self.cursor.execute(sql, (data['news_title'], data['news_synopsis'],
                                  data['news_url'], data['news_time']))
        self.db.commit()
        return item
```
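Once a crawl has run, a quick way to confirm that items actually reached MySQL is to query the table directly; a sketch reusing the same connection values as the SQL_* settings below:

```
import pymysql

db = pymysql.connect(host='192.168.0.113', user='root', password='root',
                     database='news_data', port=3306, charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute('SELECT COUNT(*) FROM news')
    print('rows stored:', cursor.fetchone()[0])
db.close()
```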
4. Write the spider file
```
import scrapy

from distributed.items import DistributedItem  # Import the Item class


class DistributedspiderSpider(scrapy.Spider):
    name = 'distributedSpider'
    allowed_domains = ['china.chinadaily.com.cn']
    start_urls = ['http://china.chinadaily.com.cn/']

    # Send the page requests
    def start_requests(self):
        for i in range(1, 101):  # The news section has 100 pages, so loop 100 times
            # Build the request URL for each page
            url = self.start_urls[0] + '5bd5639ca3101a87ca8ff636/page_{page}.html'.format(page=i)
            # Issue the request
            yield scrapy.Request(url=url, callback=self.parse)

    # Handle the response
    def parse(self, response):
        news_list = response.css('.busBox3')  # All news entries on the current page
        for news in news_list:                # Iterate over every entry on the page
            item = DistributedItem()          # Create a fresh item for each entry
            title = news.css('h3 a::text').get()                # News title
            synopsis = news.css('p::text').get()                # News synopsis
            url = 'http:' + news.css('h3 a::attr(href)').get()  # Detail-page URL (links are protocol-relative)
            time_ = news.css('p b::text').get()                 # Publication time
            item['news_title'] = title        # Add the title to the item
            item['news_synopsis'] = synopsis  # Add the synopsis to the item
            item['news_url'] = url            # Add the detail-page URL to the item
            item['news_time'] = time_         # Add the publication time to the item
            yield item                        # Hand the item to the pipelines
```
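The CSS selectors above depend on the current page layout; they can be checked interactively before running the full crawl, for example inside `scrapy shell` (a sketch, assuming the markup is unchanged):

```
# Started with:
#   scrapy shell 'http://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff636/page_1.html'
first = response.css('.busBox3')[0]
print(first.css('h3 a::text').get())        # title of the first entry
print(first.css('h3 a::attr(href)').get())  # protocol-relative link, hence the 'http:' prefix
print(first.css('p b::text').get())         # publication time
```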
The spider can then be launched programmatically:

```
# Import the CrawlerProcess class
from scrapy.crawler import CrawlerProcess
# Import the helper that loads the project settings
from scrapy.utils.project import get_project_settings

# Program entry point
if __name__ == '__main__':
    # Create a CrawlerProcess object and pass it the project settings
    process = CrawlerProcess(get_project_settings())
    # Name the spider to start
    process.crawl('distributedSpider')
    # Start the crawl
    process.start()
```
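Because the request queue and duplicate filter live in redis (see the settings in the next step), the same project can be started on several machines, or several times on one machine, and the workers will share the crawl; a sketch of launching three local workers for testing:

```
import subprocess

# Start three identical workers (sketch); each pulls requests from the shared
# redis queue, and the shared dupefilter prevents the same URL from being fetched twice.
workers = [subprocess.Popen(['scrapy', 'crawl', 'distributedSpider']) for _ in range(3)]
for w in workers:
    w.wait()
```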
5. Write the settings file
```
# Use the scrapy_redis scheduler to store the request queue in redis
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Make sure every spider shares the same duplicate filter through redis
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Do not clear the redis queues, so the crawl can be paused and resumed
SCHEDULER_PERSIST = True
# Schedule requests with the default priority queue
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
REDIS_URL = 'redis://192.168.0.113:6379'  # Local IP of the redis host

DOWNLOADER_MIDDLEWARES = {
    # Enable the custom random request-header middleware
    'distributed.middlewares.RandomHeaderMiddleware': 200,
    # 'distributed.middlewares.DistributedDownloaderMiddleware': 543,
}

# Pick the request-header type at random; 'ie', 'firefox' and 'chrome' are also valid values
RANDOM_UA_TYPE = "random"

ITEM_PIPELINES = {
    'distributed.pipelines.DistributedPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Database connection settings
SQL_HOST = '192.168.0.113'   # Database host (local IP)
SQL_USER = 'root'            # User name
SQL_PASSWORD = 'root'        # Password
SQL_DATABASE = 'news_data'   # Database name
SQL_PORT = 3306              # Port
```
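While the crawl is running, the scheduler's state can be inspected in redis; a sketch using redis-py, assuming the default scrapy_redis key names (`<spider>:requests`, `<spider>:dupefilter`, `<spider>:items`):

```
import redis

r = redis.from_url('redis://192.168.0.113:6379')
print(r.zcard('distributedSpider:requests'))    # pending requests in the priority queue (sorted set)
print(r.scard('distributedSpider:dupefilter'))  # fingerprints of already-seen requests
print(r.llen('distributedSpider:items'))        # items pushed by scrapy_redis's RedisPipeline
```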
