1. Create the database table
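The original only names this step, so here is a minimal sketch of a matching table: the columns mirror the INSERT statement used in the pipeline below (`title`, `synopsis`, `url`, `time`) and the database name `news_data` comes from the settings in step 5. The host, credentials, and column types are assumptions for a local MySQL instance.

```
# create_table.py -- minimal sketch; host/credentials/column types are assumptions
import pymysql

db = pymysql.connect(host='192.168.0.113', user='root', password='root',
                     port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS news_data DEFAULT CHARACTER SET utf8')
cursor.execute('USE news_data')
# Column names mirror the pipeline's INSERT INTO news (title,synopsis,url,time)
cursor.execute('''
    CREATE TABLE IF NOT EXISTS news (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(255),   -- news title
        synopsis TEXT,        -- news summary
        url VARCHAR(255),     -- detail-page URL
        time VARCHAR(64)      -- publication time as scraped
    ) DEFAULT CHARSET=utf8
''')
db.close()
```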
2. Create the Scrapy project

  1. scrapy startproject distributed
  2. cd distributed
  3. scrapy genspider distributedSpider china.chinadaily.com.cn

  The project is named `distributed` so that it matches the `distributed.*` import paths used in the files below.
  1. Create a random request-header middleware (middlewares.py)

    from fake_useragent import UserAgent  # User-Agent generator

    # Custom middleware that attaches a random User-Agent to every request
    class RandomHeaderMiddleware(object):
        def __init__(self, crawler):
            self.ua = UserAgent()  # random User-Agent provider
            # Fall back to a Chrome User-Agent if RANDOM_UA_TYPE is not configured
            self.type = crawler.settings.get("RANDOM_UA_TYPE", "chrome")

        @classmethod
        def from_crawler(cls, crawler):
            # Return an instance built from the crawler, giving access to the settings
            return cls(crawler)

        # Called for every outgoing request
        def process_request(self, request, spider):
            # Set the randomly generated User-Agent header
            request.headers.setdefault('User-Agent', getattr(self.ua, self.type))
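
What `getattr(self.ua, self.type)` produces can be checked outside Scrapy. A minimal sketch using `fake_useragent` directly; the attribute names are the same values `RANDOM_UA_TYPE` may take ("random", "chrome", "firefox", "ie"):

```
from fake_useragent import UserAgent

ua = UserAgent()
# The same lookups the middleware performs via getattr(self.ua, self.type)
print(ua.chrome)   # a random Chrome User-Agent string
print(ua.random)   # a random User-Agent from any browser family
```

The middleware only takes effect once it is registered in DOWNLOADER_MIDDLEWARES, which the settings in step 5 do.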

  2. Write the items file (items.py)

    import scrapy

    class DistributedItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        news_title = scrapy.Field()     # news title
        news_synopsis = scrapy.Field()  # news summary
        news_url = scrapy.Field()       # detail-page URL
        news_time = scrapy.Field()      # publication time
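
For reference, the pipeline in the next step turns every yielded item into a plain dict before building its SQL parameters. A small sketch of that round trip, assuming it is run from the project root so `distributed.items` is importable:

```
from distributed.items import DistributedItem

item = DistributedItem()
item['news_title'] = 'Example title'
item['news_synopsis'] = 'Example summary'
item['news_url'] = 'http://china.chinadaily.com.cn/example.html'
item['news_time'] = '2021-01-01 08:00'

data = dict(item)  # the same conversion process_item performs
print(data['news_title'], data['news_url'])
```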

  3. Write the pipelines file (pipelines.py)

    import pymysql

    class DistributedPipeline:
        # Store the database connection parameters
        def __init__(self, host, database, user, password, port):
            self.host = host
            self.database = database
            self.user = user
            self.password = password
            self.port = port

        @classmethod
        def from_crawler(cls, crawler):
            # Read the connection parameters from the project settings
            return cls(
                host=crawler.settings.get('SQL_HOST'),
                user=crawler.settings.get('SQL_USER'),
                password=crawler.settings.get('SQL_PASSWORD'),
                database=crawler.settings.get('SQL_DATABASE'),
                port=crawler.settings.get('SQL_PORT'),
            )

        # Called when the spider is opened
        def open_spider(self, spider):
            self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                      database=self.database, port=self.port, charset='utf8')
            self.cursor = self.db.cursor()

        # Called when the spider is closed
        def close_spider(self, spider):
            self.db.close()

        def process_item(self, item, spider):
            data = dict(item)
            sql = 'insert into news (title,synopsis,url,time) values (%s,%s,%s,%s)'
            # Insert the scraped record and commit so it is actually written
            self.cursor.execute(sql, (data['news_title'], data['news_synopsis'],
                                      data['news_url'], data['news_time']))
            self.db.commit()
            return item  # pass the item on to later pipelines (e.g. RedisPipeline)
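
To confirm that rows are reaching MySQL while the crawl runs, a quick check such as the following can help; a sketch whose connection parameters mirror the SQL_* settings in step 5 and are assumptions for your environment:

```
import pymysql

db = pymysql.connect(host='192.168.0.113', user='root', password='root',
                     database='news_data', port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute('SELECT COUNT(*), MAX(`time`) FROM news')
print(cursor.fetchone())  # (number of stored rows, latest publication time seen)
db.close()
```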
  4. Write the spider file (spiders/distributedSpider.py)

    import scrapy
    from distributed.items import DistributedItem  # import the Item class


    class DistributedspiderSpider(scrapy.Spider):
        name = 'distributedSpider'
        allowed_domains = ['china.chinadaily.com.cn']
        start_urls = ['http://china.chinadaily.com.cn/']

        # Issue the initial requests
        def start_requests(self):
            for i in range(1, 101):  # the news listing has 100 pages, so loop 100 times
                # Build the URL of each listing page
                url = self.start_urls[0] + '5bd5639ca3101a87ca8ff636/page_{page}.html'.format(page=i)
                # Schedule the request
                yield scrapy.Request(url=url, callback=self.parse)

        # Handle the response of a listing page
        def parse(self, response):
            all_news = response.css('.busBox3')  # every news block on the page
            for i in all_news:  # iterate over the blocks on this page
                item = DistributedItem()  # create an item object
                title = i.css('h3 a::text').get()                # news title
                synopsis = i.css('p::text').get()                # news summary
                url = 'http:' + i.css('h3 a::attr(href)').get()  # detail-page URL
                time_ = i.css('p b::text').get()                 # publication time
                item['news_title'] = title        # add the title to the item
                item['news_synopsis'] = synopsis  # add the summary to the item
                item['news_url'] = url            # add the detail-page URL to the item
                item['news_time'] = time_         # add the publication time to the item
                yield item  # hand the item to the pipelines
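
The CSS selectors above (`.busBox3`, `h3 a`, `p b`) depend on the current markup of the listing pages, so it is worth trying them in `scrapy shell` before launching a full crawl. A sketch of such a session, using the first page URL the spider builds:

```
# Run: scrapy shell "http://china.chinadaily.com.cn/5bd5639ca3101a87ca8ff636/page_1.html"
# Then, inside the shell:
entries = response.css('.busBox3')
print(len(entries))                          # how many news blocks the selector matches
first = entries[0]
print(first.css('h3 a::text').get())         # title
print(first.css('h3 a::attr(href)').get())   # protocol-relative detail-page link
print(first.css('p b::text').get())          # publication time
```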

To start the spider from a script rather than the command line, add a small run file in the project root:

    # Import the CrawlerProcess class
    from scrapy.crawler import CrawlerProcess
    # Import the helper that loads the project settings
    from scrapy.utils.project import get_project_settings

    # Program entry point
    if __name__ == '__main__':
        # Create a CrawlerProcess object with the project settings
        process = CrawlerProcess(get_project_settings())
        # Name the spider to start
        process.crawl('distributedSpider')
        # Start the crawl
        process.start()
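
Alternatively, the spider can be started with Scrapy's own command, run from the project directory on every machine that should join the distributed crawl:

```
scrapy crawl distributedSpider
```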
  5. Write the settings file (settings.py)

    # Use the scrapy-redis scheduler so the request queue is stored in Redis
    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
    # Make sure all spiders share the same duplicate filter through Redis
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    # Do not clear the Redis queue, so crawls can be paused and resumed
    SCHEDULER_PERSIST = True
    # Schedule requests with the default priority queue
    SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
    # Redis connection URL (the IP of the machine running the Redis server)
    REDIS_URL = 'redis://192.168.0.113:6379'

    DOWNLOADER_MIDDLEWARES = {
        # Enable the custom random request-header middleware
        'distributed.middlewares.RandomHeaderMiddleware': 200,
        # 'distributed.middlewares.DistributedDownloaderMiddleware': 543,
    }

    # Request-header type: besides "random" this can also be "ie", "firefox" or "chrome"
    RANDOM_UA_TYPE = "random"

    ITEM_PIPELINES = {
        'distributed.pipelines.DistributedPipeline': 300,  # MySQL pipeline defined above
        'scrapy_redis.pipelines.RedisPipeline': 400,       # also store items in Redis
    }

    # Database connection settings
    SQL_HOST = '192.168.0.113'  # database host (local IP)
    SQL_USER = 'root'           # user name
    SQL_PASSWORD = 'root'       # password
    SQL_DATABASE = 'news_data'  # database name
    SQL_PORT = 3306             # port
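
Once a crawl is running, the shared state that scrapy-redis keeps in Redis can be inspected to confirm that the nodes coordinate through it. A small sketch using the `redis` package, relying on the default key names scrapy-redis derives from the spider name:

```
import redis

r = redis.from_url('redis://192.168.0.113:6379')
print(r.keys('distributedSpider:*'))           # e.g. the requests queue and dupefilter keys
print(r.zcard('distributedSpider:requests'))   # pending requests in the priority queue
print(r.scard('distributedSpider:dupefilter')) # fingerprints of requests already seen
```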