Target URL: http://www.shuoshuokong.com/tupian/
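The examples below assume a Scrapy project named shuoshuo (matching imports such as shuoshuo.items). If you are starting from scratch, the project and spider can be generated roughly like this (standard Scrapy commands; the names are assumptions taken from the code below):

scrapy startproject shuoshuo
cd shuoshuo
scrapy genspider shuoshuokong1 www.shuoshuokong.com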
1. Write the spider file
import scrapy
from shuoshuo.items import ShuoshuoItem


class ShuoshuokongSpider(scrapy.Spider):
    name = 'shuoshuokong1'
    allowed_domains = ['www.shuoshuokong.com']
    start_urls = ['http://www.shuoshuokong.com/']

    # Generate the 100 list pages; page 1 has a different filename than the rest
    def start_requests(self):
        for i in range(1, 101):
            if i == 1:
                url = self.start_urls[0] + "tupian/index.html"
            else:
                url = self.start_urls[0] + "tupian/index_{page}.html".format(page=i)
            yield scrapy.Request(url=url, callback=self.parse)

    # Parse one list page: extract the summary fields, then follow the link to the detail page
    def parse(self, response):
        all_ul = response.xpath('//ul[@class="g-list-box"]/li')
        for li in all_ul:
            item = ShuoshuoItem()
            name = li.css('a::attr(title)').get()
            link = li.css('a::attr(href)').get()
            zhaiyao = li.css('p::text').get()
            lover_nums = li.css('span em:nth-child(1)::text').get()
            create_time = li.css('span em:nth-child(2)::text').get()
            item['name'] = name
            item['zhaiyao'] = zhaiyao
            item['lover_nums'] = lover_nums
            item['create_time'] = create_time
            detail_url = 'http://www.shuoshuokong.com' + link
            yield scrapy.Request(url=detail_url, callback=self.get_detail, meta={'item': item})

    # Fetch the article body from the detail page and complete the item
    def get_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//div[@class="g-detail-box"]/div[@class="g-detail-font"]/p/text()').getall()
        content = ' '.join(content)
        item['content'] = content
        yield item
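The spider is run from the project root with the usual Scrapy command. The optional -o flag dumps the items to a file, which is handy for checking the selectors before wiring up the MySQL pipeline in step 4:

scrapy crawl shuoshuokong1
scrapy crawl shuoshuokong1 -o shuoshuo.json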
2. Write the items.py file
import scrapy


class ShuoshuoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    zhaiyao = scrapy.Field()
    lover_nums = scrapy.Field()
    create_time = scrapy.Field()
    content = scrapy.Field()
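For reference, a populated item behaves like a dict, which is what the pipeline in step 4 relies on via dict(item). A minimal sketch with made-up values:

from shuoshuo.items import ShuoshuoItem

# hypothetical values, only to illustrate the field names
item = ShuoshuoItem()
item['name'] = 'Example title'
item['zhaiyao'] = 'Example summary'
item['lover_nums'] = '10'
item['create_time'] = '2021-01-01'
item['content'] = 'Example body text'
print(dict(item))  # a plain dict containing the five fields above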
3. Write the downloader middleware
from fake_useragent import UserAgent


class RandomHeaderMiddleware(object):
    def __init__(self, crawler):
        self.ua = UserAgent()
        # If RANDOM_UA_TYPE is not set in the settings, fall back to a Chrome User-Agent
        self.type = crawler.settings.get('RANDOM_UA_TYPE', 'chrome')

    @classmethod
    def from_crawler(cls, crawler):
        # Return an instance of this class, passing in the crawler
        return cls(crawler)

    # Called for every outgoing request
    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', getattr(self.ua, self.type))

Note:
1. This downloader middleware must be enabled in settings.py, and the RANDOM_UA_TYPE setting must be configured:

RANDOM_UA_TYPE = "random"
DOWNLOADER_MIDDLEWARES = {
    # 'shuoshuo.middlewares.ShuoshuoDownloaderMiddleware': 543,
    'shuoshuo.middlewares.RandomHeaderMiddleware': 300,
}
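The middleware depends on the third-party fake_useragent package (pip install fake-useragent). A quick standalone check of how getattr(self.ua, self.type) resolves, assuming RANDOM_UA_TYPE is "chrome" or "random":

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)  # a Chrome User-Agent string (the default fallback above)
print(ua.random)  # a random User-Agent string; this is what RANDOM_UA_TYPE = "random" selects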
4. Write the pipeline file: pipelines.py
import pymysql


class ShuoshuoPipeline:
    # Store the database connection parameters
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    # Read the connection parameters from settings.py
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT'),
        )

    # Called when the spider opens: connect to MySQL
    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  port=self.port, database=self.database)
        self.cursor = self.db.cursor()

    # Called when the spider closes: release the connection
    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        sql = 'insert into article (name,zhaiyao,lover_nums,create_time,content) values (%s,%s,%s,%s,%s)'
        # Insert one record per scraped item
        self.cursor.execute(sql, (data['name'], data['zhaiyao'], data['lover_nums'],
                                  data['create_time'], data['content']))
        self.db.commit()
        return item

Notes:
1. Configure the database connection in settings.py and enable the pipeline:

SQL_HOST = '127.0.0.1'     # database host
SQL_USER = 'root'          # username
SQL_PASSWORD = 'root'      # password
SQL_DATABASE = 'article'   # database name
SQL_PORT = 3306            # port

ITEM_PIPELINES = {
    'shuoshuo.pipelines.ShuoshuoPipeline': 300,
}
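The pipeline assumes an article table already exists in the article database. A minimal one-off script to create it with pymysql, matching the SQL_* settings above (the column types are assumptions; adjust them to your data):

import pymysql

# connection parameters must match the SQL_* settings above
db = pymysql.connect(host='127.0.0.1', user='root', password='root', port=3306, database='article')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS article (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        zhaiyao TEXT,
        lover_nums VARCHAR(32),
        create_time VARCHAR(64),
        content TEXT
    ) DEFAULT CHARSET=utf8mb4
""")
db.commit()
db.close()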
