Target URL: http://www.shuoshuokong.com/tupian/
1. Write the spider file
import scrapy
from shuoshuo.items import ShuoshuoItem

class ShuoshuokongSpider(scrapy.Spider):
    name = 'shuoshuokong1'
    allowed_domains = ['www.shuoshuokong.com']
    start_urls = ['http://www.shuoshuokong.com/']

    def start_requests(self):
        # Pages 1-100: the first page is tupian/index.html,
        # later pages follow the tupian/index_{n}.html pattern
        for i in range(1, 101):
            if i == 1:
                url = self.start_urls[0] + "tupian/index.html"
            else:
                url = self.start_urls[0] + "tupian/index_{page}.html".format(page=i)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Each <li> under the list box holds one article entry
        li_list = response.xpath('//ul[@class="g-list-box"]/li')
        for li in li_list:
            item = ShuoshuoItem()
            item['name'] = li.css('a::attr(title)').get()
            item['zhaiyao'] = li.css('p::text').get()
            item['lover_nums'] = li.css('span em:nth-child(1)::text').get()
            item['create_time'] = li.css('span em:nth-child(2)::text').get()
            link = li.css('a::attr(href)').get()
            detail_url = 'http://www.shuoshuokong.com' + link
            # Pass the partially filled item to the detail page via meta
            yield scrapy.Request(url=detail_url, callback=self.get_detail, meta={'item': item})

    # Fetch the article body from the detail page
    def get_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//div[@class="g-detail-box"]/div[@class="g-detail-font"]/p/text()').getall()
        item['content'] = ' '.join(content)
        yield item
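Once the spider is saved, it can be launched with scrapy crawl shuoshuokong1 from the project root, or from a small script. A minimal sketch (run.py is a hypothetical name; it must sit next to scrapy.cfg so the project settings are picked up):

# run.py - a minimal sketch for launching the spider from a script,
# equivalent to `scrapy crawl shuoshuokong1` on the command line
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('shuoshuokong1')
process.start()   # blocks until the crawl finishes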
2. Write the items.py file
import scrapy

class ShuoshuoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # article title
    zhaiyao = scrapy.Field()       # summary
    lover_nums = scrapy.Field()    # like count
    create_time = scrapy.Field()   # publish time
    content = scrapy.Field()       # article body
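Items behave like dictionaries, which is exactly what the pipeline relies on when it calls dict(item). A quick illustrative check in a Python shell:

# illustrative: ShuoshuoItem supports dict-style access and conversion
from shuoshuo.items import ShuoshuoItem

item = ShuoshuoItem()
item['name'] = 'demo title'
item['zhaiyao'] = 'demo summary'
print(dict(item))   # {'name': 'demo title', 'zhaiyao': 'demo summary'}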
3. Write the downloader middleware
from fake_useragent import UserAgent

class RandomHeaderMiddleware(object):
    def __init__(self, crawler):
        self.ua = UserAgent()
        # Fall back to a Chrome User-Agent if RANDOM_UA_TYPE is not configured
        self.type = crawler.settings.get('RANDOM_UA_TYPE', 'chrome')

    @classmethod
    def from_crawler(cls, crawler):
        # Return a cls() instance built from the crawler
        return cls(crawler)

    # Called for every outgoing request
    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', getattr(self.ua, self.type))
Note:
1. This downloader middleware must be enabled in settings.py, and the RANDOM_UA_TYPE parameter must be configured:
RANDOM_UA_TYPE = "random"
DOWNLOADER_MIDDLEWARES = {
    # 'shuoshuo.middlewares.ShuoshuoDownloaderMiddleware': 543,
    'shuoshuo.middlewares.RandomHeaderMiddleware': 300,
}
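The middleware resolves RANDOM_UA_TYPE with getattr, so any attribute that fake_useragent's UserAgent exposes works ('chrome', 'firefox', 'random', ...). A quick standalone check (assumes pip install fake-useragent):

# illustrative check of how the middleware picks a User-Agent
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)              # a random Chrome User-Agent string
print(getattr(ua, 'random'))  # what the middleware sends when RANDOM_UA_TYPE = "random"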
4. Write the pipeline file: pipelines.py
import pymysql

class ShuoshuoPipeline:
    # Store the database connection parameters
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    # Build the pipeline from the crawler settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT'),
        )

    # Called when the spider is opened
    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  port=self.port, database=self.database)
        self.cursor = self.db.cursor()

    # Called when the spider is closed
    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        sql = 'insert into article (name, zhaiyao, lover_nums, create_time, content) values (%s, %s, %s, %s, %s)'
        # Each item is a single row, so execute() is sufficient
        self.cursor.execute(sql, (data['name'], data['zhaiyao'], data['lover_nums'],
                                  data['create_time'], data['content']))
        self.db.commit()
        return item
Notes:
1. Configure the database settings in settings.py and enable the pipeline:
SQL_HOST = '127.0.0.1'      # database host
SQL_USER = 'root'           # username
SQL_PASSWORD = 'root'       # password
SQL_DATABASE = 'article'    # database name
SQL_PORT = 3306             # port
ITEM_PIPELINES = {
    'shuoshuo.pipelines.ShuoshuoPipeline': 300,
}
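The pipeline assumes an article table already exists in the article database. A possible schema (an assumption; adjust column types and lengths to the actual data):

# create_table.py - a sketch that creates the assumed `article` table
import pymysql

db = pymysql.connect(host='127.0.0.1', user='root', password='root',
                     port=3306, database='article')
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS article (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        zhaiyao TEXT,
        lover_nums VARCHAR(32),
        create_time VARCHAR(32),
        content TEXT
    ) DEFAULT CHARSET=utf8mb4
''')
db.close()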