Target URL: http://www.shuoshuokong.com/tupian/
    1、Write the Scrapy spider file

    import scrapy
    from shuoshuo.items import ShuoshuoItem

    class ShuoshuokongSpider(scrapy.Spider):
        name = 'shuoshuokong1'
        allowed_domains = ['www.shuoshuokong.com']
        start_urls = ['http://www.shuoshuokong.com/']

        def start_requests(self):
            # Page 1 of the list has no suffix; pages 2-100 follow the index_{page}.html pattern
            for i in range(1, 101):
                if i == 1:
                    url = self.start_urls[0] + "tupian/index.html"
                else:
                    url = self.start_urls[0] + "tupian/index_{page}.html".format(page=i)
                yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            # Each <li> under the list container is one article entry
            all_ul = response.xpath('//ul[@class="g-list-box"]/li')
            for li in all_ul:
                item = ShuoshuoItem()
                name = li.css('a::attr(title)').get()
                link = li.css('a::attr(href)').get()
                zhaiyao = li.css('p::text').get()
                lover_nums = li.css('span em:nth-child(1)::text').get()
                create_time = li.css('span em:nth-child(2)::text').get()
                item['name'] = name
                item['zhaiyao'] = zhaiyao
                item['lover_nums'] = lover_nums
                item['create_time'] = create_time
                detail_url = 'http://www.shuoshuokong.com' + link
                # Pass the partially filled item to the detail page via meta
                yield scrapy.Request(url=detail_url, callback=self.get_detail, meta={'item': item})

        # Fetch the article body from the detail page
        def get_detail(self, response):
            item = response.meta['item']
            content = response.xpath('//div[@class="g-detail-box"]/div[@class="g-detail-font"]/p/text()').getall()
            content = ' '.join(content)
            item['content'] = content
            yield item
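
    Once the item, middleware, and pipeline from the following steps are in place, the crawl is started from the project root with the standard Scrapy command, using the spider name defined above:

        scrapy crawl shuoshuokong1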

    2、Write the items.py file

    import scrapy

    class ShuoshuoItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        name = scrapy.Field()
        zhaiyao = scrapy.Field()
        lover_nums = scrapy.Field()
        create_time = scrapy.Field()
        content = scrapy.Field()

    3、Write the downloader middleware

    from fake_useragent import UserAgent  # assumed to come from the third-party fake-useragent package

    class RandomHeaderMiddleware(object):
        def __init__(self, crawler):
            self.ua = UserAgent()
            # Fall back to a Chrome User-Agent if RANDOM_UA_TYPE is not set in the settings
            self.type = crawler.settings.get('RANDOM_UA_TYPE', 'chrome')

        @classmethod
        def from_crawler(cls, crawler):
            # Return a cls() instance built from the crawler
            return cls(crawler)

        # Called for every outgoing request
        def process_request(self, request, spider):
            request.headers.setdefault('User-Agent', getattr(self.ua, self.type))

    Note:
    1、This downloader middleware must be enabled in settings.py, and the RANDOM_UA_TYPE parameter must be configured there:

        RANDOM_UA_TYPE = "random"
        DOWNLOADER_MIDDLEWARES = {
            # 'shuoshuo.middlewares.ShuoshuoDownloaderMiddleware': 543,
            'shuoshuo.middlewares.RandomHeaderMiddleware': 300,
        }
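
    The source never names the package that provides UserAgent; the sketch below assumes it is the third-party fake-useragent package (pip install fake-useragent), which matches the getattr(self.ua, self.type) usage above.

        from fake_useragent import UserAgent

        ua = UserAgent()
        # With RANDOM_UA_TYPE = "random" the middleware reads ua.random;
        # with the 'chrome' fallback it reads ua.chrome.
        print(ua.chrome)   # a Chrome User-Agent string
        print(ua.random)   # a User-Agent string chosen at random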

    4、Write the pipeline file: pipelines.py

    import pymysql

    class ShuoshuoPipeline:
        # Store the database connection parameters
        def __init__(self, host, database, user, password, port):
            self.host = host
            self.database = database
            self.user = user
            self.password = password
            self.port = port

        # Build the pipeline from the crawler settings
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                host=crawler.settings.get('SQL_HOST'),
                user=crawler.settings.get('SQL_USER'),
                password=crawler.settings.get('SQL_PASSWORD'),
                database=crawler.settings.get('SQL_DATABASE'),
                port=crawler.settings.get('SQL_PORT'),
            )

        # Called when the spider opens: connect to MySQL
        def open_spider(self, spider):
            self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                      port=self.port, database=self.database)
            self.cursor = self.db.cursor()

        # Called when the spider closes: release the connection
        def close_spider(self, spider):
            self.db.close()

        def process_item(self, item, spider):
            data = dict(item)
            sql = 'insert into article (name,zhaiyao,lover_nums,create_time,content) values (%s,%s,%s,%s,%s)'
            # executemany takes a list of parameter tuples; here the list holds just the current item
            self.cursor.executemany(sql, [(data['name'], data['zhaiyao'], data['lover_nums'],
                                           data['create_time'], data['content'])])
            self.db.commit()
            return item
    Notes:
    1、Configure the database connection in settings.py and enable the pipeline:

        SQL_HOST = '127.0.0.1'       # database host
        SQL_USER = 'root'            # user name
        SQL_PASSWORD = 'root'        # password
        SQL_DATABASE = 'article'     # database name
        SQL_PORT = 3306              # port
        ITEM_PIPELINES = {
            'shuoshuo.pipelines.ShuoshuoPipeline': 300,
        }
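
    The INSERT above assumes an article table already exists in the article database. The source does not show its schema; the sketch below creates a plausible matching table with pymysql, with all column types being assumptions:

        import pymysql

        # Connection values mirror the SQL_* settings above; column types are assumptions.
        db = pymysql.connect(host='127.0.0.1', user='root', password='root',
                             port=3306, database='article')
        with db.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS article (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    name VARCHAR(255),
                    zhaiyao VARCHAR(1000),
                    lover_nums VARCHAR(50),
                    create_time VARCHAR(50),
                    content TEXT
                ) DEFAULT CHARSET = utf8mb4
            """)
        db.commit()
        db.close()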