# jd/items.py
import scrapy


class JdItem(scrapy.Item):
    # One field per key of a book entry returned by the ranking API
    bookName = scrapy.Field()
    sellPrice = scrapy.Field()
    authors = scrapy.Field()
    coverUrl = scrapy.Field()
    bookId = scrapy.Field()
    definePrice = scrapy.Field()
    publisher = scrapy.Field()
    discount = scrapy.Field()
# Spider module, e.g. jd/spiders/jd_shop.py
import json

import scrapy

from jd.items import JdItem


class JdShopSpider(scrapy.Spider):
    name = 'jd_shop'
    # allowed_domains = ['jd.com']
    # start_urls = ['http://jd.com/']

    def start_requests(self):
        # JSONP endpoint of the JD book ranking; the body parameter requests page 4 with 20 books per page
        url = 'https://gw-e.jd.com/client.action?callback=func&body=%7B%22moduleType%22%3A1%2C%22page%22%3A4%2C%22pageSize%22%3A20%2C%22scopeType%22%3A1%7D&functionId=bookRank&client=e.jd.com&_=1650724495119'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The response is wrapped as func(...); peel the JSONP wrapper off before parsing.
        # Note: str.lstrip/rstrip remove characters, not prefixes, so slice the prefix off instead.
        data = response.text[len('func('):].rstrip(')')
        all_books = json.loads(data)['data']['books']
        for book in all_books:
            item = JdItem()
            item['bookName'] = book['bookName']
            item['sellPrice'] = book['sellPrice']
            item['authors'] = book['authors']
            item['coverUrl'] = book['coverUrl']
            item['bookId'] = book['bookId']
            item['definePrice'] = book['definePrice']
            item['publisher'] = book['publisher']
            item['discount'] = book['discount']
            yield item
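The endpoint answers with JSONP, i.e. the JSON payload wrapped in a func(...) call, which is why parse() has to strip the wrapper before json.loads(). A minimal sketch of that step, using a made-up response string:

import json

# Made-up JSONP response, shaped like the ranking endpoint's func(...) wrapper
text = 'func({"data": {"books": [{"bookName": "Example", "sellPrice": "9.90"}]}})'

# Slice the wrapper off; text.lstrip('func(') would be fragile because lstrip
# removes any of the characters {f, u, n, c, (}, not the literal prefix.
payload = text[len('func('):].rstrip(')')
books = json.loads(payload)['data']['books']
print(books[0]['bookName'])  # -> Example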
# Run script placed at the project root (next to scrapy.cfg)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # Load the project settings (pipeline and SQL_* options) and start the spider by its name
    process = CrawlerProcess(get_project_settings())
    process.crawl('jd_shop')
    process.start()
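Because get_project_settings() loads the same jd project configuration, running this script with Python is equivalent to running `scrapy crawl jd_shop` from the project root on the command line.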
# jd/pipelines.py
import json

import pymysql


class JdPipeline(object):
    # Store the database connection parameters
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    # Read the connection parameters from the project settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT'),
        )

    # Called when the spider is opened
    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, port=self.port, autocommit=True)
        self.cursor = self.db.cursor()  # create a cursor

    # Called when the spider is closed
    def close_spider(self, spider):
        self.db.close()

    # Convert the item to a dict, serialize the authors list to a JSON string,
    # insert one row into the ranking table, commit, and return the item
    def process_item(self, item, spider):
        data = dict(item)
        authors = json.dumps(data['authors'])
        query = """insert into ranking
                   (bookName, sellPrice, authors, coverUrl, bookId, definePrice, publisher, discount)
                   values (%s, %s, %s, %s, %s, %s, %s, %s)"""
        values = (str(data['bookName']), str(data['sellPrice']), authors, str(data['coverUrl']),
                  str(data['bookId']), str(data['definePrice']), str(data['publisher']), str(data['discount']))
        self.cursor.execute(query, values)
        self.db.commit()
        return item
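The authors value arrives from the API as a list, so the pipeline serializes it with json.dumps() before writing it into a single column; reading it back is just the reverse step. A small round-trip sketch with made-up author names:

import json

# Hypothetical sample values, only to illustrate how the authors column is stored
authors = ['Author A', 'Author B']
stored = json.dumps(authors)           # what the pipeline writes to the authors column
print(stored)                          # -> ["Author A", "Author B"]
print(json.loads(stored) == authors)   # -> True when the column is read back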
# jd/settings.py (relevant additions)
SQL_HOST = 'localhost'
SQL_USER = 'root'
SQL_PASSWORD = 'root'
SQL_DATABASE = 'mysoft'
SQL_PORT = 3306
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}

Table structure: (screenshot of the ranking table, not reproduced here)
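Since the screenshot is not available, the following is only a plausible reconstruction of the ranking table: the column names come from the pipeline's insert statement, while the id column and the VARCHAR types and lengths are assumptions. The connection parameters mirror the SQL_* settings above.

import pymysql

# Assumed schema for the ranking table; adjust types/lengths to match the real table
CREATE_RANKING = """
create table if not exists ranking (
    id int primary key auto_increment,
    bookName varchar(255),
    sellPrice varchar(64),
    authors varchar(255),
    coverUrl varchar(512),
    bookId varchar(64),
    definePrice varchar(64),
    publisher varchar(255),
    discount varchar(64)
) default charset=utf8mb4
"""

db = pymysql.connect(host='localhost', user='root', password='root',
                     database='mysoft', port=3306)
with db.cursor() as cursor:
    cursor.execute(CREATE_RANKING)
db.commit()
db.close()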