class JdItem(scrapy.Item):
    """Container for one book record scraped from the JD book-ranking API.

    Field names mirror the keys of each entry in the API response's
    ``data.books`` list (see the spider's ``parse`` method, which copies
    ``book[<field>]`` verbatim).
    """
    bookName = scrapy.Field()     # book title
    sellPrice = scrapy.Field()    # current selling price
    authors = scrapy.Field()      # author info (JSON-encoded by the pipeline)
    coverUrl = scrapy.Field()     # URL of the cover image
    bookId = scrapy.Field()       # JD's identifier for the book
    definePrice = scrapy.Field()  # list/original price
    publisher = scrapy.Field()    # publishing house
    discount = scrapy.Field()     # discount relative to the list price
import scrapy
import json
from jd.items import JdItem
class JdShopSpider(scrapy.Spider):
    """Spider that fetches JD's book-ranking data from a JSONP endpoint.

    Issues a single request to the ``bookRank`` API and yields one
    :class:`JdItem` per book in the response.
    """
    name = 'jd_shop'
    # allowed_domains = ['jd.com']
    # start_urls = ['http://jd.com/']

    # Keys copied verbatim from each API book record into the item.
    _FIELDS = ('bookName', 'sellPrice', 'authors', 'coverUrl',
               'bookId', 'definePrice', 'publisher', 'discount')

    def start_requests(self):
        """Issue the single ranking-API request (callback=func JSONP URL)."""
        url = 'https://gw-e.jd.com/client.action?callback=func&body=%7B%22moduleType%22%3A1%2C%22page%22%3A4%2C%22pageSize%22%3A20%2C%22scopeType%22%3A1%7D&functionId=bookRank&client=e.jd.com&_=1650724495119'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Unwrap the JSONP response and yield one JdItem per book.

        Raises ValueError if the response contains no parenthesized payload.
        """
        # BUG FIX: the original used text.lstrip('func(') / rstrip(')').
        # str.lstrip/rstrip strip *character sets*, not prefixes/suffixes,
        # so they could also eat leading 'f'/'u'/'n'/'c'/'(' or trailing ')'
        # characters belonging to the JSON payload itself.  Slice between
        # the first '(' and the last ')' instead.
        text = response.text
        payload = text[text.index('(') + 1:text.rindex(')')]
        all_books = json.loads(payload)['data']['books']
        for book in all_books:
            item = JdItem()
            for field in self._FIELDS:
                item[field] = book[field]
            yield item
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
def _run():
    """Start a crawl of the 'jd_shop' spider using the project settings."""
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl('jd_shop')
    crawler.start()  # blocks until the crawl finishes


if __name__ == '__main__':
    _run()
from itemadapter import ItemAdapter
import pymysql
import json
class JdPipeline(object):
    """Item pipeline that inserts scraped book items into a MySQL table.

    Connection parameters come from the project settings (``SQL_*`` keys)
    via :meth:`from_crawler`; the connection lives for the duration of the
    spider (opened in ``open_spider``, released in ``close_spider``).
    """

    def __init__(self, host, database, user, password, port):
        # Only stash the connection parameters here; the actual connection
        # is created in open_spider() so construction stays side-effect free.
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor reading SQL_* keys from the crawler settings."""
        return cls(
            host=crawler.settings.get('SQL_HOST'),
            user=crawler.settings.get('SQL_USER'),
            password=crawler.settings.get('SQL_PASSWORD'),
            database=crawler.settings.get('SQL_DATABASE'),
            port=crawler.settings.get('SQL_PORT'),
        )

    def open_spider(self, spider):
        """Open the MySQL connection and create a cursor when the spider starts."""
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database, port=self.port,
                                  autocommit=True)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        """Release database resources when the spider closes."""
        # FIX: the original closed only the connection and leaked the cursor.
        self.cursor.close()
        self.db.close()

    def process_item(self, item, spider):
        """Insert one item into the ``ranking`` table and return it.

        ``authors`` is presumably a list/dict from the API — TODO confirm —
        so it is JSON-encoded before storage.  All values are bound through
        a parameterized query, so no SQL-injection risk.
        """
        data = dict(item)
        authors = json.dumps(data['authors'])
        query = """insert into ranking (bookName,sellPrice,authors,coverUrl,bookId,definePrice,publisher,discount) values (%s,%s,%s,%s,%s,%s,%s,%s)"""
        values = (str(data["bookName"]), str(data["sellPrice"]), str(authors),
                  str(data["coverUrl"]), str(data["bookId"]),
                  str(data["definePrice"]), str(data["publisher"]),
                  str(data["discount"]))
        self.cursor.execute(query, values)
        # FIX: dropped the explicit self.db.commit() — the connection is
        # opened with autocommit=True, so the execute above already persists.
        return item
# MySQL connection settings consumed by JdPipeline.from_crawler().
SQL_HOST = 'localhost'
SQL_USER = 'root'
SQL_PASSWORD = 'root'
SQL_DATABASE = 'mysoft'
SQL_PORT = 3306

# Enable the storage pipeline; 300 is its priority in the pipeline chain.
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,
}
Table schema (表结构) for the `ranking` table: