Introduction
Colly is a Golang framework for building web scrapers. With Colly you can build crawlers of varying complexity, from simple scrapers to complex asynchronous website crawlers that process millions of pages. Colly provides an API for performing network requests and for handling the received content (for example, interacting with the DOM tree of an HTML document).
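As a minimal sketch of that API (the target URL and CSS selector below are illustrative placeholders, not taken from any specific project), a collector that visits one page and prints every link looks like this:

package main

import (
    "fmt"

    "github.com/gocolly/colly"
)

func main() {
    // A Collector manages requests and dispatches registered callbacks.
    c := colly.NewCollector()

    // Runs for every anchor element with an href attribute in the response body.
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        fmt.Println(e.Attr("href"))
    })

    // Visit issues the GET request; callbacks fire as the response is processed.
    c.Visit("http://go-colly.org/") // illustrative URL
}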
import logging
import time

import requests
class IBoxNFT:
    def __init__(self, phone, sms):
        self.phone = phone
        self.sms = sms


class Crawler(IBoxNFT):
    def __init__(self, basic, suffix, phone, sms):
        self.basic = basic
        self.link = f'{self.basic}/{suffix}'
        self.un_know = False
        self.start = time.perf_counter()
        super(Crawler, self).__init__(phone, sms)
    def login_ibox(self, version=None, new_suffix=None):
        """
        Log in to iBox. The API version and endpoint suffix of the login
        request differ between releases, so both can be overridden here.
        :param version: version number; the initial version is v1.2
        :param new_suffix: suffix of the login endpoint
        :return: prints the login response
        """
        if version is None and new_suffix is None:
            version, new_suffix = "v1.1", "/user/login"
        header = Crawler.public_params("headers")
        resp = Crawler.crawler_link(
            'post',
            self.suffix(version, new_suffix),
            headers=header,
            json={"phoneNumber": self.phone, "code": str(self.sms)},
        )
        # if resp.json().get('code') != 1:
        #     print("Login failed, check the login credentials")
        print("Login response\n", resp.json())
    def suffix(self, version, new_suffix):
        # Switch to https, swap in the requested API version, keep the URL up
        # to the common path prefix (the first 46 characters), and append the
        # new endpoint suffix.
        link = f"{self.link.replace('http', 'https').replace('v1.2', str(version))[:46]}{new_suffix}"
        return link
    @staticmethod
    def public_params(param) -> dict:
        """Return shared request parameters via a closure:
        the query params for the list endpoints and the common request headers.
        """
        def inner():
            if "crawler_min_nft" in param:
                return {
                    "classifyId": "",
                    "origin": "0",
                    "pageSize": "20",
                    "sort": "1",
                    "page": "1",
                    "type": "0"
                }
            if "crawler_new_nft" in param:
                return {
                    "classifyId": "",
                    "origin": "0",
                    "pageSize": "20",
                    "sort": "0",
                    "page": "1",
                    "type": "0"
                }
            if "headers" in param:
                return {
                    'Accept-Encoding': 'gzip',
                    'IB-PLATFORM-TYPE': 'android',
                    'Host': 'api-app.ibox.art',
                    'language': 'zh-CN',
                    'ib-app-version': '1.1.4',
                    'Content-Type': 'application/json; charset=UTF-8',
                    'Accept-Language': 'zh-CN',
                    'Connection': 'Keep-Alive',
                    'user-agent': 'iBoxApp209'
                }
        return inner()
    @property
    def get_links(self):
        return self.link

    @get_links.setter
    def get_links(self, new_link):
        """
        iBox iterates versions quickly; this setter updates the base URL
        used for subsequent requests.
        :param new_link: the updated iBox base link
        """
        self.link = new_link

    @property
    def check_status(self) -> bool:
        return self.un_know

    def crawler_book_nft(self, payload: dict):
        pass
    def crawler_min_nft(self):
        """Fetch the lowest-priced NFTs on iBox; updated in real time."""
        resp = Crawler.crawler_link(
            'get',
            self.link,
            headers=Crawler.public_params("headers"),
            params=Crawler.public_params(self.crawler_min_nft.__name__)
        )
        nft = dict()
        for i in self.get_min_price(resp.json()):
            nft = {
                'collection series': i['albumName'],
                'price (CNY)': i['priceCny'],
                'collection URL': "www.ibox.art{}".format(i['thumbPic'])
            }
        if not nft:
            # No listing matched the price rules.
            self.un_know = True
        return nft
    @staticmethod
    def get_min_price(args):
        """Watch the price data and yield any listing that should trigger a notification."""
        for item in args['data']['list']:
            if item['albumName'] == 'iBox礼遇系列' and int(item['priceCny']) < 800:
                yield item
            if item['albumName'] == 'iBox纪念系列' and int(item['priceCny']) < 600:
                yield item
            if item['albumName'] == 'iBox赛博生肖系列' and int(item['priceCny']) > 800:
                yield item
            if item['albumName'] == '异星夜袭' and int(item['priceCny']) > 2000:
                yield item
    @staticmethod
    def crawler_link(methods, link, headers, params=None, data=None, json=None, proxy=None):
        try:
            if 'get' in methods:
                return requests.get(link, params=params, headers=headers, proxies=proxy)
            if 'post' in methods:
                return requests.post(link, data=data, json=json, headers=headers, proxies=proxy)
        except requests.exceptions.RequestException as e:
            logging.error("The request failed: %s", e)
    @staticmethod
    def generator_wtoken():
        pass

    def run(self):
        print(self.crawler_min_nft())
if __name__ == '__main__':
    print("Starting the crawler to fetch iBox data")
    # while True:
    a = Crawler("http://api-app.ibox.art", "nft-mall-web/v1.2/nft/product/getResellList", "13420280437", 516254)
    a.login_ibox()
    # if a.check_status:
    #     break
Initializing a Collector instance
colly.NewCollector(options ...func(*Collector)) *Collector
A collector accepts several optional settings, combined in the sketch after this list:
Restrict the domains that may be crawled
- colly.AllowedDomains("www.baidu.com", "baidu.com")
Set the User-Agent request header
- colly.UserAgent("xy")
Enable asynchronous requests
- colly.Async(true)
Attach a debugger
- colly.Debugger(&debug.LogDebugger{})
Set the maximum crawl depth; if set to 1, only the URLs passed to Visit() are fetched
- colly.MaxDepth(1)
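Here is that sketch, combining the options above into a single collector; the domains, User-Agent string, and start URL are the illustrative values from the list, not a real configuration:

package main

import (
    "github.com/gocolly/colly"
    "github.com/gocolly/colly/debug"
)

func main() {
    c := colly.NewCollector(
        colly.AllowedDomains("www.baidu.com", "baidu.com"), // requests outside these domains are dropped
        colly.UserAgent("xy"),                              // User-Agent header sent with every request
        colly.Async(true),                                  // run requests concurrently
        colly.Debugger(&debug.LogDebugger{}),               // log collector events
        colly.MaxDepth(1),                                  // only fetch the URLs passed to Visit()
    )

    c.Visit("https://www.baidu.com/")
    c.Wait() // required with Async(true): block until all pending requests finish
}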
Callbacks
Called before a request is made
func (c *Collector) OnRequest(f RequestCallback) {}
Called when an error occurs during the request
func (c *Collector) OnError(f ErrorCallback) {}
Called after a response is received
func (c *Collector) OnResponse(f ResponseCallback) {}
Called on received HTML content, extracting data with jQuery-style (goquery) selectors
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) {}
Called on received XML content, extracting data with XPath queries
func (c *Collector) OnXML(xpathQuery string, f XMLCallback) {}
Called after OnHTML, as the last callback once scraping of the page has finished
func (c *Collector) OnScraped(f ScrapedCallback) {}
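To make the call order concrete, here is a sketch wiring most of these callbacks onto one collector (the URL and selector are again placeholders). For a successful fetch the callbacks fire in the order OnRequest, OnResponse, OnHTML, OnScraped; OnError replaces the response callbacks when the request fails:

package main

import (
    "fmt"

    "github.com/gocolly/colly"
)

func main() {
    c := colly.NewCollector()

    c.OnRequest(func(r *colly.Request) {
        fmt.Println("visiting", r.URL) // before the request is sent
    })
    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("request failed:", err) // network or HTTP error
    })
    c.OnResponse(func(r *colly.Response) {
        fmt.Println("got", len(r.Body), "bytes from", r.Request.URL)
    })
    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Println("page title:", e.Text) // goquery selector match
    })
    c.OnScraped(func(r *colly.Response) {
        fmt.Println("finished", r.Request.URL) // after all OnHTML callbacks
    })

    c.Visit("http://go-colly.org/") // illustrative URL
}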