简介

Colly 是一个用于构建网络爬虫的 Golang 框架。使用 Colly,您可以构建各种复杂的网络爬虫,从简单的爬虫到处理数百万网页的复杂异步网站爬虫。Colly 提供了一个 API 用于执行网络请求和处理接收到的内容(例如与 HTML 文档的 DOM 树交互)。

  1. import json
  2. import logging
  3. import threading
  4. import time
  5. import requests
  6. class IBoxNFT:
  7. def __init__(self, phone, sms):
  8. self.phone = phone
  9. self.sms = sms
  10. class Crawler(IBoxNFT):
  11. def __init__(self, basic, suffix, phone, sms):
  12. self.basic = basic
  13. self.link = f'{self.basic}/{suffix}'
  14. self.un_know = False
  15. self.start = time.perf_counter()
  16. super(Crawler, self).__init__(phone, sms)
  17. def login_ibox(self, version=None, new_suffix=None):
  18. """
  19. 实现IBox的登录功能,登录请求的版本和后缀有所不同,需要对其进行更新
  20. :param version: 版本号,初始版本为v1.2
  21. :param new_suffix: 登录的后缀
  22. :return: 返回登录完成信息
  23. """
  24. if not (version, new_suffix):
  25. v, s = "v1.1", "/user/login"
  26. header = Crawler.public_params("headers")[0]
  27. resp = Crawler.crawler_link(
  28. 'post',
  29. self.suffix("v1.1", "/user/login"),
  30. headers=header,
  31. json={"phoneNumber": self.phone, "code": str(self.sms)},
  32. )
  33. # if resp.get('code') != 1:
  34. # print("登录失败,检查登录信息")
  35. print("登录信息\n", resp.json())
  36. def suffix(self, version, new_suffix):
  37. link = f"{self.link.replace('http', 'https').replace('v1.2', str(version))[:46]}{new_suffix}"
  38. return link
  39. @staticmethod
  40. def public_params(param) -> dict:
  41. """通过闭包返回公共的params参数,其中包括
  42. (请求、构造、公共请求头参数)
  43. """
  44. def inner():
  45. if "crawler_min_nft" in param:
  46. return {
  47. "classifyId": "",
  48. "origin": "0",
  49. "pageSize": "20",
  50. "sort": "1",
  51. "page": "1",
  52. "type": "0"
  53. }
  54. if "crawler_new_nft" in param:
  55. return {
  56. "classifyId": "",
  57. "origin": "0",
  58. "pageSize": "20",
  59. "sort": "0",
  60. "page": "1",
  61. "type": "0"
  62. }
  63. if "headers" in param:
  64. return {
  65. 'Accept-Encoding': 'gzip',
  66. 'IB-PLATFORM-TYPE': 'android',
  67. 'Host': 'api-app.ibox.art',
  68. 'language': 'zh-CN',
  69. 'ib-app-version':' 1.1.4',
  70. 'Content-Type': 'application/json; charset=UTF-8',
  71. 'Accept-Language': 'zh-CN',
  72. 'Connection': 'Keep-Alive',
  73. 'user-agent': 'iBoxApp209'
  74. },
  75. return inner()
  76. @property
  77. def get_links(self):
  78. return self.link
  79. @get_links.setter
  80. def upgrade_links(self, new_link):
  81. """
  82. ibox版本更新迭代很快, {upgrade_links} 用于更新IBox后续的请求URL
  83. :param new_link: 更新后的ibox基本链接
  84. """
  85. self.link = new_link
  86. @property
  87. def check_status(self) -> bool:
  88. return self.un_know
  89. def crawler_book_nft(self, payload: dict):
  90. pass
  91. def crawler_min_nft(self):
  92. """获取ibox的最低价格nft, 实时更新"""
  93. resp = Crawler.crawler_link(
  94. 'get',
  95. self.link,
  96. headers=Crawler.public_params("headers")[0],
  97. params=Crawler.public_params(self.crawler_min_nft.__name__)
  98. )
  99. nft = dict()
  100. for i in self.get_min_price(resp.json()):
  101. nft = {
  102. '图藏系列': i['albumName'],
  103. '价格': i['priceCny'],
  104. '图藏URL': "www.ibox.art{}".format(i['thumbPic'])
  105. }
  106. if not i:
  107. self.un_know = True
  108. return nft
  109. @staticmethod
  110. def get_min_price(args):
  111. """监测价格数据,如果满足条件,触发通知"""
  112. for k, v in enumerate(args['data']['list']):
  113. if args['data']['list'][k]['albumName'] == 'iBox礼遇系列' \
  114. and int(args['data']['list'][k]['priceCny']) < 800:
  115. yield args['data']['list'][k]
  116. if args['data']['list'][k]['albumName'] == 'iBox纪念系列' \
  117. and int(args['data']['list'][k]['priceCny']) < 600:
  118. yield args['data']['list'][k]
  119. if args['data']['list'][k]['albumName'] == 'iBox赛博生肖系列' \
  120. and int(args['data']['list'][k]['priceCny']) > 800:
  121. yield args['data']['list'][k]
  122. if args['data']['list'][k]['albumName'] == '异星夜袭' \
  123. and int(args['data']['list'][k]['priceCny']) > 2000:
  124. yield args['data']['list'][k]
  125. @staticmethod
  126. def crawler_link(methods, link, headers, params=None, data=None, json=None, proxy=None):
  127. try:
  128. if 'get' in methods:
  129. r = requests.get(link, params=params, headers=headers, proxies=proxy)
  130. return r
  131. if 'post' in methods:
  132. r = requests.post(link, data=data, json=json, proxies=proxy)
  133. return r
  134. except (requests.exceptions.ConnectionError, requests.exceptions.RequestException) as e:
  135. logging.error("The Request is fail", e, r.status_code)
  136. except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as e:
  137. logging.error(e, r.status_code)
  138. @staticmethod
  139. def genrator_wtoken():
  140. pass
  141. def run(self):
  142. print(self.crawler_min_nft())
  143. if __name__ == '__main__':
  144. print("启动爬虫获取ibox")
  145. #while True:
  146. a = Crawler("http://api-app.ibox.art", "nft-mall-web/v1.2/nft/product/getResellList", "13420280437", 516254)
  147. a.login_ibox()
  148. # if a.check_status:
  149. # break


基本配置

初始化Collector采集实例

colly.NewCollector(options ...func(*Collector)) *Collector
采集器可以设置多个可选参数:

  • 限制爬取的域名

    • colly.AllowedDomains("www.baidu.com", "baidu.com")
  • 设置请求头

    • colly.UserAgent("xy")
  • 启动异步

    • colly.Async(true)
  • 启动调试器

    • colly.Debugger(&debug.LogDebugger{})
  • 设置爬取页面的深度, 如果设置为1,只抓取Visit()中的url

    • colly.MaxDepth(1)

https://lk668.github.io/2021/04/05/2021-04-05-%E6%89%8B%E6%8A%8A%E6%89%8B%E6%95%99%E4%BD%A0%E5%A6%82%E4%BD%95%E7%94%A8golang%E5%AE%9E%E7%8E%B0%E4%B8%80%E4%B8%AAtimewheel/

回调函数

在发起请求前被调用

func (c *Collector) OnRequest(f RequestCallback) {}

在请求过程中发生错误被调用

func (c *Collector) OnError(f ErrorCallback) {}

收到响应后被调用

func (c *Collector) OnResponse(f ResponseCallback) {}

收到内容按html的jquery进行数据清洗

func (c *Collector) OnHTML(goquerySelector string,f HTMLCallback) {}

收到内容按XML的xpath进行数据清洗

func (c *Collector) OnXML(xpathQuery string,f XMLCallback) {}

在OnHTML之后被调用

func (c *Collector) OnScraped(f ScrapedCallback) {}

并行爬取