多线程爬虫

image.png

多线程爬取表情包

  1. import threading
  2. import requests
  3. from lxml import etree
  4. from urllib import request
  5. import os
  6. import re
  7. from queue import Queue
  8. class Producer(threading.Thread):
  9. headers = {
  10. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
  11. }
  12. def __init__(self, page_queue, img_queue, *args, **kwargs):
  13. super(Producer, self).__init__(*args, **kwargs)
  14. self.page_queue = page_queue
  15. self.img_queue = img_queue
  16. def run(self):
  17. while True:
  18. if self.page_queue.empty():
  19. break
  20. url = self.page_queue.get()
  21. self.parse_page(url)
  22. def parse_page(self, url):
  23. response = requests.get(url, headers=self.headers)
  24. text = response.text
  25. html = etree.HTML(text)
  26. imgs = html.xpath("//div[@class='page-content text-center']//a//img")
  27. for img in imgs:
  28. if img.get('class') == 'gif':
  29. continue
  30. img_url = img.xpath(".//@data-original")[0]
  31. suffix = os.path.splitext(img_url)[1]
  32. alt = img.xpath(".//@alt")[0]
  33. alt = re.sub(r'[,。??,/\\·]', '', alt)
  34. img_name = alt + suffix
  35. self.img_queue.put((img_url, img_name))
  36. class Consumer(threading.Thread):
  37. def __init__(self, page_queue, img_queue, *args, **kwargs):
  38. super(Consumer, self).__init__(*args, **kwargs)
  39. self.page_queue = page_queue
  40. self.img_queue = img_queue
  41. def run(self):
  42. while True:
  43. if self.img_queue.empty():
  44. if self.page_queue.empty():
  45. return
  46. img = self.img_queue.get(block=True)
  47. url, filename = img
  48. request.urlretrieve(url, 'images/' + filename)
  49. print(filename + ' 下载完成!')
  50. def main():
  51. page_queue = Queue(100)
  52. img_queue = Queue(500)
  53. for x in range(1, 101):
  54. url = "http://www.doutula.com/photo/list/?page=%d" % x
  55. page_queue.put(url)
  56. for x in range(5):
  57. t = Producer(page_queue, img_queue)
  58. t.start()
  59. for x in range(5):
  60. t = Consumer(page_queue, img_queue)
  61. t.start()
  62. if __name__ == '__main__':
  63. main()

分布式爬虫2 多线程爬虫 - 图2
分布式爬虫2 多线程爬虫 - 图3分布式爬虫2 多线程爬虫 - 图4分布式爬虫2 多线程爬虫 - 图5分布式爬虫2 多线程爬虫 - 图6分布式爬虫2 多线程爬虫 - 图7分布式爬虫2 多线程爬虫 - 图8分布式爬虫2 多线程爬虫 - 图9分布式爬虫2 多线程爬虫 - 图10分布式爬虫2 多线程爬虫 - 图11分布式爬虫2 多线程爬虫 - 图12
正文
正文标题 1标题 2标题 3标题 4

回复
十万阿里人都在用的笔记与文档知识库
关于语雀 | 使用帮助 | 数据安全 | 服务协议 | English