Template 1

A single task queue consumed by a pool of daemon worker threads; queue.join() blocks until every URL has been handled. Suited to single-level sites where each page is parsed independently.

# -*- coding: utf-8 -*-
import time
from queue import Queue
from threading import Thread


# Suited to single-level site parsing: every URL is an independent task.
class Demo(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            q_url = self.queue.get()
            try:
                self.parse(q_url)
            finally:
                # Always mark the task done, even if parse() raised,
                # so queue.join() can eventually return.
                self.queue.task_done()

    def parse(self, q_url):
        print("Parsing URL:", q_url)


if __name__ == '__main__':
    start = time.time()
    # Build the URL list.
    base_url = 'https://www.baidu.com/{}.html'
    url_list = [base_url.format(i) for i in range(1, 201)]
    queue = Queue()
    # Create the worker threads.
    for x in range(20):
        worker = Demo(queue)
        worker.daemon = True  # daemon workers exit with the main thread
        worker.start()
    for url in url_list:
        queue.put(url)
    queue.join()
    print('Finished in {}s'.format(round(time.time() - start, 2)))
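
The parse() method above only prints; in a real crawler it is where the download happens. A minimal sketch of a drop-in body for Demo.parse, assuming the third-party requests library; the timeout value, the error handling, and the save-by-filename convention are illustrative assumptions, not part of the original template:

import requests

def parse(self, q_url):
    # Fetch the page; a short timeout keeps a dead host from stalling a worker.
    try:
        resp = requests.get(q_url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        # Report and return instead of raising, so the worker thread survives.
        print('Failed:', q_url, e)
        return
    # Illustrative: save the body under the URL's trailing segment.
    name = q_url.rsplit('/', 1)[-1]
    with open(name, 'wb') as f:
        f.write(resp.content)

Catching inside parse() matters here: run() has no except clause, so an uncaught exception would kill the worker thread (task_done() still runs via finally, but the pool shrinks).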

Template 2

A two-stage pipeline: crawl threads turn list pages into detail-page links, and parse threads consume those links. A parse thread exits only once every crawl thread has finished and the data queue is empty.

import queue
import threading


# Suited to two-level sites: crawl threads fetch list pages and push
# detail-page links into a data queue; parse threads consume that queue.

# Parse thread class
class Parse(threading.Thread):
    def __init__(self, number, data_list, req_thread):
        super(Parse, self).__init__()
        self.number = number
        self.data_list = data_list
        self.req_thread = req_thread
        self.is_parse = True  # whether to keep pulling data from the queue

    def run(self):
        print('Parse thread %d started' % self.number)
        while True:
            # Termination condition: every crawl thread has finished
            # and the data queue has been drained.
            for t in self.req_thread:
                if t.is_alive():
                    break
            else:  # no break: all crawl threads are done
                if self.data_list.qsize() == 0:
                    self.is_parse = False
            if self.is_parse:  # keep parsing
                try:
                    data = self.data_list.get(timeout=3)
                except queue.Empty:
                    data = None
                if data is not None:
                    self.parse(data)
            else:
                break
        print('Parse thread %d exited' % self.number)

    # Page parsing function
    def parse(self, data):
        # Download the file here.
        pass


# Crawl thread class
class Crawl(threading.Thread):
    def __init__(self, number, req_list, data_list):
        super(Crawl, self).__init__()
        self.number = number
        self.req_list = req_list
        self.data_list = data_list
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
        }

    def run(self):
        print('Crawl thread %d started' % self.number)
        while True:
            try:
                # get_nowait() avoids the race between qsize() and get()
                # when several crawl threads drain the queue at once.
                url = self.req_list.get_nowait()
            except queue.Empty:
                break
            print('Thread %d crawling: %s' % (self.number, url))
            # time.sleep(random.randint(1, 3))
            self.data_list.put("detail page link goes here")  # push into the data queue


def main():
    concurrent = 3  # number of crawl threads
    conparse = 3    # number of parse threads
    # Build the request queue.
    req_list = queue.Queue()
    # Build the data queue.
    data_list = queue.Queue()
    # Fill the request queue.
    for i in range(1, 13 + 1):
        base_url = 'https://www.qiushibaike.com/8hr/page/%d/' % i
        req_list.put(base_url)
    # Start N crawl threads.
    req_thread = []
    for i in range(concurrent):
        t = Crawl(i + 1, req_list, data_list)  # create a crawl thread
        t.start()
        req_thread.append(t)
    # Start N parse threads.
    parse_thread = []
    for i in range(conparse):
        t = Parse(i + 1, data_list, req_thread)  # create a parse thread
        t.start()
        parse_thread.append(t)
    # Wait for both stages to finish.
    for t in req_thread:
        t.join()
    for t in parse_thread:
        t.join()


if __name__ == '__main__':
    main()
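
Both site-specific spots are left as placeholders above: Crawl.run() puts a dummy string and Parse.parse() is a pass. A minimal sketch of what might fill them, assuming the requests library; the href regex is hypothetical and does not come from the real qiushibaike markup:

import re
import requests

DETAIL_RE = re.compile(r'href="(/article/\d+)"')  # hypothetical link pattern

def extract_detail_links(html):
    # Turn relative detail-page paths from a list page into absolute URLs.
    return ['https://www.qiushibaike.com' + p for p in DETAIL_RE.findall(html)]

def download_detail(url):
    # Fetch one detail page; report errors instead of raising, so the
    # parse thread keeps running.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print('Failed:', url, e)
        return
    # Illustrative: save the page under its trailing path segment.
    name = url.rstrip('/').rsplit('/', 1)[-1] + '.html'
    with open(name, 'wb') as f:
        f.write(resp.content)

With these in place, Crawl.run() would fetch each list page with self.headers, call extract_detail_links() on the HTML, and put each link into data_list; Parse.parse(data) would simply call download_detail(data).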