模板一

  1. import multiprocessing
  2. import os
  class ProcessClass(multiprocessing.Process):
      """Worker process that drains a shared queue and handles each item.

      Args:
          queue: Queue shared between the worker processes; every item
              taken from it is passed to ``parse``.
          poll_timeout: Seconds to wait for a new item before the worker
              decides the queue is drained and exits.
      """

      def __init__(self, queue, poll_timeout=1.0):
          super().__init__()
          self.queue = queue
          self.poll_timeout = poll_timeout

      def run(self):
          """Consume items until none arrives within ``poll_timeout``."""
          # Imported locally so the block does not depend on a file-level
          # ``import queue``.
          from queue import Empty

          while True:
              # Do not test ``empty()`` and then call a blocking ``get()``:
              # another process can take the last item in between, leaving
              # this worker blocked forever and ``join()`` hanging.
              try:
                  item = self.queue.get(timeout=self.poll_timeout)
              except Empty:
                  break
              self.parse(item)

      def parse(self, item):
          """Handle one item; this template only prints it with the PID."""
          print('[{}]号进程:{}'.format(os.getpid(), item))
  def main():
      """Fill a shared queue with 0..19 and drain it with four processes.

      Returns once every worker process has exited.
      """
      # The queue must be a multiprocessing queue; using the queue module
      # raises an error.
      task_queue = multiprocessing.Queue()
      for number in range(20):
          task_queue.put(number)

      worker_count = 4
      workers = [ProcessClass(task_queue) for _ in range(worker_count)]
      for worker in workers:
          worker.start()
      for worker in workers:
          worker.join()
  if __name__ == "__main__":
      # Run the template only when the file is executed as a script.
      main()

模板二

  1. import multiprocessing
  2. import os
  3. from queue import Queue
  4. from threading import Thread
  # Multiprocessing + multithreading: suited to parsing two-level websites.
  class ProcessClass(multiprocessing.Process):
      """Worker process that takes pages off a shared queue and fans the
      links found on each page out to a pool of ``Crawl`` threads.

      Args:
          queue: Queue shared between the worker processes; every item
              taken from it is passed to ``parse``.
          poll_timeout: Seconds to wait for a new item before the worker
              decides the queue is drained and exits.
      """

      # Number of Crawl threads each process keeps for second-level links.
      THREAD_NUM = 20

      def __init__(self, queue, poll_timeout=1.0):
          super().__init__()
          self.queue = queue
          self.poll_timeout = poll_timeout
          # Created lazily on the first ``parse`` call: a thread queue holds
          # locks and cannot be pickled if the process object is sent to a
          # spawned child.
          self._url_queue = None

      def run(self):
          """Consume items until none arrives within ``poll_timeout``."""
          # Imported locally because the file only imports ``Queue`` from
          # the queue module.
          from queue import Empty

          while True:
              # Do not test ``empty()`` and then call a blocking ``get()``:
              # another process can take the last item in between, leaving
              # this worker blocked forever and ``join()`` hanging.
              try:
                  item = self.queue.get(timeout=self.poll_timeout)
              except Empty:
                  break
              self.parse(item)

      def parse(self, item):
          """Print the item, then hand its links to the thread pool and
          wait until all of them have been processed."""
          print('[{}]号进程:{}'.format(os.getpid(), item))
          # Links parsed out of the page (placeholder values in this template).
          url_list = ["123", "123", "123"]
          url_queue = self._ensure_workers()
          for url in url_list:
              url_queue.put(url)
          url_queue.join()

      def _ensure_workers(self):
          """Start the Crawl threads once and return their queue.

          The Crawl threads never leave their loop, so starting a fresh
          batch for every item would pile up blocked threads; one pool is
          reused for all items handled by this process.
          """
          if self._url_queue is None:
              self._url_queue = Queue()
              for _ in range(self.THREAD_NUM):
                  worker = Crawl(self._url_queue)
                  worker.daemon = True
                  worker.start()
          return self._url_queue
  class Crawl(Thread):
      """Thread that keeps taking links off a queue and parsing them."""

      def __init__(self, queue):
          super().__init__()
          self.queue = queue

      def run(self):
          """Process queued links one after another, without end."""
          while True:
              self._process_next()

      def _process_next(self):
          """Take one link, parse it, and always mark it as done."""
          link = self.queue.get()
          try:
              self.parse(link)
          finally:
              # Mark the item done even if parsing raised, so a caller
              # waiting on queue.join() is not left blocked.
              self.queue.task_done()

      def parse(self, q_url):
          """Handle a single link; this template only prints it."""
          print("开始解析链接:", q_url)
  def main():
      """Queue the page URLs and crawl them with four worker processes.

      URLs for pages 1 through 19 are put on a shared multiprocessing
      queue; the call returns once every worker process has exited.
      """
      # The queue must be a multiprocessing queue; using the queue module
      # raises an error.
      page_queue = multiprocessing.Queue()
      url_template = 'https://www.baidu.com/{}.html'
      for page in range(1, 20):
          page_queue.put(url_template.format(page))

      worker_count = 4
      workers = [ProcessClass(page_queue) for _ in range(worker_count)]
      for worker in workers:
          worker.start()
      for worker in workers:
          worker.join()
  if __name__ == "__main__":
      # Run the template only when the file is executed as a script.
      main()