# 模板一 — Template 1: multiple processes draining one shared queue
import multiprocessing
import os
class ProcessClass(multiprocessing.Process):
    """Worker process that drains items from a shared multiprocessing queue.

    Each worker repeatedly pulls an item and hands it to ``parse`` until
    the queue is exhausted.
    """

    def __init__(self, queue):
        multiprocessing.Process.__init__(self)
        self.queue = queue  # multiprocessing.Queue shared by all workers

    def run(self):
        """Consume items until the queue stays empty.

        FIX: the original ``empty()``-then-``get()`` pattern is racy with
        multiple consumer processes — another worker can drain the queue
        between the check and the get, leaving this ``get()`` blocked
        forever (and ``empty()`` is documented as unreliable on
        multiprocessing queues).  A short ``get`` timeout closes the race
        and also absorbs the queue's feeder-thread latency.
        """
        from queue import Empty  # multiprocessing.Queue raises queue.Empty

        while True:
            try:
                item = self.queue.get(timeout=0.1)
            except Empty:
                break
            self.parse(item)

    def parse(self, item):
        # Placeholder work: report which process handled the item.
        print('[{}]号进程:{}'.format(os.getpid(), item))
        # time.sleep(1)
def main():
    """Fill a shared queue with work items and fan out worker processes."""
    # The queue must be a multiprocessing queue; the plain queue module's
    # Queue cannot be shared across processes and would fail here.
    queue = multiprocessing.Queue()
    for item in range(20):
        queue.put(item)

    worker_count = 4
    workers = [ProcessClass(queue) for _ in range(worker_count)]
    for worker in workers:
        worker.start()
    # Wait for every worker to finish draining the queue.
    for worker in workers:
        worker.join()
# Entry-point guard: with the spawn start method, child processes
# re-import this module and must not re-run main().
if __name__ == '__main__':
    main()
# 模板二 — Template 2: multi-process + multi-thread crawler skeleton
import multiprocessing
import os
from queue import Queue
from threading import Thread
# Multi-process + multi-thread: suited to parsing two-level sites
# (process per listing page, threads for the page's extracted links).
class ProcessClass(multiprocessing.Process):
    """Worker process for a two-level crawl: each queue item is a listing
    page whose extracted links are then fetched by a pool of daemon
    threads (multi-process + multi-thread).
    """

    def __init__(self, queue):
        multiprocessing.Process.__init__(self)
        self.queue = queue  # multiprocessing.Queue of listing-page items

    def run(self):
        """Drain the shared queue, parsing each item.

        FIX: ``empty()`` followed by ``get()`` is racy with several
        consumer processes — another worker can empty the queue in
        between, leaving ``get()`` blocked forever (and ``empty()`` is
        unreliable on multiprocessing queues).  A short ``get`` timeout
        closes the race and absorbs the queue's feeder-thread latency.
        """
        from queue import Empty  # multiprocessing.Queue raises queue.Empty

        while True:
            try:
                item = self.queue.get(timeout=0.1)
            except Empty:
                break
            self.parse(item)

    def parse(self, item):
        """Parse one listing page, then crawl its links with threads."""
        print('[{}]号进程:{}'.format(os.getpid(), item))
        # Links extracted from the page (placeholder values in this template).
        url_list = ["123", "123", "123"]
        # Thread-level work queue: a plain queue.Queue is fine inside one process.
        queue = Queue()
        for x in range(20):
            worker = Crawl(queue)
            worker.daemon = True  # dies with the process after queue.join()
            worker.start()
        for url in url_list:
            queue.put(url)
        queue.join()  # block until every link has been task_done()'d
class Crawl(Thread):
    """Daemon crawler thread: repeatedly takes a URL from its queue,
    parses it, and marks the item done so ``queue.join()`` can return."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue  # queue.Queue of URLs to crawl

    def run(self):
        # Loops forever; meant to run as a daemon thread that is reaped
        # when the owning process exits after queue.join() returns.
        while True:
            url = self.queue.get()
            try:
                self.parse(url)
            finally:
                # Always account for the item, even if parsing raised,
                # so queue.join() cannot hang.
                self.queue.task_done()

    def parse(self, q_url):
        # Placeholder for real link parsing.
        print("开始解析链接:", q_url)
def main():
    """Queue up listing-page URLs and process them with a pool of worker
    processes (each worker fans out its own crawler threads per page)."""
    # Must be a multiprocessing queue; a plain queue.Queue cannot be
    # shared across processes and would fail here.
    task_queue = multiprocessing.Queue()
    base_url = 'https://www.baidu.com/{}.html'
    for page in range(1, 20):
        task_queue.put(base_url.format(page))

    process_num = 4
    workers = [ProcessClass(task_queue) for _ in range(process_num)]
    for worker in workers:
        worker.start()
    # Wait for every worker process to finish.
    for worker in workers:
        worker.join()
# Entry-point guard: prevents child processes (spawn start method)
# from re-running main() when they import this module.
if __name__ == '__main__':
    main()