一、说明

生产者消费者模式链接

二、代码

# -*- coding:utf-8 -*-
import os
import threading
from queue import Empty, Queue

import requests
from lxml import etree
  7. class Producer_mulu(threading.Thread):
  8. def __init__(self, mulu_queue, zhangjie_queue, *args, **kwargs):
  9. super(Producer_mulu, self).__init__(*args, **kwargs)
  10. self.mulu_queue = mulu_queue
  11. self.zhangjie_queue = zhangjie_queue
  12. def run(self):
  13. while True:
  14. if self.mulu_queue.empty():
  15. print("所有目录path都已生产完成,生产者停止")
  16. break
  17. xuanhuan_url = self.mulu_queue.get()
  18. self.get_mulu(xuanhuan_url)
  19. def get_mulu(self, xuanhuan_url):
  20. req_text = requests.get(xuanhuan_url).content.decode('utf-8')
  21. text_html = etree.HTML(req_text)
  22. file_name = text_html.xpath('//div[@id="newscontent"]/div[1]/ul/li/span[1]/a/text()')
  23. mulu_url = text_html.xpath('//div[@id="newscontent"]/div[1]/ul/li/span[1]/a/@href')
  24. mulu_path = list(zip(file_name, mulu_url))
  25. for xiaoshuo_name, url in mulu_path:
  26. # self.zhangjie_queue.put((xiaoshuo_name,url))
  27. # print(self.zhangjie_queue.get())
  28. self.zhangjie_queue.put(url)
  29. class Producer_zhangjie(threading.Thread):
  30. def __init__(self, mulu_queue, zhangjie_queue, content_queue, *args, **kwargs):
  31. super(Producer_zhangjie, self).__init__(*args, **kwargs)
  32. self.mulu_queue = mulu_queue
  33. self.zhangjie_queue = zhangjie_queue
  34. self.content_queue = content_queue
  35. def run(self):
  36. while True:
  37. if self.mulu_queue.empty() and self.zhangjie_queue.empty():
  38. print("所有章节目录都已生产完成,生产者停止")
  39. break
  40. mulu_url = self.zhangjie_queue.get()
  41. self.get_zj_path(mulu_url)
  42. # 获取章节path
  43. def get_zj_path(self, mulu_url):
  44. base_url = 'http://www.xbiquge.la/'
  45. req_mulu = requests.get(mulu_url).content.decode('utf-8')
  46. mulu_html = etree.HTML(req_mulu)
  47. base_path_list = mulu_html.xpath("//div[@class='box_con'][2]/div[@id='list']/dl/dd/a/@href")
  48. zhang_name_list = mulu_html.xpath("//div[@class='box_con'][2]/div[@id='list']/dl/dd/a/text()")
  49. mulu_path_list = [base_url + x for x in base_path_list]
  50. zj_path = list(zip(zhang_name_list, mulu_path_list))
  51. for zhangjie_name, content_url in zj_path:
  52. self.content_queue.put((zhangjie_name, content_url))
  53. class Consumers_content(threading.Thread):
  54. def __init__(self, mulu_queue, zhangjie_queue, content_queue, *args, **kwargs):
  55. super(Consumers_content, self).__init__(*args, **kwargs)
  56. self.mulu_queue = mulu_queue
  57. self.zhangjie_queue = zhangjie_queue
  58. self.content_queue = content_queue
  59. def run(self):
  60. while True:
  61. if self.mulu_queue.empty() and self.zhangjie_queue.empty() and self.content_queue.empty():
  62. break
  63. zhangjie_name, content_url = self.content_queue.get()
  64. self.get_text(content_url, zhangjie_name)
  65. # 获取内容
  66. def get_text(self, text_url, text_name):
  67. req_text = requests.get(text_url).content.decode('utf-8')
  68. text_html = etree.HTML(req_text)
  69. xiaoshuo_name = text_html.xpath("//div[@class='box_con']/div[@class='con_top']/a[3]/text()")[0]
  70. data = text_html.xpath("//div[@class='box_con']/div[@id='content']/text()")
  71. data = (str(data).replace('\\xa0', '')).replace("\\r',", '').replace('\'', '').replace('”', '').replace('“', '')
  72. dirs = '.\\xiaoshuo\\{}'.format(xiaoshuo_name)
  73. if not os.path.exists(dirs):
  74. os.makedirs(dirs)
  75. with open(r'.\xiaoshuo\{0}\{1}'.format(xiaoshuo_name, text_name) + '.txt', 'w', encoding='utf-8') as f:
  76. f.write(text_name)
  77. f.write('\n')
  78. f.write(data[1:-1])
  79. f.write('\n')
  80. print("《" + xiaoshuo_name + "》" + text_name + "下载成功")
  81. if __name__ == '__main__':
  82. mulu_Queue = Queue(1000000)
  83. zhangjie_Queue = Queue(1000000)
  84. content_Queue = Queue(1000000)
  85. for i in range(2, 100):
  86. xh_url = "http://www.xbiquge.la/fenlei/1_{}.html".format(i)
  87. mulu_Queue.put(xh_url)
  88. for i in range(10):
  89. mulu = Producer_mulu(mulu_Queue, zhangjie_Queue)
  90. mulu.start()
  91. for i in range(10):
  92. zhangjie = Producer_zhangjie(mulu_Queue, zhangjie_Queue, content_Queue)
  93. zhangjie.start()
  94. for i in range(100):
  95. content = Consumers_content(mulu_Queue, zhangjie_Queue, content_Queue)
  96. content.start()

三、总结

流程说明

使用threading.Thread创建两个生产者,一个消费者;
Producer_mulu生产玄幻小说分类下的所有小说“href”,
Producer_zhangjie生产每个小说的章节名称和章节URL,
Consumers_content使用Producer_zhangjie生产的内容生成小说内容并分类保存

zip的使用

  1. mulu_path = list(zip(file_name, mulu_url))
  2. for zhangjie_name, content_url in zj_path:
  1. a = [1,2,3]
  2. b = [4,5,6]
  3. c = [4,5,6,7,8]
  4. >>> zipped = list(zip(a,b)) # 打包为元组的列表(Python 3 中 zip 返回迭代器,需用 list() 才能看到内容)
  5. [(1, 4), (2, 5), (3, 6)]
  6. >>> list(zip(a,c)) # 元素个数与最短的列表一致
  7. [(1, 4), (2, 5), (3, 6)]
  8. >>> list(zip(*zipped)) # 与 zip 相反,*zipped 可理解为解压,返回二维矩阵式
  9. [(1, 2, 3), (4, 5, 6)]

相关操作

  1. a_list = ["a1", "a2", "a3"]
  2. b_list = ["b1", "b2", "b3"]
  3. c_list = ["c1", "c2", "c3"] # 元素个数与最短的列表一致
  4. lis = list(map(list, zip(a_list, b_list, c_list)))
  5. lisb = list(zip(a_list, b_list, c_list)) # 打包为元组的列表
  6. new_list = []
  7. for i in lis:
  8. new_dict = {'A': i[0], 'B': i[1], 'C': i[2]}
  9. new_list.append(new_dict)
  10. print(lis)
  11. print(lisb)
  12. print(new_list)
  13. for a, b, c in lis:
  14. print(a, b, c)
  15. lis [['a1', 'b1', 'c1'], ['a2', 'b2', 'c2'], ['a3', 'b3', 'c3']]
  16. lisb [('a1', 'b1', 'c1'), ('a2', 'b2', 'c2'), ('a3', 'b3', 'c3')]
  17. new_list [{'A': 'a1', 'B': 'b1', 'C': 'c1'}, {'A': 'a2', 'B': 'b2', 'C': 'c2'}, {'A': 'a3', 'B': 'b3', 'C': 'c3'}]
  18. a1 b1 c1
  19. a2 b2 c2
  20. a3 b3 c3

lxml

  1. req_mulu = requests.get(mulu_url).content.decode('utf-8')
  2. mulu_html = etree.HTML(req_mulu)
  3. base_path_list = mulu_html.xpath("//div[@class='box_con'][2]/div[@id='list']/dl/dd/a/@href")

Queue

  1. for i in range(2, 100):
  2. xh_url = "http://www.xbiquge.la/fenlei/1_{}.html".format(i)
  3. mulu_Queue.put(xh_url)
  4. for i in range(10):
  5. mulu = Producer_mulu(mulu_Queue, zhangjie_Queue)
  6. mulu.start()
  7. for i in range(10):
  8. zhangjie = Producer_zhangjie(mulu_Queue, zhangjie_Queue, content_Queue)
  9. zhangjie.start()
  10. for i in range(100):
  11. content = Consumers_content(mulu_Queue, zhangjie_Queue, content_Queue)
  12. content.start()

将xh_url即翻页url,循环后存入mulu_Queue队列中
mulu_Queue中的url在Producer_mulu中取出,并生成zhangjie_Queue队列;Producer_zhangjie生产content_Queue队列
Consumers_content中mulu_Queue,zhangjie_Queue,content_Queue全为空后,循环停止