1. """
    2. 需求分析:
    3. 1.目标网站:https://www.mzitu.com/xinggan/
    4. 2.发起请求获取所有的翻页链接
    5. 3.利用多线程抓取每一页的图片
    6. 4.保存到目标文件夹
    7. """
    8. """
    9. 模块:
    10. requests, threading
    11. """
    12. # -*- coding: UTF-8 -*-
    13. import requests
    14. import threading #多线程模块
    15. import re #正则表达式模块
    16. import time #时间模块
    17. import os #目录操作模块
    18. """================= 01.获取所有的翻页链接 ================="""
    19. # 所有分页url
    20. all_urls = ['https://www.mzitu.com/xinggan/']
    21. class Spider(object):
    22. def __init__(self, target_url, headers):
    23. self.target_url = target_url
    24. self.headers = headers
    25. # 获取所有的分页url
    26. def getUrls(self, start_page, page_num):
    27. # 循环得到分页的url
    28. for page in range(start_page, page_num):
    29. url = self.target_url %page
    30. all_urls.append(url)
    31. print(all_urls)
    32. print('所有分页链接获取完毕*********')
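# Illustration (a hedged sketch, never called by the crawler): getUrls() just
# substitutes page numbers into the '%d' slot of target_url, so getUrls(2, 5)
# would append the page-2 through page-4 links. The helper name below is made
# up for demonstration only.
def _demo_page_urls(template='https://www.mzitu.com/xinggan/page/%d/', start=2, stop=5):
    # Build the same list of paging links that getUrls(start, stop) appends.
    return [template % page for page in range(start, stop)]
# _demo_page_urls()[0] == 'https://www.mzitu.com/xinggan/page/2/'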
    33. """================= 02.获取所有单页的图片链接 ================="""
    34. # 图片列表页面
    35. all_img_urls = []
    36. # 初始化互斥锁
    37. g_lock = threading.Lock()
    38. # 生产者:负责从每个页面获取当前页面的图片链接
    39. class Producer(threading.Thread):
    40. # 重写run()方法
    41. def run(self):
    42. headers = {
    43. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
    44. 'referer': 'https://www.mzitu.com/'
    45. }
    46. global all_urls
    47. while len(all_urls) > 0:
    48. print(len(all_urls))
    49. # 加锁
    50. g_lock.acquire()
    51. # 通过pop方法移除最后一个元素,并且返回该值
    52. page_url = all_urls.pop()
    53. # 释放锁
    54. g_lock.release()
    55. try:
    56. print("分析:" + page_url)
    57. response = requests.get(page_url, headers=headers)
    58. # print(len(response.text))
    59. all_pic_link = re.findall('<li><a href="(.*?)" target="_blank"><img class=', response.text, re.S)
    60. # 这里用数组拼接所以要声明全局变量,如果用.append则不需要
    61. global all_img_urls
    62. # 加锁
    63. g_lock.acquire()
    64. # 这里直接将两个数组拼接,获取单页所有图片的链接
    65. all_img_urls += all_pic_link
    66. # 释放锁
    67. g_lock.release()
    68. except:
    69. pass
    70. print('已获取当前分页的组图链接' + str(all_img_urls))
    71. print('02-等等吧,不然又要被封IP了......................')
    72. time.sleep(2)
    73. print('02.所有单页组图链接获取完毕***********')
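# Note: threading.Lock is also a context manager, so every acquire()/release()
# pair above can be written as a 'with' block that releases the lock even if
# the code in between raises. A hedged sketch of the pop-under-lock step
# (the helper name is made up for demonstration):
def _pop_under_lock(urls):
    # Pop one URL while holding g_lock; return None when the list is empty.
    with g_lock:
        return urls.pop() if urls else None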
    74. """================= 03.获取每一个图册里所以目标图片的链接 ================="""
    75. # 目标图片的链接
    76. pic_links = []
    77. # 消费者
    78. class Consumer(threading.Thread):
    79. # 重写run()方法
    80. def run(self):
    81. headers = {
    82. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
    83. 'referer': 'https://www.mzitu.com/'
    84. }
    85. # 调用全局的图片详情页面链接数组
    86. global all_img_urls
    87. print('%s 正在运行' %threading.current_thread)
    88. while len(all_img_urls) > 0:
    89. # 加锁
    90. g_lock.acquire()
    91. img_url = all_img_urls.pop()
    92. # 释放锁
    93. g_lock.release()
    94. try:
    95. response = requests.get(img_url, headers=headers)
    96. # 由于我们调用的页面编码是GB2312,所以需要设置一下编码
    97. # response.encoding = 'gb2312'
    98. title = re.search('<h2 class="main-title">(.*?)</h2>', response.text).group(1)
    99. all_pic_src = re.findall('<img class="blur" src="(.*?)"', response.text)
    100. print(title, all_pic_src)
    101. # 创建图片字典
    102. pic_dict = {title : all_pic_src}
    103. # 加锁
    104. g_lock.acquire()
    105. pic_links.append(pic_dict)
    106. print(title + '获取成功')
    107. # 释放锁
    108. g_lock.release()
    109. except:
    110. pass
    111. print('03-等等吧,不然又要被封IP了......................')
    112. time.sleep(2)
    113. print('03.所有组图详情标题和图片链接获取完毕*********')
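# Illustration: each entry appended to pic_links maps one gallery title to a
# list of image URLs. A hedged sketch of the parsing step on a made-up HTML
# snippet (the helper name is for demonstration only):
def _demo_parse_gallery(html):
    # Mirrors the re.search()/re.findall() calls in Consumer.run().
    title = re.search('<h2 class="main-title">(.*?)</h2>', html).group(1)
    srcs = re.findall('<img class="blur" src="(.*?)"', html)
    return {title: srcs}
# _demo_parse_gallery('<h2 class="main-title">demo</h2><img class="blur" src="a.jpg">')
# -> {'demo': ['a.jpg']}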
    114. """================= 04.储存目标图片 ================="""
    115. class DownPic(threading.Thread):
    116. # 重写run()方法
    117. def run(self):
    118. headers = {
    119. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/',
    120. 'referer': 'https://www.mzitu.com/'
    121. }
    122. i = 0
    123. while True:
    124. i += 1
    125. global pic_links
    126. # 上锁
    127. g_lock.acquire()
    128. # 如果没有图片了就解锁
    129. if len(pic_links) == 0:
    130. g_lock.release()
    131. continue
    132. else:
    133. pic = pic_links.pop()
    134. g_lock.release()
    135. # 遍历图片字典,key:title; values:图片链接
    136. for key, values in pic.items():
    137. # path = key.rstrip('\\')
    138. path = ('@美女图片')
    139. # os.path.exists(path)判断路径为path的文件是否存在
    140. is_exists = os.path.exists(path)
    141. # 判断结果
    142. if not is_exists:
    143. # 如果不存在则创建目录
    144. os.makedirs(path)
    145. print('{}目录创建成功'.format(path))
    146. else:
    147. # 如果目录存在则不创建,并提示目录已存在
    148. print(path + '目录已存在')
    149. # ===================================
    150. for pic in values:
    151. pic_name = key + '.jpg'
    152. # 设置图片的名字:路径 + 图片名
    153. filename = path + '/' + pic_name
    154. if os.path.exists(filename):
    155. continue
    156. else:
    157. try:
    158. response = requests.get(pic, headers=headers)
    159. with open(filename, 'wb') as f:
    160. f.write(response.content)
    161. # Exception可以将所有的异常包括在内,并将异常赋予变量e
    162. except Exception as e:
    163. print(e)
    164. print('04-等等吧,不然又要被封IP了......................')
    165. time.sleep(2)
    166. print('图片下载中*************')
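# Note: since Python 3.2, os.makedirs(path, exist_ok=True) collapses the
# exists()/makedirs() check above into one call. A hedged sketch of the save
# step using that shortcut (the names below are placeholders, not the crawler's):
def _demo_save(url, filename, headers=None):
    # Make sure the target directory exists, then write the raw bytes.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    response = requests.get(url, headers=headers)
    with open(filename, 'wb') as f:
        f.write(response.content)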
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
        'referer': 'https://www.mzitu.com/'
    }
    target_url = 'https://www.mzitu.com/xinggan/page/%d/'
    spider = Spider(target_url, headers)
    spider.getUrls(2, 3)
    # print(all_urls)
    all_urls.reverse()
    threads = []
    # start = time.time()
    # Create 2 threads to visit the paging URLs
    for x in range(2):
        p = Producer()
        p.start()
        threads.append(p)
    # threads collects the producers; join() makes the main thread wait
    # until every producer has finished
    for pp in threads:
        pp.join()
    # end = time.time()
    # print('Total time:', end - start)
    # Create 10 threads to collect the image links inside each gallery
    consumers = []
    for x in range(10):
        c = Consumer()
        c.start()
        consumers.append(c)
    # Join the consumers too, so DownPic does not mistake a momentarily
    # empty pic_links for "all done"
    for cc in consumers:
        cc.join()
    # Create 10 threads to fetch and save each gallery's images
    for x in range(10):
        d = DownPic()
        d.start()
    print('Program finished')
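# Design note: the shared-list-plus-g_lock pattern used throughout is a
# hand-rolled producer/consumer hand-off; the standard library's queue.Queue
# does the same locking internally. A hedged sketch of the equivalent idea
# (illustrative only, not wired into the crawler):
import queue

def _demo_queue_handoff():
    # put() and get() are internally synchronized; no explicit Lock needed.
    q = queue.Queue()
    q.put('https://www.mzitu.com/xinggan/')  # producer side
    return q.get()                           # consumer side; blocks when empty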