import os
import re
import threading
from queue import Queue, Empty

import requests
from fake_useragent import UserAgent
from lxml import etree


# Build the queue of list-page URLs to crawl
def get_urls(page):
    url_queue = Queue(page)
    for i in range(1, page + 1):
        url = f'https://www.fabiaoqing.com/biaoqing/lists/page/{i}.html'
        url_queue.put(url)
    return url_queue


# Worker: pull list pages off the queue and collect each image's title and address
def get_imgurl(url_queue, imgs_queue):
    while True:
        try:
            # Non-blocking get so idle threads exit once the queue is drained
            url = url_queue.get_nowait()
        except Empty:
            break
        headers = {
            'User-Agent': UserAgent().random
        }
        html = requests.get(url=url, headers=headers)
        xml = etree.HTML(html.text)
        img_list = xml.xpath('//img[@class="ui image lazy"]')
        for img in img_list:
            item = {
                'title': img.xpath('./@title')[0],
                'src': img.xpath('./@data-original')[0],
            }
            imgs_queue.put(item)


# Worker: download each image and save it under ./static/
def save_data(imgs_queue):
    while not imgs_queue.empty():
        item = imgs_queue.get()
        name = item['title'] + '.' + item['src'].split('.')[-1]
        # Strip characters that are illegal in file names
        name = re.sub(r'[\\/:*?"<>|!]', '', name)
        print(name)
        data = requests.get(item['src']).content
        with open('./static/' + name, 'wb') as file:
            file.write(data)


if __name__ == '__main__':
    page = int(input('Enter the number of pages to crawl: '))
    # Make sure the output directory exists before the download threads write to it
    os.makedirs('./static', exist_ok=True)
    url_queue = get_urls(page)
    imgs_queue = Queue(45 * page)  # each list page holds 45 images
    t_list = []
    # Start the threads that collect image URLs
    for i in range(5):
        t = threading.Thread(target=get_imgurl, args=(url_queue, imgs_queue))
        t.start()
        t_list.append(t)
    # Wait for the collector threads to finish
    for t in t_list:
        t.join()
    # Start the download threads
    for i in range(5):
        t = threading.Thread(target=save_data, args=(imgs_queue,))
        t.start()