import re
import threading
from queue import Empty, Queue

import requests
from fake_useragent import UserAgent
from lxml import etree
# 获取url列表
# Build the list of listing-page URLs to scrape.
def get_urls(page):
    """Return a bounded Queue holding the listing-page URLs for pages 1..page.

    The queue's maxsize equals ``page`` so it exactly fits one URL per page.
    """
    pending = Queue(page)
    page_no = 1
    while page_no <= page:
        pending.put(f'https://www.fabiaoqing.com/biaoqing/lists/page/{page_no}.html')
        page_no += 1
    return pending
# 获取图片的地址
# Worker: scrape image metadata from listing pages.
def get_imgurl(url_queue, imgs_queue):
    """Drain ``url_queue`` and push one {'title', 'src'} dict per image onto ``imgs_queue``.

    Bug fixes vs. the original:
    * the original called ``get()`` *before* checking ``empty()``, which can
      block a thread forever on an exhausted queue;
    * it processed only ONE url per thread (no loop), so with 5 worker
      threads any page beyond the 5th was silently never scraped.
    Using ``get_nowait()`` + ``Empty`` also closes the check-then-get race
    between concurrent workers.
    """
    # One random UA per worker is enough; hoisted out of the loop.
    headers = {
        'User-Agent': UserAgent().random
    }
    while True:
        try:
            url = url_queue.get_nowait()
        except Empty:
            return  # queue drained — this worker is done
        html = requests.get(url=url, headers=headers)
        xml = etree.HTML(html.text)
        # Lazy-loaded <img> tags carry the real URL in data-original.
        for img in xml.xpath('//img[@class="ui image lazy"]'):
            imgs_queue.put({
                'title': img.xpath('./@title')[0],
                'src': img.xpath('./@data-original')[0],
            })
# 保存数据
# Worker: download each image and write it under ./static/.
def save_data(imgs_queue):
    """Drain ``imgs_queue``, downloading each image to ``./static/<title>.<ext>``.

    Bug fix vs. the original: the ``while not empty(): get()`` pair races
    between the 5 downloader threads — a thread that loses the race blocks
    forever inside the default blocking ``get()``. ``get_nowait()`` plus
    ``except Empty`` makes draining safe under concurrency.

    NOTE(review): assumes the ``./static/`` directory already exists —
    confirm, or create it in the caller before starting workers.
    """
    while True:
        try:
            item = imgs_queue.get_nowait()
        except Empty:
            return  # queue drained — this worker is done
        # Keep the source file's extension; strip characters that are
        # illegal (or awkward) in filenames.
        name = item['title'] + '.' + item['src'].split('.')[-1]
        name = re.sub(r'[\\/:*?"<>!]', '', name)
        print(name)
        data = requests.get(item['src']).content
        with open('./static/' + name, 'wb') as file:
            file.write(data)
if __name__ == '__main__':
    # Prompt (Chinese): "enter the number of pages to scrape".
    page = int(input('请输入要爬取的页数'))
    url_queue = get_urls(page)
    # Sized assuming ~45 images per listing page.
    # NOTE(review): if a page ever holds more, producers could block on a
    # full queue before any consumer starts — confirm against the site.
    imgs_queue = Queue(45 * page)
    # Phase 1: scrape image URLs with 5 worker threads.
    scraper_threads = []
    for _ in range(5):
        t = threading.Thread(target=get_imgurl, args=(url_queue, imgs_queue))
        t.start()
        scraper_threads.append(t)
    # Wait for all scrapers before starting downloads.
    for t in scraper_threads:
        t.join()
    # Phase 2: download the images with 5 worker threads.
    # Bug fix: the original never joined the downloaders; join them so the
    # script ends deterministically once every image is written.
    download_threads = []
    for _ in range(5):
        t = threading.Thread(target=save_data, args=(imgs_queue,))
        t.start()
        download_threads.append(t)
    for t in download_threads:
        t.join()