import os
import re
import threading
from queue import Queue

import requests
from lxml import etree
from fake_useragent import UserAgent
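# requests, lxml and fake_useragent are third-party packages:
#   pip install requests lxml fake-useragent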
# Build the queue of list-page URLs to crawl
def get_urls(page):
    # A thread-safe queue holds the URLs for the worker threads
    url_queue = Queue(page)
    for i in range(1, page + 1):
        url = f'https://www.fabiaoqing.com/biaoqing/lists/page/{i}.html'
        url_queue.put(url)
    return url_queue
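# For example, get_urls(2) queues
#   https://www.fabiaoqing.com/biaoqing/lists/page/1.html
#   https://www.fabiaoqing.com/biaoqing/lists/page/2.html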
# Collect image titles and addresses from each list page
def get_imgurl(url_queue, imgs_queue):
    # Keep pulling URLs until the page queue is drained, so each
    # thread handles as many pages as it can get
    while not url_queue.empty():
        url = url_queue.get()
        headers = {
            # A random User-Agent per request, to look less like a bot
            'User-Agent': UserAgent().random
        }
        html = requests.get(url=url, headers=headers)
        xml = etree.HTML(html.text)
        img_list = xml.xpath('//img[@class="ui image lazy"]')
        for img in img_list:
            item = {}
            item['title'] = img.xpath('./@title')[0]
            # The page lazy-loads images, so the real address is in the
            # data-original attribute rather than src
            item['src'] = img.xpath('./@data-original')[0]
            imgs_queue.put(item)
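# Each queued item has the shape {'title': <image title>, 'src': <image URL>},
# which is everything save_data needs to name and fetch the file.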
# Download the images and save them to disk
def save_data(imgs_queue):
    while not imgs_queue.empty():
        item = imgs_queue.get()
        name = item['title'] + '.' + item['src'].split('.')[-1]
        # Strip characters that are not allowed in file names
        name = re.sub(r'[\\/:*?"<>!]', '', name)
        print(name)
        data = requests.get(item['src']).content
        with open('./static/' + name, 'wb') as file:
            file.write(data)
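# Note: the empty()/get() pattern used by both worker functions can race when
# several threads run at once; a stricter variant would call get_nowait()
# inside a try/except queue.Empty block so a thread that loses the race
# exits cleanly instead of blocking on an empty queue.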
if __name__ == '__main__':
    page = int(input('Please enter the number of pages to crawl: '))
    # Make sure the output directory exists before any thread writes to it
    os.makedirs('./static', exist_ok=True)
    url_queue = get_urls(page)
    # The result queue is sized for 45 images per list page
    imgs_queue = Queue(45 * page)
    t_list = []
    # Start the threads that collect image addresses
    for i in range(5):
        t = threading.Thread(target=get_imgurl, args=(url_queue, imgs_queue))
        t.start()
        t_list.append(t)
    # Wait for the collector threads to finish
    for t in t_list:
        t.join()
    # print(imgs_queue.qsize())
    # Start the threads that download the images
    for i in range(5):
        t = threading.Thread(target=save_data, args=(imgs_queue,))
        t.start()
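# A minimal alternative sketch, assuming Python 3.2+: concurrent.futures can
# replace the manual thread bookkeeping above, e.g.
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=5) as pool:
#       for _ in range(5):
#           pool.submit(get_imgurl, url_queue, imgs_queue)
# The with-block joins the workers automatically when it exits.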