• 爬取斗图啦网站表情包:https://www.doutula.com/photo/list/?page=1
    • 两个队列,一个用来存需要请求的url,一个用来存获取到的数据;需要存多个值时可以使用元组或列表;
    • run方法中结合while True循环来使用,只要队列不空,线程就不停止;
    • 使用for循环创建多个线程;

    ```python
    import requests
    import threading
    import os
    import re
    from urllib import request
    from lxml import etree
    from queue import Queue

    class Producer(threading.Thread):
        """Thread that turns listing-page URLs into image download jobs.

        Each producer takes page URLs from ``page_queue``, fetches and parses
        the page, and puts ``[img_url, filename]`` pairs on ``img_queue``.
        The thread ends as soon as ``page_queue`` has no more URLs.
        """

        # Characters removed from an image's ``alt`` text before it is used
        # as a file name.
        _ILLEGAL_FILENAME_CHARS = re.compile(r"[\/\\\:\*\?\"\<\>\|]")

        # Only the first six <img> elements of each page are considered.
        _MAX_IMAGES_PER_PAGE = 6

        # Seconds to wait for the site before giving up on a page.
        _REQUEST_TIMEOUT = 10

        def __init__(self, page_queue, img_queue):
            """Create a producer.

            Args:
                page_queue: queue of listing-page URLs still to be fetched.
                img_queue: queue that receives ``[img_url, filename]`` jobs.
            """
            super().__init__()
            # NOTE(review): the Cookie below is a hard-coded browser session
            # value; presumably it has expired -- confirm it is still needed.
            self.headers = {
                'Host': 'www.doutula.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
                'Cookie': '''Hm_lvt_2fc12699c699441729d4b335ce117f40 = 1589614436;_agep = 1589614437;_agfp = 94fe8c7b8365e77f010428ecce544b8f;_agtk = 8582218b02a79d1ffb5747bdc5e52ae6;XSRF - TOKEN = eyJpdiI6IldOVnI0S3hnRlZEZUY0aENlZkFWNXc9PSIsInZhbHVlIjoicjBYdVJqNnQxVVpDT3lzdkhSV01HeWFHaXVKZDBick1WUlFsbmVWbXpxbkljUmlKdmFkNDBUZDd0NXJxUVl0ZSIsIm1hYyI6IjY3N2JmZjQzY2ZhODE0ZTAxYWVmODJhMzg2NDMzYTAyZmI3ODgwNWFhMWY1OWI0MWQyYzRjY2FlNzVjMDViMmUifQ%3D%3D;doutula_session = eyJpdiI6Img1QWtpM0ZzVGJWa2MzcmkrcWdrQmc9PSIsInZhbHVlIjoiRjNPeUFPdGcrcjRlOXpzd1wvaWgzc0hVRlBJMlA5TU5EUkozc0xNKzFxZG5YcVZLcmV3REFUWmlCV0NlNWZDNUIiLCJtYWMiOiIzMTIzY2NiNWZmNGNkZGE1MzE0YjM4N2U1NmE0NTY0MWM5MmJkODgxM2FiZjk3OTdjMWQ4M2JiMmExY2QzMDQ1In0%3D;Hm_lpvt_2fc12699c699441729d4b335ce117f40 = 1589614506''',
            }
            self.page_url_queue = page_queue
            self.img_queue = img_queue

        def parse_data(self, url):
            """Fetch one listing page and enqueue its first images.

            Args:
                url: address of the listing page to download.

            Raises:
                requests.RequestException: if the page cannot be fetched
                    (including when the request times out).
            """
            res = requests.get(
                url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
            )
            html = etree.HTML(res.text)
            imgs = html.xpath("//div[@class='page-content text-center']//img")
            for img in imgs[:self._MAX_IMAGES_PER_PAGE]:
                img_url = img.get("data-original")
                if not img_url:
                    # Without a "data-original" attribute there is nothing to
                    # download, so no job is queued for this element.
                    continue
                img_name = img.get("alt")
                new_name = self._ILLEGAL_FILENAME_CHARS.sub("", str(img_name))
                filename = "img/" + new_name + ".jpg"
                self.img_queue.put([img_url, filename])
                print(img_url)

        def run(self):
            """Process page URLs until the page queue is exhausted."""
            from queue import Empty

            while True:
                # get_nowait() instead of empty() + get(): another producer
                # may take the last URL between the check and the get, which
                # would leave this thread blocked forever.
                try:
                    url = self.page_url_queue.get_nowait()
                except Empty:
                    break
                try:
                    self.parse_data(url)
                except requests.RequestException as exc:
                    # One failing page must not end the whole thread.
                    print("failed to fetch", url, exc)

    class Consumer(threading.Thread):
        """Thread that downloads the images queued on ``img_queue``.

        Each job is an ``(img_url, filename)`` pair; the image at ``img_url``
        is saved to ``filename`` with ``urllib.request.urlretrieve``.
        """

        def __init__(self, page_queue, img_queue, *, idle_timeout=3.0):
            """Create a consumer.

            Args:
                page_queue: queue of page URLs still waiting to be fetched;
                    only inspected to decide when to stop.
                img_queue: queue of ``(img_url, filename)`` download jobs.
                idle_timeout: seconds to wait for a new job before checking
                    whether the work is finished.
            """
            super().__init__()
            self.img_queue = img_queue
            self.page_queue = page_queue
            self.idle_timeout = idle_timeout

        def run(self):
            """Download queued images until no work is left.

            The thread stops once no job has arrived for ``idle_timeout``
            seconds and ``page_queue`` is empty.
            """
            from queue import Empty

            while True:
                # A timed get() replaces the empty()-then-get() sequence:
                # another consumer could take the last job between the two
                # calls and leave this thread blocked forever.  Waiting also
                # gives a page that is still being parsed time to deliver
                # its images.
                try:
                    img_url, filename = self.img_queue.get(
                        timeout=self.idle_timeout
                    )
                except Empty:
                    if self.page_queue.empty():
                        print("保存完毕")
                        break
                    continue
                if not img_url:
                    continue
                print(img_url, filename)
                try:
                    request.urlretrieve(img_url, filename)
                except (OSError, ValueError) as exc:
                    # URLError is an OSError; ValueError covers malformed
                    # URLs.  A single bad image must not end the thread.
                    print("failed to download", img_url, exc)
    

    def main():
        """Queue 30 listing pages and download their images with 5+5 threads.

        Creates the ``img`` output directory if needed, fills the page queue
        with pages 1 to 30, starts five Producer and five Consumer threads,
        and returns once all of them have finished.
        """
        url = "https://www.doutula.com/photo/list/?page={}"
        page_queue = Queue(100)
        img_queue = Queue(10000)

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs("img", exist_ok=True)

        for page in range(1, 31):
            page_queue.put(url.format(page))

        workers = []
        for _ in range(5):
            producer = Producer(page_queue, img_queue)
            consumer = Consumer(page_queue, img_queue)
            producer.start()
            consumer.start()
            workers.extend((producer, consumer))

        # Wait for every thread so that main() returns only when all pages
        # have been handled and the consumers have stopped.
        for worker in workers:
            worker.join()
    

    if name == ‘main‘: main() ```