- 爬取斗图啦网站表情包:https://www.doutula.com/photo/list/?page=1
- 两个队列,一个用来存需要请求的url,一个用来存获取到的数据,多个值要存的话可以使用元组或列表;
- run方法中结合while True循环来使用,只要队列不空,线程就不停止;
- 使用for循环创建多个线程;

```
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

class Producer(threading.Thread):
    """Worker thread that turns listing-page URLs into image download jobs.

    Each producer pulls page URLs from ``page_queue``, downloads and parses
    the page, and pushes ``[img_url, filename]`` pairs onto ``img_queue``.
    The thread finishes as soon as ``page_queue`` has no URL left.
    """

    # Characters stripped from an image's ``alt`` text before it is used
    # as a file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    # Only the first images of every listing page are queued.
    MAX_IMAGES_PER_PAGE = 6
    # Seconds to wait for the site before giving up on a page; without a
    # timeout a stalled connection would block the thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, page_queue, img_queue):
        """Store the shared queues and build the request headers.

        Args:
            page_queue: queue of listing-page URLs still to be fetched.
            img_queue: queue receiving ``[img_url, filename]`` pairs.
        """
        super().__init__()
        # NOTE(review): the Cookie value is a hard-coded browser session
        # copied verbatim; it is presumably expired -- confirm whether the
        # site needs it at all.
        self.headers = {
            'Host': 'www.doutula.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            'Cookie': (
                'Hm_lvt_2fc12699c699441729d4b335ce117f40 = 1589614436;'
                '_agep = 1589614437;'
                '_agfp = 94fe8c7b8365e77f010428ecce544b8f;'
                '_agtk = 8582218b02a79d1ffb5747bdc5e52ae6;'
                'XSRF - TOKEN = eyJpdiI6IldOVnI0S3hnRlZEZUY0aENlZkFWNXc9PSIsInZhbHVlIjoicjBYdVJqNnQxVVpDT3lzdkhSV01HeWFHaXVKZDBick1WUlFsbmVWbXpxbkljUmlKdmFkNDBUZDd0NXJxUVl0ZSIsIm1hYyI6IjY3N2JmZjQzY2ZhODE0ZTAxYWVmODJhMzg2NDMzYTAyZmI3ODgwNWFhMWY1OWI0MWQyYzRjY2FlNzVjMDViMmUifQ%3D%3D;'
                'doutula_session = eyJpdiI6Img1QWtpM0ZzVGJWa2MzcmkrcWdrQmc9PSIsInZhbHVlIjoiRjNPeUFPdGcrcjRlOXpzd1wvaWgzc0hVRlBJMlA5TU5EUkozc0xNKzFxZG5YcVZLcmV3REFUWmlCV0NlNWZDNUIiLCJtYWMiOiIzMTIzY2NiNWZmNGNkZGE1MzE0YjM4N2U1NmE0NTY0MWM5MmJkODgxM2FiZjk3OTdjMWQ4M2JiMmExY2QzMDQ1In0%3D;'
                'Hm_lpvt_2fc12699c699441729d4b335ce117f40 = 1589614506'
            ),
        }
        self.page_url_queue = page_queue
        self.img_queue = img_queue

    @classmethod
    def _build_filename(cls, img_name):
        """Return the ``img/<name>.jpg`` path for an image's ``alt`` text."""
        # str() keeps a missing alt attribute (None) usable as a name.
        safe_name = cls._UNSAFE_FILENAME_CHARS.sub("", str(img_name))
        return "img/" + safe_name + ".jpg"

    def parse_data(self, url):
        """Fetch one listing page and queue its first images for download.

        Args:
            url: address of the listing page to fetch.
        """
        try:
            res = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            # One unreachable page must not kill the whole worker thread.
            print("failed to fetch", url, exc)
            return
        html = etree.HTML(res.text)
        if html is None:
            # lxml yields None when the response body has no parsable markup.
            return
        imgs = html.xpath("//div[@class='page-content text-center']//img")
        for img in imgs[:self.MAX_IMAGES_PER_PAGE]:
            img_url = img.get("data-original")
            filename = self._build_filename(img.get("alt"))
            self.img_queue.put([img_url, filename])
            print(img_url)

    def run(self):
        """Process page URLs until the page queue is drained."""
        while True:
            try:
                # Non-blocking get: checking empty() and then calling a
                # blocking get() can hang when another producer takes the
                # last URL in between.
                url = self.page_url_queue.get_nowait()
            except Empty:
                break
            self.parse_data(url)


class Consumer(threading.Thread):
    """Worker thread that downloads the images queued on ``img_queue``.

    Each item taken from ``img_queue`` is an ``[img_url, filename]`` pair.
    Items whose URL is empty are skipped; the others are saved with
    ``urllib.request.urlretrieve``.
    """

    def __init__(self, page_queue, img_queue, idle_timeout=5.0):
        """Store the shared queues.

        Args:
            page_queue: queue of listing-page URLs; only inspected to decide
                whether more images may still arrive.
            img_queue: queue of ``[img_url, filename]`` pairs to download.
            idle_timeout: seconds to wait for a new image before checking
                whether the work is finished.
        """
        super().__init__()
        self.img_queue = img_queue
        self.page_queue = page_queue
        self.idle_timeout = idle_timeout

    def run(self):
        """Download queued images until both queues have run dry."""
        while True:
            try:
                # A timed get avoids the empty()-then-get() race in which
                # another consumer takes the last item and this thread
                # blocks forever.
                img_url, filename = self.img_queue.get(
                    timeout=self.idle_timeout
                )
            except Empty:
                if self.page_queue.empty():
                    # No image arrived for idle_timeout seconds and no page
                    # is waiting to be parsed: treat the work as done.
                    print("保存完毕")
                    break
                # Pages are still pending, so images may still arrive.
                continue
            if not img_url:
                continue
            print(img_url, filename)
            try:
                request.urlretrieve(img_url, filename)
            except (OSError, ValueError) as exc:
                # URLError derives from OSError; ValueError is raised for a
                # malformed URL. A single bad image must not stop the thread.
                print("failed to download", img_url, exc)


def main(page_count=30, worker_count=5):
    """Crawl the listing pages and download their images with worker threads.

    Args:
        page_count: number of listing pages to crawl, starting at page 1.
        worker_count: number of producer threads and of consumer threads.
    """
    url = "https://www.doutula.com/photo/list/?page={}"
    # All page URLs are queued before any worker starts, so the queue must
    # be able to hold every one of them or put() would block forever.
    page_queue = Queue(max(100, page_count))
    img_queue = Queue(10000)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("img", exist_ok=True)

    for page in range(1, page_count + 1):
        page_queue.put(url.format(page))

    workers = []
    for _ in range(worker_count):
        producer = Producer(page_queue, img_queue)
        consumer = Consumer(page_queue, img_queue)
        producer.start()
        consumer.start()
        workers.extend((producer, consumer))

    # Return only once every worker thread has finished.
    for worker in workers:
        worker.join()


if __name__ == "__main__":
    main()
```