import requests
from bs4 import BeautifulSoup
import os
from hashlib import md5
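
# Crawl list pages from mzitu.com, extract the lazy-loaded thumbnail URL
# (data-original) of each gallery card, and save the images to a local
# folder, named by the MD5 of their content to avoid duplicate files.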
class meizi(object):
    def __init__(self):
        self.baseURL = 'https://www.mzitu.com/tag/youhuo/page/{}/'
        self.headers = {
            'If-None-Match': 'W/"5cc2cd8f-2c58"',
            'Referer': 'http://www.mzitu.com/all/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        self.page = '2'

    def getPage(self):
        # Read the pagination bar of a list page to find the total number of
        # pages (the second-to-last '.page-numbers' element).
        url = self.baseURL.format(self.page)
        data = self.getRequest(url)
        soup = BeautifulSoup(data, 'lxml')
        self.page = soup.select('.page-numbers')[-2].get_text()
        print(self.page)

    def getRequest(self, url):
        data = requests.get(url, headers=self.headers).content.decode()
        return data

    def getURL(self):
        urlList = []
        for i in range(1, int(self.page) + 1):
            urlList.append(self.baseURL.format(str(i)))
        return urlList

    def xpath(self, data):
        # Parse a list page and download the thumbnail of every gallery item.
        soup = BeautifulSoup(data, 'lxml')
        items = soup.select('#pins li')
        for item in items:
            data_original = item.select_one('img').get('data-original')
            self.saveImage(data_original)

    def saveImage(self, imageURL, filePath='img'):
        response = requests.get(imageURL, headers=self.headers)
        if response.status_code == 200:
            data = response.content
            try:
                if not os.path.exists(filePath):
                    print('Folder', filePath, 'does not exist, creating it')
                    os.makedirs(filePath)
                # Get the image file extension
                file_suffix = os.path.splitext(imageURL)[1]
                # Build the file name (including the path); the MD5 of the
                # image content is used as the name to avoid duplicates
                filename = '{}/{}{}'.format(filePath, md5(data).hexdigest(),
                                            file_suffix)
                with open(filename, 'wb') as f:
                    f.write(data)
            except IOError as e:
                print('File operation failed:', e)
            except Exception as e:
                print('Error:', e)

    def startRun(self):
        # self.getPage()
        urlList = self.getURL()
        for url in urlList:
            print(url)
            data = self.getRequest(url)
            self.xpath(data)


if __name__ == '__main__':
    meizi().startRun()