方法一:使用requests和lxml以及re
import json
import os
import re

import requests
from lxml import etree


class GetVideo:
    """Download every video listed on one pearvideo.com category page.

    Typical use is ``GetVideo(url).getVideoUrl()``, which scrapes the page,
    resolves a downloadable address for each video and saves the files
    under ``./videos``.
    """

    # Seconds to wait on any single HTTP request before giving up, so one
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Remember the category page address and start with empty results.

        Args:
            url: Address of the category page to scrape.
        """
        self.url = url
        # Kept per instance (not on the class) so separate GetVideo objects
        # never share or pollute each other's results.
        self.videoImages = []
        self.videoTitles = []
        self.videoUrls = []
        self.videoIds = []

    def getVideoUrl(self):
        """Scrape the category page, then download every video found.

        Fills ``videoIds``, ``videoUrls``, ``videoImages`` and
        ``videoTitles`` from scratch on each call and finishes by calling
        ``downVideo()``. Returns ``None``.
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.text)
        if html is None:
            print('未找到视频列表')
            return

        # Video ids, e.g. "video_1749535". The queries are document-wide,
        # which is what the original "//"-prefixed XPath evaluated to.
        self.videoIds = html.xpath(
            '//div[@class="vervideo-bd"]/a[@class="vervideo-lilink actplay"]/@href'
        )
        if not self.videoIds:
            print('未找到视频列表')
            return

        # One entry per id, in the same order; an entry is None when the
        # address could not be resolved, so positions stay aligned with
        # videoTitles.
        self.videoUrls = [self.getVideoLink(videoId) for videoId in self.videoIds]

        # Cover images: the address sits inside "url(...)" in the inline
        # style attribute.
        self.videoImages = []
        for style in html.xpath(
            '//div[@class="vervideo-img"]/div/div[@class="img"]/@style'
        ):
            found = re.search(r'\((.*?)\)', style)
            if found:
                self.videoImages.append(found.group(1))

        # Video titles.
        self.videoTitles = html.xpath('//div[@class="vervideo-title"]/text()')

        print('视频正在下载中....')
        self.downVideo()

    def getVideoLink(self, videoId):
        """Resolve the playable mp4 address for one video.

        The status endpoint reports an address whose file name starts with
        a timestamp, e.g.
        ``.../mp4/third/20211229/1641655664960-10054243-161801-hd.mp4``;
        that address does not play. Replacing the timestamp with
        ``cont-<contId>`` gives the working address, e.g.
        ``.../mp4/third/20220107/cont-1749535-10054243-154532-hd.mp4``.

        Args:
            videoId: Id as found in the page, e.g. ``"video_1749535"``.

        Returns:
            The rewritten address, or ``None`` when the endpoint does not
            answer with status 200 or its JSON lacks the expected fields.
        """
        contId = videoId.split('_')[-1]
        url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
        headers = {
            'Referer': f'https://www.pearvideo.com/{videoId}',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
        }
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        try:
            videoLink = json.loads(response.text)['videoInfo']['videos']['srcUrl']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the video information.
            return None

        # Leading segment of the file name: the part that must be replaced.
        resplaceTxt = videoLink.split('/')[-1].split('-')[0]
        return videoLink.replace(resplaceTxt, f'cont-{contId}')

    def downVideo(self):
        """Download every resolved video into ``./videos``.

        Titles and addresses are paired by position. Entries without an
        address, and downloads that do not answer with status 200, are
        skipped instead of aborting the run.
        """
        os.makedirs('./videos', exist_ok=True)
        for title, link in zip(self.videoTitles, self.videoUrls):
            if not link:
                continue
            print(f'正在下载视频:{title}')
            videoContent = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            if videoContent.status_code != 200:
                # Do not save an error page as if it were a video.
                print(f'下载失败,已跳过:{title}')
                continue
            with open(f'./videos/{self._safeFileName(title)}.mp4', 'wb') as file:
                file.write(videoContent.content)
        print('恭喜你已爬完该页视频')

    @staticmethod
    def _safeFileName(title):
        """Turn a video title into a name that is safe to use as a file name."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip(' .')
        return cleaned or 'untitled'


if __name__ == '__main__':
    url = 'https://www.pearvideo.com/category_6'
    obj = GetVideo(url)
    obj.getVideoUrl()
方法二:使用requests_html
import json
import os
import re

from requests_html import HTMLSession, UserAgent


class GetVideo:
    """Download every video listed on one pearvideo.com category page.

    Uses a single ``requests_html`` session for all requests. Typical use
    is ``GetVideo(url).getVideoUrl()``; files are saved under ``./videos``.
    """

    # Seconds to wait on any single HTTP request before giving up, so one
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Store the page address and prepare the session.

        Args:
            url: Address of the category page to scrape.
        """
        self.url = url
        self.headers = {'User-Agent': UserAgent().random}
        self.session = HTMLSession()

    def getVideoUrl(self):
        """Fetch the category page and download each video it lists.

        Every ``<li class="categoryem">`` fragment in the page is scanned
        with regular expressions for the video id and title. Fragments
        missing either, and videos whose address cannot be resolved, are
        skipped. Returns ``None``.
        """
        # Request the page given to the constructor (not a module-level
        # name) and send the headers prepared there.
        response = self.session.get(
            self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            print(f'页面请求失败,状态码:{response.status_code}')
            return

        html = response.html
        print('视频正在下载中....')

        # NOTE(review): the search covers the whole page; narrow it to one
        # list element if only part of the page is wanted.
        allLi = html.search_all('<li class="categoryem">{}</li>')
        for li in allLi:
            fragment = li[0]
            # Video id, e.g. "video_1749535".
            ids = re.findall(
                r'<a href="(.*?)" class="vervideo-lilink actplay">', fragment
            )
            titles = re.findall(
                r'<div class="vervideo-title">(.*?)</div>', fragment
            )
            if not ids or not titles:
                continue
            videoLink = self.getVideoLink(ids[0])
            if not videoLink:
                continue
            self.downVideo(videoLink, titles[0])
        print('恭喜你已爬完该页视频')

    def downVideo(self, videoLink, videoTitle):
        """Download one video into ``./videos``.

        Args:
            videoLink: Address of the mp4 file.
            videoTitle: Title of the video; used (after cleaning) as the
                file name.
        """
        os.makedirs('./videos', exist_ok=True)
        print(f'正在下载视频:{videoTitle}')
        result = self.session.get(videoLink, timeout=self.REQUEST_TIMEOUT)
        if result.status_code != 200:
            # Do not save an error page as if it were a video.
            print(f'下载失败,已跳过:{videoTitle}')
            return
        with open(f'./videos/{self._safeFileName(videoTitle)}.mp4', 'wb') as file:
            file.write(result.content)

    def getVideoLink(self, videoId):
        """Resolve the playable mp4 address for one video.

        The status endpoint reports an address whose file name starts with
        a timestamp, e.g.
        ``.../mp4/third/20211229/1641655664960-10054243-161801-hd.mp4``;
        that address does not play. Replacing the timestamp with
        ``cont-<contId>`` gives the working address.

        Args:
            videoId: Id as found in the page, e.g. ``"video_1749535"``.

        Returns:
            The rewritten address, or ``None`` when the endpoint does not
            answer with status 200 or its JSON lacks the expected fields.
        """
        contId = videoId.split('_')[-1]
        url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
        headers = {
            'Referer': f'https://www.pearvideo.com/{videoId}',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
        }
        response = self.session.get(
            url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        try:
            data = json.loads(response.text)
            videoLink = data['videoInfo']['videos']['srcUrl']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the video information.
            return None

        # Leading segment of the file name: the part that must be replaced.
        resplaceTxt = videoLink.split('/')[-1].split('-')[0]
        return videoLink.replace(resplaceTxt, f'cont-{contId}')

    @staticmethod
    def _safeFileName(title):
        """Turn a video title into a name that is safe to use as a file name."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip(' .')
        return cleaned or 'untitled'


if __name__ == '__main__':
    url = 'https://www.pearvideo.com/category_6'
    obj = GetVideo(url)
    obj.getVideoUrl()
