# 方法一:使用 requests、lxml 以及 re
import requests
from lxml import etree
import re
import json
import os
class GetVideo:
    """Download every video listed on a pearvideo.com listing page.

    Method one: pages are fetched with ``requests`` and parsed with ``lxml``
    XPath queries, plus a regular expression for the cover images.

    Attributes:
        url: Listing page address passed to the constructor.
        videoIds: ``href`` values of the video links, e.g. ``video_1749535``.
        videoUrls: One download link per id, or ``None`` where none was obtained.
        videoImages: Cover image URLs pulled out of the inline ``style`` values.
        videoTitles: Title texts found on the page.
    """

    # Seconds allowed per HTTP request so a stalled server cannot hang the run.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Store the listing page URL and start with empty result lists.

        Args:
            url: Address of the listing page to scrape.
        """
        self.url = url
        # Created per instance: as class-level lists they would be shared by
        # (and keep growing across) every GetVideo object.
        self.videoImages = []
        self.videoTitles = []
        self.videoUrls = []
        self.videoIds = []

    def getVideoUrl(self):
        """Collect ids, links, cover images and titles, then download the videos.

        Raises:
            requests.HTTPError: If the listing page itself cannot be fetched.
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        html = etree.HTML(response.text)

        # Every expression starts with '//', so it matches across the whole
        # document; querying the root directly makes that explicit.
        self.videoIds = html.xpath(
            '//div[@class="vervideo-bd"]/a[@class="vervideo-lilink actplay"]/@href'
        )
        # Rebuilt on every call so repeated runs do not accumulate old links.
        self.videoUrls = [self.getVideoLink(videoId) for videoId in self.videoIds]

        # The cover image URL sits inside the parentheses of the inline style
        # (presumably "background-image: url(...)" - verify against the page).
        self.videoImages = []
        for style in html.xpath('//div[@class="vervideo-img"]/div/div[@class="img"]/@style'):
            match = re.search(r'\((.*?)\)', style)
            if match:
                self.videoImages.append(match.group(1))

        self.videoTitles = html.xpath('//div[@class="vervideo-title"]/text()')
        print('视频正在下载中....')
        self.downVideo()

    def getVideoLink(self, videoId):
        """Resolve a video id to a link that can actually be downloaded.

        The status endpoint answers with a link whose file name starts with a
        timestamp (``.../1641654761203-10054243-154532-hd.mp4``), which does not
        play. Swapping that first token for ``cont-<contId>`` gives the working
        address (``.../cont-1749535-10054243-154532-hd.mp4``).

        Args:
            videoId: Link ``href`` such as ``video_1749535``.

        Returns:
            The playable URL, or ``None`` when the endpoint does not answer
            with status 200 or its payload carries no ``srcUrl``.
        """
        contId = videoId.split('_')[-1]
        url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
        headers = {
            'Referer': f'https://www.pearvideo.com/{videoId}',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        try:
            videoLink = json.loads(response.text)['videoInfo']['videos']['srcUrl']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the expected keys: no link available.
            return None
        return self._playableLink(videoLink, contId)

    @staticmethod
    def _playableLink(videoLink, contId):
        """Replace the leading token of the file name with ``cont-<contId>``."""
        # Only the file name is rewritten, so an identical digit run elsewhere
        # in the path is left untouched.
        base, slash, fileName = videoLink.rpartition('/')
        _, dash, rest = fileName.partition('-')
        return f'{base}{slash}cont-{contId}{dash}{rest}'

    @staticmethod
    def _safeFileName(title):
        """Turn a video title into a name that is valid as a file name."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()
        return cleaned or 'video'

    def downVideo(self):
        """Save every collected video as ``./videos/<title>.mp4``."""
        os.makedirs('./videos', exist_ok=True)
        # zip stops at the shorter list, so a title/link count mismatch cannot
        # raise IndexError. Titles are assumed to be in the same order as the
        # links - TODO confirm against the page markup.
        for title, link in zip(self.videoTitles, self.videoUrls):
            if not link:
                print(f'无法获取视频地址,已跳过:{title}')
                continue
            print(f'正在下载视频:{title}')
            videoContent = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            if videoContent.status_code != 200:
                # Do not store an error page under an .mp4 name.
                print(f'视频下载失败,已跳过:{title}')
                continue
            with open(f'./videos/{self._safeFileName(title)}.mp4', 'wb') as file:
                file.write(videoContent.content)
        print('恭喜你已爬完该页视频')
if __name__ == '__main__':
    # Listing page to scrape; the literal can be swapped for an input() prompt
    # to ask the user for an address instead.
    url = 'https://www.pearvideo.com/category_6'
    obj = GetVideo(url)
    # Runs the whole scrape-and-download pass for that page.
    response = obj.getVideoUrl()
# 方法二:使用 requests_html
from requests_html import HTMLSession,UserAgent
import json
import os
import re
class GetVideo:
    """Download every video listed on a pearvideo.com listing page.

    Method two: a single ``requests_html`` session fetches the listing page,
    the status endpoint and the video files; the listing markup is picked
    apart with template search and regular expressions.
    """

    # Seconds allowed per HTTP request so a stalled server cannot hang the run.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Store the listing page URL and open the HTTP session.

        Args:
            url: Address of the listing page to scrape.
        """
        self.url = url
        self.headers = {
            'User-Agent': UserAgent().random
        }
        self.session = HTMLSession()

    def getVideoUrl(self):
        """Fetch the listing page and download each video found on it.

        Does nothing when the listing page does not answer with status 200.
        """
        # Uses the instance's own URL and headers rather than a module global.
        response = self.session.get(
            self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return
        html = response.html
        print('视频正在下载中....')
        # Each result holds the inner markup of one <li class="categoryem">.
        allLi = html.search_all('<li class="categoryem">{}</li>')
        for li in allLi:
            fragment = li[0]
            ids = re.findall(r'<a href="(.*?)" class="vervideo-lilink actplay">', fragment)
            titles = re.findall(r'<div class="vervideo-title">(.*?)</div>', fragment)
            if not ids or not titles:
                # Entry without the expected link or title markup.
                continue
            videoLink = self.getVideoLink(ids[0])
            if videoLink is None:
                print(f'无法获取视频地址,已跳过:{titles[0]}')
                continue
            self.downVideo(videoLink, titles[0])
        print('恭喜你已爬完该页视频')

    def downVideo(self, videoLink, videoTitle):
        """Save one video as ``./videos/<videoTitle>.mp4``.

        Args:
            videoLink: Direct URL of the video file.
            videoTitle: Title used (after sanitising) as the file name.
        """
        os.makedirs('./videos', exist_ok=True)
        print(f'正在下载视频:{videoTitle}')
        result = self.session.get(videoLink, timeout=self.REQUEST_TIMEOUT)
        if result.status_code != 200:
            # Do not store an error page under an .mp4 name.
            print(f'视频下载失败,已跳过:{videoTitle}')
            return
        # Characters that are invalid in file names are replaced first.
        fileName = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', videoTitle).strip() or 'video'
        with open(f'./videos/{fileName}.mp4', 'wb') as file:
            file.write(result.content)

    def getVideoLink(self, videoId):
        """Resolve a video id to a link that can actually be downloaded.

        The status endpoint answers with a link whose file name starts with a
        timestamp (``.../1641655664960-10054243-161801-hd.mp4``), which does not
        play. Swapping that first token for ``cont-<contId>`` gives the working
        address.

        Args:
            videoId: Link ``href`` such as ``video_1749535``.

        Returns:
            The playable URL, or ``None`` when the endpoint does not answer
            with status 200 or its payload carries no ``srcUrl``.
        """
        contId = videoId.split('_')[-1]
        url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
        headers = {
            'Referer': f'https://www.pearvideo.com/{videoId}',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
        }
        response = self.session.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        try:
            videoLink = json.loads(response.text)['videoInfo']['videos']['srcUrl']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON without the expected keys: no link available.
            return None
        # Only the file name is rewritten, so an identical digit run elsewhere
        # in the path is left untouched.
        base, slash, fileName = videoLink.rpartition('/')
        _, dash, rest = fileName.partition('-')
        return f'{base}{slash}cont-{contId}{dash}{rest}'
if __name__ == '__main__':
    # Category listing page whose videos are downloaded by this run.
    url = 'https://www.pearvideo.com/category_6'
    obj = GetVideo(url)
    obj.getVideoUrl()