方法一:使用 requests、lxml 和 re

    1. import requests
    2. from lxml import etree
    3. import re
    4. import json
    5. import os
    class GetVideo:
        """Download every video listed on a pearvideo.com category page.

        Built on requests, lxml and re. Call ``getVideoUrl()`` to scrape the
        listing page at ``url`` and save each video as
        ``./videos/<title>.mp4``.

        Attributes filled in by ``getVideoUrl()``:
            videoIds:    href values of the video links (e.g. ``video_1749535``).
            videoUrls:   playable URL for each id, or ``None`` when it could
                         not be resolved; stays parallel to ``videoIds``.
            videoImages: cover image URLs taken from the inline styles.
            videoTitles: video titles as they appear on the page.
        """

        # Class-level defaults are kept so ``GetVideo.videoUrls`` etc. still
        # exist; __init__ shadows them with per-instance lists so that two
        # crawlers never append into the same list.
        videoImages = []
        videoTitles = []
        videoUrls = []
        videoIds = []

        # Characters that are path separators or are rejected in file names
        # on common file systems.
        _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')

        def __init__(self, url):
            """Remember the listing page ``url`` and start with empty results."""
            self.url = url
            self.videoImages = []
            self.videoTitles = []
            self.videoUrls = []
            self.videoIds = []

        @staticmethod
        def _playableLink(srcUrl, contId):
            """Return ``srcUrl`` with the file name's leading segment replaced by ``cont-<contId>``.

            Only the last path component is rewritten, so an identical digit
            run elsewhere in the URL is left untouched.
            """
            base, slash, fileName = srcUrl.rpartition('/')
            _, dash, rest = fileName.partition('-')
            return f'{base}{slash}cont-{contId}{dash}{rest}'

        @classmethod
        def _safeFileName(cls, title):
            """Return ``title`` with characters that are unsafe in file names replaced by ``_``."""
            return cls._UNSAFE_CHARS.sub('_', title)

        def getVideoUrl(self):
            """Scrape the listing page, collect video metadata and download the videos.

            Returns ``None``. When the page contains no video list nothing is
            downloaded.
            """
            response = requests.get(self.url)
            html = etree.HTML(response.text)
            items = html.xpath('//ul[@class="listvideo-list clearfix"]/li')
            if not items:
                print('未找到视频列表')
                return
            liHtml = items[0]
            # The expressions below start with '//', so they are evaluated
            # against the whole document and return the data of every video
            # on the page, not only of this first <li>.
            self.videoIds = liHtml.xpath('//div[@class="vervideo-bd"]/a[@class="vervideo-lilink actplay"]/@href')
            # Rebuilt on every call so a second crawl does not append to the
            # results of the first one. Unresolvable videos are stored as
            # None to keep this list parallel to the ids and titles.
            self.videoUrls = [self.getVideoLink(videoId) for videoId in self.videoIds]
            # Cover images: the URL sits inside "url(...)" of the inline style.
            self.videoImages = []
            for style in liHtml.xpath('//div[@class="vervideo-img"]/div/div[@class="img"]/@style'):
                match = re.search(r'\((.*?)\)', style)
                if match:
                    self.videoImages.append(match.group(1))
            # Video titles
            self.videoTitles = liHtml.xpath('//div[@class="vervideo-title"]/text()')
            print('视频正在下载中....')
            self.downVideo()

        def getVideoLink(self, videoId):
            """Resolve the playable mp4 URL for ``videoId`` (e.g. ``video_1749535``).

            The status endpoint answers with a ``srcUrl`` such as
            ``.../20220107/1641654761203-10054243-154532-hd.mp4`` which does not
            play; the working address has ``cont-<contId>`` in place of the
            leading number: ``.../20220107/cont-1749535-10054243-154532-hd.mp4``.

            Returns the playable URL, or ``None`` when the request does not
            return status 200 or the payload carries no ``srcUrl``.
            """
            contId = videoId.split('_')[-1]
            url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
            headers = {
                'Referer': f'https://www.pearvideo.com/{videoId}',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
            }
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                return None
            try:
                srcUrl = json.loads(response.text)['videoInfo']['videos']['srcUrl']
            except (ValueError, KeyError, TypeError):
                # Not JSON, or JSON without the expected video entry.
                return None
            return self._playableLink(srcUrl, contId)

        def downVideo(self):
            """Download every resolved video into ``./videos/<title>.mp4``."""
            os.makedirs('./videos', exist_ok=True)
            for videoUrl, title in zip(self.videoUrls, self.videoTitles):
                if not videoUrl:
                    # No playable address could be resolved for this entry.
                    continue
                videoContent = requests.get(videoUrl)
                print(f'正在下载视频:{title}')
                with open(f'./videos/{self._safeFileName(title)}.mp4', 'wb') as file:
                    file.write(videoContent.content)
            print('恭喜你已爬完该页视频')
    if __name__ == '__main__':
        # Category listing page to crawl. To ask the user instead:
        # url = input('请输入梨视频地址')
        url = 'https://www.pearvideo.com/category_6'
        # getVideoUrl() scrapes the page and then downloads the videos it found.
        GetVideo(url).getVideoUrl()

方法二:使用 requests_html

    1. from requests_html import HTMLSession,UserAgent
    2. import json
    3. import os
    4. import re
    class GetVideo:
        """Download every video listed on a pearvideo.com category page.

        Built on requests_html. Call ``getVideoUrl()`` to scrape the listing
        page at ``url`` and save each video as ``./videos/<title>.mp4``.
        """

        # Characters that are path separators or are rejected in file names
        # on common file systems.
        _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')

        def __init__(self, url):
            """Remember the listing page ``url`` and open an HTML session."""
            self.url = url
            # NOTE(review): built here but not passed to any request in this class.
            self.headers = {
                'User-Agent': UserAgent().random
            }
            self.session = HTMLSession()

        @staticmethod
        def _firstMatch(pattern, text):
            """Return the first capture of ``pattern`` in ``text``, or ``None`` if it does not match."""
            found = re.search(pattern, text)
            return found.group(1) if found else None

        @staticmethod
        def _playableLink(srcUrl, contId):
            """Return ``srcUrl`` with the file name's leading segment replaced by ``cont-<contId>``.

            Only the last path component is rewritten, so an identical digit
            run elsewhere in the URL is left untouched.
            """
            base, slash, fileName = srcUrl.rpartition('/')
            _, dash, rest = fileName.partition('-')
            return f'{base}{slash}cont-{contId}{dash}{rest}'

        @classmethod
        def _safeFileName(cls, title):
            """Return ``title`` with characters that are unsafe in file names replaced by ``_``."""
            return cls._UNSAFE_CHARS.sub('_', title)

        def getVideoUrl(self):
            """Scrape the listing page and download each video found on it.

            Does nothing when the listing request does not return status 200.
            Entries whose id, title or playable address cannot be determined
            are skipped.
            """
            # Use the address given to this instance, not a module-level name.
            response = self.session.get(self.url)
            if response.status_code != 200:
                return
            html = response.html
            print('视频正在下载中....')
            # Alternative: extract the same data with XPath.
            # allLi = html.xpath('//*[@class="category-list clearfix"]/li')
            # for li in allLi:
            #     videoId = li.find('a')[0].attrs.get('href')               # video id
            #     videoImg = li.search('background-image: url({})')[0]      # cover image
            #     videoTitle = li.xpath('//*[@class="vervideo-title"]/text()')[0]
            #     videoLink = self.getVideoLink(videoId)                    # video address
            #     self.downVideo(videoLink, videoTitle)
            # Template search over the page's HTML; li[0] is the text captured by {}.
            allLi = html.search_all('<li class="categoryem">{}</li>')
            for li in allLi:
                itemHtml = li[0]
                videoId = self._firstMatch(r'<a href="(.*?)" class="vervideo-lilink actplay">', itemHtml)
                videoTitle = self._firstMatch(r'<div class="vervideo-title">(.*?)</div>', itemHtml)
                if videoId is None or videoTitle is None:
                    continue
                videoLink = self.getVideoLink(videoId)
                if not videoLink:
                    continue
                self.downVideo(videoLink, videoTitle)
            print('恭喜你已爬完该页视频')

        def downVideo(self, videoLink, videoTitle):
            """Download ``videoLink`` into ``./videos/<videoTitle>.mp4``."""
            os.makedirs('./videos', exist_ok=True)
            result = self.session.get(videoLink)
            print(f'正在下载视频:{videoTitle}')
            with open(f'./videos/{self._safeFileName(videoTitle)}.mp4', 'wb') as file:
                file.write(result.content)

        def getVideoLink(self, videoId):
            """Resolve the playable mp4 URL for ``videoId``.

            The status endpoint answers with a ``srcUrl`` such as
            ``.../20211229/1641655664960-10054243-161801-hd.mp4`` which does not
            play; the working address has ``cont-<contId>`` in place of the
            leading number.

            Returns the playable URL, or ``None`` when the request does not
            return status 200 or the payload carries no ``srcUrl``.
            """
            contId = videoId.split('_')[-1]
            url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
            headers = {
                'Referer': f'https://www.pearvideo.com/{videoId}',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
            }
            response = self.session.get(url, headers=headers)
            if response.status_code != 200:
                return None
            try:
                srcUrl = json.loads(response.text)['videoInfo']['videos']['srcUrl']
            except (ValueError, KeyError, TypeError):
                # Not JSON, or JSON without the expected video entry.
                return None
            return self._playableLink(srcUrl, contId)
    if __name__ == '__main__':
        # Category listing page whose videos should be downloaded.
        url = 'https://www.pearvideo.com/category_6'
        GetVideo(url).getVideoUrl()