普通爬虫 - 爬取梨视频 - 《爬虫代码集》

方法一：使用requests和lxml以及re

import requests
from lxml import etree
import re
import json
import os
class GetVideo:
    """Download every video listed on one Pear Video (pearvideo.com) page.

    Typical use is ``GetVideo(listing_url).getVideoUrl()``: the call scrapes
    the listing page, resolves a playable address for each video and saves
    the files as ``./videos/<title>.mp4``.

    Attributes:
        url: Address of the listing page to scrape.
        videoIds: ``href`` values of the video anchors, e.g. ``video_1749535``.
        videoUrls: Playable address for each entry of ``videoIds`` (same
            order); ``None`` where no address could be resolved.
        videoTitles: Title texts found on the page.
        videoImages: Cover image addresses found on the page.
    """

    # Characters that are unsafe in a file name on common file systems.
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
    # Seconds to wait on the server before a request is abandoned.
    _REQUEST_TIMEOUT = 30
    # Size of the pieces in which a video is written to disk.
    _CHUNK_SIZE = 1 << 16

    def __init__(self, url):
        """Remember the listing page address and start with empty results."""
        self.url = url
        # Created per instance: lists declared on the class body would be
        # shared (and keep growing) across every GetVideo object.
        self.videoImages = []
        self.videoTitles = []
        self.videoUrls = []
        self.videoIds = []

    @staticmethod
    def _toPlayableLink(srcUrl, contId):
        """Return ``srcUrl`` with the leading part of its file name replaced
        by ``cont-<contId>``.

        Only the file name is rewritten, so an identical digit run elsewhere
        in the address is left alone.
        """
        head, slash, fileName = srcUrl.rpartition('/')
        _, dash, rest = fileName.partition('-')
        return f'{head}{slash}cont-{contId}{dash}{rest}'

    @classmethod
    def _safeFileName(cls, title):
        """Return ``title`` trimmed and made usable as a file name."""
        cleaned = cls._UNSAFE_NAME_CHARS.sub('_', str(title).strip())
        return cleaned or 'video'

    def getVideoUrl(self):
        """Scrape the listing page, then download every video found on it.

        Fills ``videoIds``, ``videoUrls``, ``videoImages`` and
        ``videoTitles`` (replacing the results of any earlier run) and hands
        over to ``downVideo``. Returns ``None``.
        """
        response = requests.get(self.url, timeout=self._REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f'页面请求失败，状态码:{response.status_code}')
            return
        html = etree.HTML(response.text)
        if html is None:
            print('页面内容为空，无法解析')
            return
        # The expressions start with ``//`` and therefore always search the
        # whole document, so they are evaluated on the root directly.
        self.videoIds = html.xpath('//div[@class="vervideo-bd"]/a[@class="vervideo-lilink actplay"]/@href')
        # Kept index-aligned with videoIds; unresolved entries stay None and
        # are skipped at download time.
        self.videoUrls = [self.getVideoLink(videoId) for videoId in self.videoIds]
        # Cover images live in an inline style: "background-image: url(...)".
        self.videoImages = []
        for style in html.xpath('//div[@class="vervideo-img"]/div/div[@class="img"]/@style'):
            found = re.search(r'\((.*?)\)', style)
            if found:
                self.videoImages.append(found.group(1))
        self.videoTitles = html.xpath('//div[@class="vervideo-title"]/text()')
        print('视频正在下载中....')
        self.downVideo()

    def getVideoLink(self, videoId):
        """Resolve the playable address of one video.

        Args:
            videoId: Anchor ``href`` from the listing page, e.g.
                ``video_1749535``; the part after the last ``_`` is the
                content id.

        Returns:
            The playable address, or ``None`` when the status endpoint does
            not answer with HTTP 200 or its reply carries no source address.
        """
        contId = videoId.split('_')[-1]
        url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
        headers = {
            'Referer': f'https://www.pearvideo.com/{videoId}',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        try:
            srcUrl = json.loads(response.text)['videoInfo']['videos']['srcUrl']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or a reply without the expected video section.
            return None
        if not isinstance(srcUrl, str) or not srcUrl:
            return None
        # The reported address looks like
        #   .../20211229/1641655664960-10054243-161801-hd.mp4
        # and does not play; the working one has "cont-<contId>" in place of
        # the leading number:
        #   .../20220107/cont-1749535-10054243-154532-hd.mp4
        return self._toPlayableLink(srcUrl, contId)

    def downVideo(self):
        """Save every resolved video as ``./videos/<title>.mp4``.

        Addresses and titles are paired by position; entries without a
        resolved address, or whose download does not answer with HTTP 200,
        are reported and skipped.
        """
        os.makedirs('./videos', exist_ok=True)
        for videoUrl, title in zip(self.videoUrls, self.videoTitles):
            if not videoUrl:
                print(f'跳过无法获取地址的视频:{title}')
                continue
            print(f'正在下载视频:{title}')
            # Streamed so a large video is never held in memory as a whole.
            with requests.get(videoUrl, stream=True, timeout=self._REQUEST_TIMEOUT) as videoContent:
                if videoContent.status_code != 200:
                    print(f'视频下载失败，状态码:{videoContent.status_code}')
                    continue
                with open(f'./videos/{self._safeFileName(title)}.mp4', 'wb') as file:
                    for chunk in videoContent.iter_content(chunk_size=self._CHUNK_SIZE):
                        file.write(chunk)
        print('恭喜你已爬完该页视频')
if __name__ == '__main__':
    # Script entry point: download the videos of one fixed listing page.
    # The address could be asked for instead with input('请输入梨视频地址').
    url = 'https://www.pearvideo.com/category_6'
    GetVideo(url).getVideoUrl()

方法二：使用requests_html

from requests_html import HTMLSession,UserAgent
import json
import os
import re
class GetVideo:
    """Download every video listed on one Pear Video page via requests_html.

    Typical use is ``GetVideo(listing_url).getVideoUrl()``: the call fetches
    the listing page, resolves a playable address for each listed video and
    saves it as ``./videos/<title>.mp4``.

    Attributes:
        url: Address of the listing page to scrape.
        headers: Request headers carrying the value of ``UserAgent().random``;
            sent with the listing-page request.
        session: ``HTMLSession`` used for every request of this object.
    """

    # Characters that are unsafe in a file name on common file systems.
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
    # Seconds to wait on the server before a request is abandoned.
    _REQUEST_TIMEOUT = 30
    # Size of the pieces in which a video is written to disk.
    _CHUNK_SIZE = 1 << 16

    def __init__(self, url):
        """Remember the listing page address and open a session."""
        self.url = url
        self.headers = {
            'User-Agent': UserAgent().random
        }
        self.session = HTMLSession()

    @staticmethod
    def _toPlayableLink(srcUrl, contId):
        """Return ``srcUrl`` with the leading part of its file name replaced
        by ``cont-<contId>``.

        Only the file name is rewritten, so an identical digit run elsewhere
        in the address is left alone.
        """
        head, slash, fileName = srcUrl.rpartition('/')
        _, dash, rest = fileName.partition('-')
        return f'{head}{slash}cont-{contId}{dash}{rest}'

    @classmethod
    def _safeFileName(cls, title):
        """Return ``title`` trimmed and made usable as a file name."""
        cleaned = cls._UNSAFE_NAME_CHARS.sub('_', str(title).strip())
        return cleaned or 'video'

    def getVideoUrl(self):
        """Fetch the listing page and download every video found on it.

        Entries whose markup lacks the expected link or title, or whose
        playable address cannot be resolved, are skipped. Returns ``None``.
        """
        # The page of this object is requested (not a module-level ``url``).
        response = self.session.get(self.url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f'页面请求失败，状态码:{response.status_code}')
            return
        html = response.html
        print('视频正在下载中....')
        # Prefer the list container; fall back to the whole page when the
        # container is not present.
        container = html.xpath('//*[@class="category-list clearfix"]', first=True)
        scope = html if container is None else container
        for item in scope.search_all('<li class="categoryem">{}</li>'):
            fragment = item[0]  # markup between the <li> tags
            idMatch = re.search(r'<a href="(.*?)" class="vervideo-lilink actplay">', fragment)
            titleMatch = re.search(r'<div class="vervideo-title">(.*?)</div>', fragment)
            if idMatch is None or titleMatch is None:
                continue
            videoTitle = titleMatch.group(1)
            videoLink = self.getVideoLink(idMatch.group(1))
            if not videoLink:
                print(f'跳过无法获取地址的视频:{videoTitle}')
                continue
            self.downVideo(videoLink, videoTitle)
        print('恭喜你已爬完该页视频')

    def downVideo(self, videoLink, videoTitle):
        """Save one video as ``./videos/<videoTitle>.mp4``.

        Args:
            videoLink: Playable address of the video; a falsy value is
                reported and skipped.
            videoTitle: Title used (after sanitising) as the file name.
        """
        if not videoLink:
            print(f'跳过无法获取地址的视频:{videoTitle}')
            return
        os.makedirs('./videos', exist_ok=True)
        print(f'正在下载视频:{videoTitle}')
        # Streamed so a large video is never held in memory as a whole.
        with self.session.get(videoLink, stream=True, timeout=self._REQUEST_TIMEOUT) as result:
            if result.status_code != 200:
                print(f'视频下载失败，状态码:{result.status_code}')
                return
            with open(f'./videos/{self._safeFileName(videoTitle)}.mp4', 'wb') as file:
                for chunk in result.iter_content(chunk_size=self._CHUNK_SIZE):
                    file.write(chunk)

    def getVideoLink(self, videoId):
        """Resolve the playable address of one video.

        Args:
            videoId: Anchor ``href`` from the listing page, e.g.
                ``video_1749535``; the part after the last ``_`` is the
                content id.

        Returns:
            The playable address, or ``None`` when the status endpoint does
            not answer with HTTP 200 or its reply carries no source address.
        """
        contId = videoId.split('_')[-1]
        url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contId}'
        headers = {
            'Referer': f'https://www.pearvideo.com/{videoId}',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
        }
        response = self.session.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        try:
            srcUrl = json.loads(response.text)['videoInfo']['videos']['srcUrl']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or a reply without the expected video section.
            return None
        if not isinstance(srcUrl, str) or not srcUrl:
            return None
        # The reported address looks like
        #   .../20211229/1641655664960-10054243-161801-hd.mp4
        # and does not play; the working one has "cont-<contId>" in place of
        # the leading number.
        return self._toPlayableLink(srcUrl, contId)
if __name__ == '__main__':
    # Script entry point: download the videos of one fixed listing page.
    url = 'https://www.pearvideo.com/category_6'
    GetVideo(url).getVideoUrl()