普通爬虫 - 正则匹配爬取梨视频 - 《爬虫代码集》


import requests
import re
import json
import os
from fake_useragent import UserAgent
class LiVideo:
    """Download every video listed on one pearvideo.com category page.

    The category listing page is cached on disk at ``self.file_path``. Video
    metadata is extracted from that cached HTML with regular expressions, and
    each video found is saved as an ``.mp4`` file inside ``self.videoDir``.
    """

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled connection cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, cate_id):
        """Build the URLs/paths for one category and create its video directory.

        Args:
            cate_id: Category id, i.e. the part after ``category_`` in the
                listing URL. Converted with ``str()`` so an int also works.
        """
        cate_id = str(cate_id)
        self.base_url = 'https://www.pearvideo.com/'  # site root
        self.url = self.base_url + 'category_' + cate_id  # category listing page
        self.file_path = 'category_' + cate_id + 'page.html'  # local cache of the listing page
        self.videoDir = './video-' + cate_id  # download directory for this category
        self.pre_video = self.base_url + 'videoStatus.jsp?contId='  # per-video status endpoint
        self.checkDir(self.videoDir)

    def checkDir(self, dirPath):
        """Create ``dirPath`` (including missing parents) if it does not exist yet."""
        os.makedirs(dirPath, exist_ok=True)

    def getVideo(self):
        """Entry point: use the cached listing page if present, otherwise fetch it first."""
        if os.path.exists(self.file_path):
            self.getVideoInfo()
        else:
            self.savePageHtml()

    def savePageHtml(self):
        """Fetch the category listing page, cache it to ``self.file_path``, then process it.

        On a non-200 response the status code is printed and nothing else
        happens; there is no cached page to parse in that case.
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(response.status_code)
            return
        with open(self.file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        self.getVideoInfo()

    @staticmethod
    def _parseVideoList(html):
        """Extract one metadata dict per video id found in the listing HTML.

        Args:
            html: Text of the category listing page.

        Returns:
            A list of dicts with keys ``title``, ``author``, ``image`` and
            ``videoId``. The four fields are matched by independent regexes
            and paired by position; a field with fewer matches than there are
            video ids is filled with ``''`` instead of raising ``IndexError``.
        """
        images = re.findall(r'<div class="verimg-view">.*? url\((.*?)\);">', html)
        titles = re.findall(r'<div class="vervideo-title">(.*?)</div>', html)
        videoIds = re.findall(r'<a href="video_(.*?)" class="vervideo-lilink actplay">', html)
        authos = re.findall(r'<a href=".*?" class="column">(.*?)</a>', html)

        def fieldAt(values, index):
            return values[index] if index < len(values) else ''

        return [
            {
                'title': fieldAt(titles, i),
                'author': fieldAt(authos, i),
                'image': fieldAt(images, i),
                'videoId': videoId,
            }
            for i, videoId in enumerate(videoIds)
        ]

    @staticmethod
    def _safeFileName(title, fallback):
        """Turn a video title into a usable file name (without extension).

        Path separators, characters that are invalid in file names on common
        file systems, and control characters are replaced with ``_``.
        ``fallback`` is returned when nothing usable is left.
        """
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip(' .')
        return cleaned or fallback

    def _downloadVideo(self, video):
        """Resolve the real mp4 URL for one video and save it under ``self.videoDir``.

        Any network, parsing or file error is printed and swallowed so that
        one bad video does not abort the rest of the batch.
        """
        requestVideoUrl = self.pre_video + video['videoId']
        headers = {
            'User-Agent': UserAgent().random,
            # Referer is set to the video's own page URL.
            'Referer': self.base_url + 'video_' + video['videoId'],
        }
        try:
            response = requests.get(
                requestVideoUrl, headers=headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(response.status_code)
                return
            result = json.loads(response.text)
            # URL returned by the API:
            #   https://video.pearvideo.com/mp4/third/20211229/1641655664960-10054243-161801-hd.mp4
            # URL that actually plays:
            #   https://video.pearvideo.com/mp4/third/20220113/cont-1749919-10054243-151326-hd.mp4
            # i.e. the leading numeric segment of the file name is replaced by "cont-<videoId>".
            videoUrl = result['videoInfo']['videos']['srcUrl']
            newVideoUrl = re.sub(r'/(\d+)-', f'/cont-{video["videoId"]}-', videoUrl)
            videoContent = requests.get(newVideoUrl, timeout=self.REQUEST_TIMEOUT)
            # Check the download response itself, not the status-API response.
            if videoContent.status_code != 200:
                print(videoContent.status_code)
                return
            print(f'下载视频:{video["title"]}中')
            fileName = self._safeFileName(video['title'], video['videoId']) + '.mp4'
            with open(os.path.join(self.videoDir, fileName), 'wb') as file:
                file.write(videoContent.content)
        except (requests.RequestException, ValueError, KeyError, TypeError, OSError) as error:
            print(error)

    def getVideoInfo(self):
        """Parse the cached listing page and download every video found in it.

        Raises:
            FileNotFoundError: if ``self.file_path`` does not exist.
        """
        with open(self.file_path, 'r', encoding='utf-8') as file:
            html = file.read()
        videoes = self._parseVideoList(html)
        if not videoes:
            print('该页面不存在')
            return
        print('开始下载视频...')
        for video in videoes:
            self._downloadVideo(video)
        print('下载完毕!!!')
# Script entry: prompt for the category id (the value appended to
# 'category_' to form the listing URL), then run the downloader for it.
cate_id = input('请输入分类id')
# Constructing LiVideo also creates the './video-<cate_id>' directory.
video = LiVideo(cate_id)
# Uses the cached listing page if it exists, otherwise fetches it first.
video.getVideo()
示例地址: https://www.pearvideo.com/category_6 —— 运行时请输入地址中 category_ 后面的值（此例为 6）