"""
目标网站:
https://www.pearvideo.com/category_5
需求:
01.获取梨视频里的视频链接
02.使用线程池下载获取的视频
"""
import requests
import time
import random
from lxml import etree
from multiprocessing.dummy import Pool
# Category listing page that is scraped for video entries.
page_url = 'https://www.pearvideo.com/category_5'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# Seconds to wait on a silent connection; requests applies no timeout by
# default, so without this a stalled server would hang the script forever.
request_timeout = 10
page_response = requests.get(url=page_url, headers=headers, timeout=request_timeout)
# Fail loudly on an HTTP error instead of parsing an error page as the listing.
page_response.raise_for_status()
page_text = page_response.text
# Parse the HTML so it can be queried with XPath.
page_tree = etree.HTML(page_text)
# One <li> per video. The page displays 3 videos while the XPath without the
# predicate matched 4 (the original author suspected an anti-scraping decoy),
# so position()<4 keeps only the first three.
li_list = page_tree.xpath('//*[@id="listvideoListUl"]/li[position()<4]')
# Collected {'title': ..., 'url': ...} dicts, one per downloadable video.
video_urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # Not a regular video entry (no link or no title node): skip it
        # rather than crash with IndexError.
        continue
    video_href = hrefs[0]  # e.g. video_1744850
    video_id = video_href.split('_')[-1]
    video_title = titles[0] + '.mp4'
    # The media URL is loaded dynamically, so it is requested from the Ajax
    # endpoint instead of being built from the href. Original request, e.g.:
    # https://www.pearvideo.com/videoStatus.jsp?contId=1744850&mrd=0.9755309499686131
    ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'
    # Without a Referer pointing at the video page the endpoint returns no
    # data (found by comparing the browser request with the script's).
    # NOTE: this rebinds the module-level `headers` on purpose; getVideoData
    # reads that global when downloading, so the rebinding is preserved.
    headers = dict(headers, Referer='https://www.pearvideo.com/' + video_href)
    params = {
        'contId': video_id,
        # The browser sends a float such as 0.9755309499686131 here;
        # random.random() produces a value of the same form.
        'mrd': random.random(),
    }
    # Throttle the Ajax requests.
    time.sleep(0.5)
    ajax_response = requests.get(url=ajax_url, headers=headers, params=params, timeout=request_timeout)
    ajax_response.raise_for_status()
    # response.json() decodes the JSON payload into a dict.
    video_data = ajax_response.json()
    try:
        video_url = video_data['videoInfo']['videos']['srcUrl']
    except (KeyError, TypeError):
        # The endpoint answered without a video address (e.g. an error
        # payload); skip this entry instead of aborting the whole run.
        print(video_title, '未获取到视频地址,已跳过')
        continue
    # The returned srcUrl answers 404: its file name starts with a timestamp
    # where the URL used by the page has 'cont-<video id>':
    #   returned: https://video.pearvideo.com/mp4/adshort/20211101/1635865058548-15790523_adpkg-ad_hd.mp4
    #   page:     https://video.pearvideo.com/mp4/adshort/20211101/cont-1744850-15790523_adpkg-ad_hd.mp4
    # Only the leading part of the file name is rewritten, so an identical
    # character run elsewhere in the URL is left untouched.
    file_name = video_url.split('/')[-1]
    url_prefix = video_url[:len(video_url) - len(file_name)]
    _, dash, tail = file_name.partition('-')
    video_url = url_prefix + 'cont-%s' % video_id + dash + tail
    video_dic = {
        'title': video_title,
        'url': video_url
    }
    video_urls.append(video_dic)
def getVideoData(dic):
    """Download one video and save it in the current working directory.

    Args:
        dic: mapping with the keys 'title' (file name to save under, already
            ending in '.mp4') and 'url' (direct link to the video file).

    Returns:
        None. Progress and failures are reported with print(). A failed
        request is reported and skipped so that one bad link does not abort
        the other downloads running in the pool.
    """
    video_title = dic['title']
    video_url = dic['url']
    # Titles come straight from the web page; replace characters that are not
    # allowed in file names (path separators, Windows-reserved characters) so
    # open() does not fail on them.
    file_name = video_title.translate(str.maketrans({ch: '_' for ch in '\\/:*?"<>|'}))
    time.sleep(0.5)
    print(video_title, '开始下载')
    try:
        # `headers` is the module-level request-header dict.
        response = requests.get(url=video_url, headers=headers, timeout=30)
        # Without this check an HTTP error body (e.g. a 404 page) would be
        # written to disk as if it were the video.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(video_title, '下载失败:', exc)
        return
    with open(file_name, 'wb') as fp:
        fp.write(response.content)
    print(video_title, '下载完成')
# Download the collected videos concurrently with three workers.
# multiprocessing.dummy.Pool is backed by threads, not processes.
pool = Pool(processes=3)
# map() calls getVideoData once per collected dict and blocks until every
# call has returned.
pool.map(func=getVideoData, iterable=video_urls)
# Order matters: close() first so no further work can be submitted, then
# join() to wait for the workers to exit.
pool.close()
pool.join()
"""
总结:
01.XPath中position()---可以获取前几个或者后几个对象
02.random.random()---可以随机生成0-1之间的浮点数
03.response.json()---可以将获取的JSON数据转化为字典数据
04.使用Pool线程池(multiprocessing.dummy)时记得先close()关闭,再join()等待线程结束
"""