"""Download the videos listed on a Pear Video category page.

Target site: https://www.pearvideo.com/category_5

Steps:
    1. Collect the video links shown on the category page.
    2. Download the collected videos with a thread pool.
"""
import random
import re
import time
from multiprocessing.dummy import Pool

import requests
from lxml import etree

page_url = 'https://www.pearvideo.com/category_5'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
# Seconds to wait on a single request; without a timeout requests may block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10
# Characters that are path separators or are rejected in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n]')


def getVideoData(dic):
    """Download one video and save it in the current working directory.

    Args:
        dic: Mapping with the keys ``'title'`` (file name, including the
            ``.mp4`` suffix) and ``'url'`` (direct video URL). An optional
            ``'referer'`` key is sent as the ``Referer`` request header.

    Returns:
        None. Progress is reported with ``print``; a failed request is
        reported and skipped so the remaining pool tasks keep running.
    """
    video_title = dic['title']
    video_url = dic['url']

    # Use a per-call copy so worker threads never share a mutable header dict.
    request_headers = dict(headers)
    referer = dic.get('referer')
    if referer:
        request_headers['Referer'] = referer

    # Titles come straight from the page and may contain characters that are
    # not valid in a file name.
    file_name = _UNSAFE_FILENAME_CHARS.sub('_', video_title)

    time.sleep(0.5)
    print(video_title, '开始下载')
    try:
        # Stream the body to disk instead of holding the whole video in memory.
        with requests.get(url=video_url, headers=request_headers, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            # Do not save an HTTP error page as if it were a video.
            response.raise_for_status()
            with open(file_name, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        fp.write(chunk)
    except requests.RequestException as exc:
        print(video_title, '下载失败', exc)
        return
    print(video_title, '下载完成')


page_text = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT).text
# Build the etree object used for the XPath queries.
page_tree = etree.HTML(page_text)
# Select the <li> elements that hold the target videos.
# Original author's note: the page shows only 3 videos while the query matches
# 4; the extra one may be an anti-scraping trap, so position()<4 drops the
# fourth <li>.
li_list = page_tree.xpath('//*[@id="listvideoListUl"]/li[position()<4]')

# One {'title', 'url', 'referer'} dict per video to download.
video_urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # Skip entries that lack the link or the title instead of crashing.
        continue

    # The video data is loaded dynamically, so the href is not joined into a
    # page link to scrape; it only supplies the id for the Ajax request.
    video_href = hrefs[0]
    # video_href looks like: video_1744850
    video_id = video_href.split('_')[-1]
    video_title = titles[0] + '.mp4'

    # The browser's Ajax request looks like
    # 'https://www.pearvideo.com/videoStatus.jsp?contId=1744850&mrd=0.9755309499686131',
    # so the contId and mrd parameters have to be built here.
    ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'
    # Requesting without a Referer returned no data; comparing with the
    # browser's request showed that the Referer header was the missing piece.
    referer = 'https://www.pearvideo.com/' + video_href
    # Kept separate from the module-level `headers` so the base headers are
    # not overwritten by the last video's Referer.
    ajax_headers = dict(headers)
    ajax_headers['Referer'] = referer
    params = {
        'contId': video_id,
        # mrd looks like a random float in [0, 1) in the browser's request;
        # random.random() imitates it.
        'mrd': random.random()
    }
    time.sleep(0.5)
    # response.json() turns the JSON body into a dict.
    video_data = requests.get(url=ajax_url, headers=ajax_headers, params=params,
                              timeout=REQUEST_TIMEOUT).json()
    # The returned URL looks like:
    # https://video.pearvideo.com/mp4/adshort/20211101/1635865058548-15790523_adpkg-ad_hd.mp4
    video_url = video_data['videoInfo']['videos']['srcUrl']
    # That URL answers 404. Comparing it with the one the page really uses shows
    # that the leading '1635865058548' has to become 'cont-1744850':
    #   response: https://video.pearvideo.com/mp4/adshort/20211101/1635865058548-15790523_adpkg-ad_hd.mp4
    #   page:     https://video.pearvideo.com/mp4/adshort/20211101/cont-1744850-15790523_adpkg-ad_hd.mp4
    fake_prefix = video_url.split('/')[-1].split('-')[0]
    video_url = video_url.replace(fake_prefix, 'cont-%s' % video_id)
    video_dic = {
        'title': video_title,
        'url': video_url,
        'referer': referer
    }
    video_urls.append(video_dic)

# Download the videos with a pool of 3 worker threads
# (multiprocessing.dummy.Pool is backed by threads, not processes).
pool = Pool(3)
pool.map(getVideoData, video_urls)
pool.close()
pool.join()  # close() must come before join(); join() waits for the workers to finish.

# Takeaways:
#   1. XPath position() can select the first or the last few nodes.
#   2. random.random() returns a random float in [0, 1).
#   3. response.json() converts a JSON response body into Python data.
#   4. When using a Pool, remember close() and then join() to wait for the workers.