1. """
    2. 目标网站:
    3. https://www.pearvideo.com/category_5
    4. 需求:
    5. 01.获取梨视频里的视频链接
    6. 02.使用线程池下载获取的视频
    7. """
    8. import requests
    9. import time
    10. import random
    11. from lxml import etree
    12. from multiprocessing.dummy import Pool
    13. page_url = 'https://www.pearvideo.com/category_5'
    14. headers = {
    15. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    16. }
    17. page_text = requests.get(url=page_url, headers=headers).text
    18. # 实例化etree对象
    19. page_tree = etree.HTML(page_text)
    20. # 获取所有包含目标视频数据的li
    21. # 页面只显示了3个视频,代码能获取4个,第二个可能是反爬的坑,使用position()<4排除第四个li
    22. li_list = page_tree.xpath('//*[@id="listvideoListUl"]/li[position()<4]')
    23. # 构建一个存放video_dic的空列表
    24. video_urls = []
    25. for li in li_list:
    26. # 获取video_href 因为这里的视频数据是动态加载的所以不需要拼接链接,而是要用Ajax请求里的链接
    27. video_href = li.xpath('./div/a/@href')[0] # video_href:video_1744850
    28. video_id = video_href.split('_')[-1]
    29. video_title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    30. # Ajax请求的原链接为:'https://www.pearvideo.com/videoStatus.jsp?contId=1744850&mrd=0.9755309499686131' 注意构造参数contId和mrd
    31. ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'
    32. headers = {
    33. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
    34. # 直接请求时发现无法获取数据,对比浏览器请求数据和程序请求数据发现,缺少referer参数
    35. 'Referer': 'https://www.pearvideo.com/' + video_href
    36. }
    37. params = {
    38. 'contId': video_id,
    39. 'mrd': random.random() # 这里的mrd是一个时间戳,通过random可以模拟
    40. }
    41. time.sleep(0.5)
    42. # 使用response.json(),将json数据转为字典数据
    43. video_data = requests.get(url=ajax_url, headers=headers, params=params).json()
    44. # 此时得到的数据video_url为:https://video.pearvideo.com/mp4/adshort/20211101/1635865058548-15790523_adpkg-ad_hd.mp4
    45. video_url = video_data['videoInfo']['videos']['srcUrl']
    46. # 通过点击发现这个url返回的是404不可用,如下对比,替换'1635865058548'为'cont-1744850'
    47. # 响应url:https://video.pearvideo.com/mp4/adshort/20211101/1635865058548-15790523_adpkg-ad_hd.mp4
    48. # 原网页url:https://video.pearvideo.com/mp4/adshort/20211101/cont-1744850-15790523_adpkg-ad_hd.mp4
    49. video_url = video_url.replace(video_url.split('/')[-1].split('-')[0], 'cont-%s' %video_id)
    50. video_dic = {
    51. 'title': video_title,
    52. 'url': video_url
    53. }
    54. video_urls.append(video_dic)
    55. def getVideoData(dic):
    56. video_title = dic['title']
    57. video_url = dic['url']
    58. time.sleep(0.5)
    59. print(video_title, '开始下载')
    60. video_content = requests.get(url=video_url, headers=headers).content
    61. with open(video_title, 'wb') as fp:
    62. fp.write(video_content)
    63. # time.sleep(1)
    64. print(video_title, '下载完成')
    65. # 使用进程池来下载视频
    66. pool = Pool(3)
    67. pool.map(getVideoData, video_urls)
    68. pool.close()
    69. pool.join() # 这里要先关闭再JOIN。进程池中进程执行完后再关闭,如果注释,那么程序直接关闭
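"""
Summary:
01. XPath position() --- selects the first few or the last few matching nodes.
02. random.random() --- returns a random float in the range [0, 1).
03. response.json() --- converts the JSON data of a response into a dict.
04. When using a Pool, remember to call close() and then join() to wait for the worker threads to finish.
"""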