Requires ffmpeg; the hard-coded path in `video_add_mp4` below must point at your own ffmpeg binary.
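
Since the hard-coded path is the fragile part of that requirement, here is a minimal sketch of resolving the binary from PATH instead, assuming ffmpeg is installed (`shutil.which` is standard library; the fallback is the location the script uses):

    import shutil

    # Prefer ffmpeg found on PATH; fall back to the script's hard-coded location.
    FFMPEG = shutil.which('ffmpeg') or r'X:\bili\video\FFmpeg\ffmpeg'
    print(f'using ffmpeg at: {FFMPEG}')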

    # -*- coding:utf-8 -*-
    # author: LIUWENYU
    # datetime: 2020/11/2 15:39
    # describe:
    import requests
    from fake_useragent import UserAgent
    from lxml import etree
    import re
    import time
    import os
    import subprocess
    class Bilib():
        def __init__(self, search_name=None):
            """
            :param search_name: search keyword
            Other parameters:
            keyword: the search term, e.g. 西游记
            order: totalrank (overall rank), click (most clicks), pubdate (newest), dm (most danmaku), stow (most favorites)
            duration: 0 any length, 1 under 10 min, 2 10-30 min, 3 30-60 min, 4 over 60 min
            tids_1: 0 all categories, 1 animation, 13 bangumi, 167 Chinese animation, 3 music, 129 dance, and many more
            page: 1 first page, 2 second page
            e.g. https://search.bilibili.com/video?keyword=西游记&order=totalrank&duration=1&tids_1=0&page=1
            """
            # search keyword
            self.search_name = search_name
            # request headers for the search pages
            self.headers = {"User-Agent": UserAgent().random}
            # request headers for downloading the media streams
            self.downloadVideoHeaders = {
                'referer': 'https://www.bilibili.com/video/BV1WJ411e76L?from=search',
                'sec-ch-ua': '"\\Not;A\"Brand";v="99", "Google Chrome";v="85", "Chromium";v="85"',
                'sec-ch-ua-mobile': '?0',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'cross-site',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
            }
            # collected video page links
            self.link_list = []

    39. # 获取网页源码
    40. def get_html(self,url):
    41. result_url = requests.get(url=url,headers=self.headers)
    42. if result_url.status_code == 200:
    43. return result_url.text
    44. else:
    45. print('Get Html Code is False:%s,没有查询到相关视频' % result_url.status_code)
    46. return
        # get the number of result pages
        def get_page_count(self, html):
            # parse the page and read the number off the "last page" button
            result_etree = etree.HTML(html)
            page_num = result_etree.xpath('//li[contains(@class,"page-item last")]/button/text()')
            if not page_num:  # no pager button means a single page of results
                return '1'
            return re.findall(r'\d+', page_num[0])[0]

        # collect the video links from a result page
        def get_link(self, html):
            if not html:
                return self.link_list
            # parse the page and extract the video links
            result_etree = etree.HTML(html)
            link_list_result = result_etree.xpath('//*[@id="video-list"]/ul/li/a/@href')
            for item in link_list_result:
                self.link_list.append(item)
            return self.link_list

        # download the videos behind the collected links
        def get_video(self, video_count, page_num_t):
            # directory for the raw streams
            path = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'video'
            if not os.path.exists(path):
                os.makedirs(path)
            # directory for the merged files
            path_new = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'videoNew'
            if not os.path.exists(path_new):
                os.makedirs(path_new)
            # download a limited number of videos
            if 0 < video_count <= page_num_t * 20:
                for i in range(min(video_count, len(self.link_list))):
                    result_video = requests.get(url='http:' + self.link_list[i], headers=self.headers).text
                    video_url = re.findall(r'"base_url":"(.*?)"', result_video)  # video and audio stream URLs
                    video_name = etree.HTML(result_video).xpath('//*[@id="viewbox_report"]/h1/span/text()')
                    video_name = '-'.join(re.findall(r'[\w]+', video_name[0]))  # sanitized video title
                    print(f'{video_name}---downloading...')
                    # save the video stream
                    info_s = requests.get(url=video_url[0], headers=self.downloadVideoHeaders).content
                    with open(f'{path}{os.sep}{video_name}1.mp4', 'wb') as f:
                        f.write(info_s)
                    # save the audio stream
                    info_y = requests.get(url=video_url[-1], headers=self.downloadVideoHeaders).content
                    with open(f'{path}{os.sep}{video_name}2.mp4', 'wb') as f:
                        f.write(info_y)
                    time.sleep(1)
                    filename = f'{path_new}{os.sep}{video_name}.mp4'
                    video_file = f'{path}{os.sep}{video_name}1.mp4'
                    audio_file = f'{path}{os.sep}{video_name}2.mp4'
                    # merge audio and video
                    self.video_add_mp4(filename, video_file, audio_file)
                    print(f'{video_name}---done')
            # download every collected video
            if video_count == 0:
                for link in self.link_list:
                    result_video = requests.get(url='http:' + link, headers=self.headers).text
                    video_url = re.findall(r'"base_url":"(.*?)"', result_video)  # video and audio stream URLs
                    video_name = etree.HTML(result_video).xpath('//*[@id="viewbox_report"]/h1/span/text()')
                    video_name = '-'.join(re.findall(r'[\w]+', video_name[0]))  # sanitized video title
                    print(f'{video_name}---downloading...')
                    # save the video stream
                    info_s = requests.get(url=video_url[0], headers=self.downloadVideoHeaders).content
                    with open(f'{path}{os.sep}{video_name}1.mp4', 'wb') as f:
                        f.write(info_s)
                    # save the audio stream
                    info_y = requests.get(url=video_url[-1], headers=self.downloadVideoHeaders).content
                    with open(f'{path}{os.sep}{video_name}2.mp4', 'wb') as f:
                        f.write(info_y)
                    time.sleep(1)
                    # merge audio and video, same as the limited branch
                    self.video_add_mp4(f'{path_new}{os.sep}{video_name}.mp4',
                                       f'{path}{os.sep}{video_name}1.mp4',
                                       f'{path}{os.sep}{video_name}2.mp4')
                    print(f'{video_name}---done')

        # merge the video and audio streams into one file
        def video_add_mp4(self, filename, video_file, audio_file):
            # requires ffmpeg; point this at your own ffmpeg binary
            cmd = f'X:\\bili\\video\\FFmpeg\\ffmpeg -i {video_file} -i {audio_file} -acodec copy -vcodec copy {filename}'
            print(cmd)
            subprocess.call(cmd, shell=True)

        def main(self):
            while True:
                search_name = input('Enter a search keyword: ')
                # search_name = '西游记'
                if not search_name:
                    continue
                break
            # page count
            url = f"https://search.bilibili.com/video?keyword={search_name}&order=totalrank&duration=1&tids_1=0"
            html = self.get_html(url=url)
            page_num = self.get_page_count(html)
            print(f'Found {page_num} pages of results, roughly 20 videos per page')
            while True:
                page_num_t = int(input('How many pages to download: '))
                if page_num_t > int(page_num):
                    print('Invalid page count, please try again')
                    continue
                break
            # number of videos
            while True:
                print('Count rule: 0 < count <= 20 * pages; 0 downloads every video on the selected pages')
                video_count = int(input('How many videos to download: '))
                if video_count < 0 or video_count > page_num_t * 20:
                    print('Invalid count, please try again')
                    continue
                break
            # collect the links
            links = []
            for i in range(1, page_num_t + 1):
                url_t = f"https://search.bilibili.com/video?keyword={search_name}&order=totalrank&duration=1&tids_1=0&page={i}"
                print(url_t)
                html = self.get_html(url=url_t)
                links = self.get_link(html)
                print(f'Page {i}: video links collected')
                time.sleep(2)  # throttle a little; rapid-fire requests look like a crawler and get the IP blacklisted
            print(f'Collected {len(links)} video links in total')
            # download the videos
            self.get_video(video_count, page_num_t)

    if __name__ == '__main__':
        b = Bilib()
        b.main()
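
One caveat on the merge step: `subprocess.call(cmd, shell=True)` passes unquoted file names through the shell, so any file name containing spaces or shell metacharacters would break the command. A hedged alternative sketch (`video_add_mp4_safe` is a hypothetical replacement, not part of the original script) that passes an argument list and skips the shell entirely, using the same ffmpeg flags the script already builds:

    import subprocess

    def video_add_mp4_safe(filename, video_file, audio_file,
                           ffmpeg=r'X:\bili\video\FFmpeg\ffmpeg'):  # assumed default path
        # An argument list avoids shell quoting issues with awkward file names.
        cmd = [ffmpeg, '-i', video_file, '-i', audio_file,
               '-acodec', 'copy', '-vcodec', 'copy', filename]
        ret = subprocess.call(cmd)
        if ret != 0:
            print(f'ffmpeg exited with code {ret} while merging {filename}')

In practice the titles are already sanitized with `re.findall(r'[\w]+', ...)` and joined with hyphens, so spaces are unlikely here, but the list form is still the safer default and also reports ffmpeg failures instead of silently ignoring them.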