import requests
import time
import os
from lxml import etree
# Crawl the first two listing pages of a Ximalaya audiobook album and
# download every episode's MP3 into a folder named after the album.
# Hoisted out of the loop: the header dict is loop-invariant.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}

for page in range(1, 3):
    print(f'正在抓取第{page}页=================')
    page_url = f'https://www.ximalaya.com/youshengshu/22963309/p{page}/'
    page_text = requests.get(url=page_url, headers=HEADERS).text
    page_tree = etree.HTML(page_text)
    # NOTE: fixed invalid '////h1' XPath (libxml2 rejects '//' followed by '//').
    # Album title looks like "<album name> | ..." — keep only the part before ' | '.
    media_dir = page_tree.xpath('//h1[@class="title k_Z"]/text()')[0].split(' | ')[0]
    div_list = page_tree.xpath('//div[@class="text Mi_"]')
    # Idempotent directory creation; replaces the race-prone exists()+mkdir pair.
    os.makedirs(media_dir, exist_ok=True)
    # enumerate() numbers each episode within the current page.
    for num, div in enumerate(div_list):
        media_title = div.xpath('./a/@title')[0]  # e.g. "重生八零:媳妇有点辣 0001 我是破鞋?"
        # Global episode number across pages (30 episodes per listing page),
        # zero-padded to two digits so filenames sort lexicographically.
        media_index = f'{(page - 1) * 30 + num + 1:02d}'
        media_path = f"{media_index} {media_title.split(' ')[-1]}.mp3"
        media_href = div.xpath('./a/@href')[0]  # e.g. /youshengshu/22963309/456562317
        media_id = media_href.split('/')[-1]    # trailing path segment is the episode id
        # The actual audio URL is served by a JSON ajax endpoint keyed on the episode id.
        ajax_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={media_id}&ptype=1'
        time.sleep(0.5)  # throttle: be polite to the server
        ajax_data = requests.get(url=ajax_url, headers=HEADERS).json()
        media_url = ajax_data['data']['src']
        print(media_path, '开始下载...')
        time.sleep(0.5)
        media_data = requests.get(url=media_url, headers=HEADERS).content
        with open(f'{media_dir}/{media_path}', 'wb') as fp:
            fp.write(media_data)
        print(media_path, '下载完成!')

# Summary after all pages are done (uses the last loop values of page/media_index).
print(media_dir, f'所有章节已下载完毕 共{page}页 {media_index}章节!!!')
"""
注意:
01.if not os.path.exists(path)---判断是否有某个文件夹
02.os.mkdir(path)---创建文件夹
03.for i, path in enumerate(seq)---enumerate()枚举法
"""