# 普通爬虫 - 下载酷我音乐并导出excel - 《爬虫代码集》

import requests
import re
from fake_useragent import UserAgent
import os
import xlsxwriter
import json
class KuwoMusic:
    """Scrape one Kuwo Music playlist: download its songs and export their details.

    The playlist page and each song detail page are stored on disk as HTML
    files and reused on later runs instead of being fetched again.
    """

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    # Characters that cannot be used in a file name on common file systems.
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    # Patterns tied to the exact markup of the Kuwo pages, including the
    # generated data-v-* attributes.
    _SONG_ITEM = re.compile('<li class="song_item flex_c" data-v-1344465b>.*?</li>')
    _SONG_NAME = re.compile('<div class="song_name flex_c" .*?<a title="(.*?)" href="(.*?)".*?data-v-1344465b>(.*?)</a>.*?</div>')
    _SONG_ARTIST = re.compile('<div class="song_artist" .*?<span title="(.*?)" data-v-1344465b>(.*?)</span></div>')
    _SONG_TIME = re.compile('<div class="song_time" data-v-1344465b.*?<span data-v-1344465b>(.*?)</span></div>')
    _LYRIC_BLOCK = re.compile('<div id="lyric" class="lyric" .*? data-v-34783d0c>.*?<div data-v-34783d0c>(.*?)</div></div>')
    _LYRIC_LINE = re.compile('<p data-v-34783d0c.*?>(.*?)</p>')

    def __init__(self,cate_id):
        """Prepare the playlist URL and the request headers.

        Args:
            cate_id: Playlist id as a string; it is appended to the
                '/playlist_detail/' path and used in output file names.
        """
        self.cate_id = cate_id
        self.base_url = 'https://www.kuwo.cn'
        self.url = self.base_url + '/playlist_detail/' + cate_id
        # NOTE(review): hard-coded cookie / kw_token, presumably captured from
        # a browser session and likely to expire -- confirm before relying on it.
        self.headers = {
            'User-Agent':UserAgent().random,
            'Cookie':'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1642227195; _ga=GA1.2.1122063446.1642227195; _gid=GA1.2.280619156.1642227195; gtoken=j8VE8ybEY8ot; gid=085db92c-1032-4576-a03c-a8fca605c0c0; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1642239672; kw_token=6O9JR1N139B'
        }

    def checkDir(self,path):
        """Create directory *path* (and any missing parents) if it does not exist."""
        os.makedirs(path, exist_ok=True)

    @classmethod
    def _safeFileName(cls, name):
        """Return *name* with characters that are invalid in file names replaced by '_'."""
        return cls._UNSAFE_NAME_CHARS.sub('_', name)

    def _get(self, url, **kwargs):
        """GET *url* with the shared timeout; return the response, or None on a network error."""
        kwargs.setdefault('timeout', self.REQUEST_TIMEOUT)
        try:
            return requests.get(url, **kwargs)
        except requests.RequestException as err:
            print(err)
            return None

    @classmethod
    def _parseSongList(cls, html):
        """Extract (title, href, artist, duration) tuples from the playlist page HTML.

        Items whose markup does not match the expected patterns are skipped
        instead of aborting the whole run.
        """
        songs = []
        for li in cls._SONG_ITEM.findall(html):
            name = cls._SONG_NAME.search(li)
            artist = cls._SONG_ARTIST.search(li)
            duration = cls._SONG_TIME.search(li)
            if name is None or artist is None or duration is None:
                continue
            songs.append((name.group(1), name.group(2), artist.group(1), duration.group(1)))
        return songs

    @classmethod
    def _parseLyric(cls, html):
        """Return the lyric lines of the first lyric block joined by ',' ('' when there is none)."""
        block = cls._LYRIC_BLOCK.search(html)
        if block is None:
            return ''
        return ','.join(cls._LYRIC_LINE.findall(block.group(1)))

    def saveMusicHtml(self):
        """Fetch the playlist page unless it is already on disk, then process it.

        The page is stored as '<cate_id>-music.html'. When the fetch fails,
        the failure is printed and processing is skipped, because there is no
        page file to read.
        """
        self.headers['Referer'] = self.url
        pagePath = f'{self.cate_id}-music.html'
        if not os.path.exists(pagePath):
            response = self._get(self.url, headers=self.headers)
            if response is None:
                return
            if response.status_code != 200:
                print(response.reason)
                return
            with open(pagePath, 'w', encoding='utf-8') as file:
                file.write(response.content.decode('utf-8'))
        self.getMusicInfo()

    def getMusicInfo(self):
        """Process every song on the stored playlist page and export the results.

        For each song the lyrics are looked up and the audio is downloaded;
        only songs whose download succeeded are written to the workbook.

        Raises:
            FileNotFoundError: if '<cate_id>-music.html' does not exist.
        """
        with open(f'{self.cate_id}-music.html','r',encoding='utf-8') as file:
            html = file.read()
        rows = []
        for title, href, artist, duration in self._parseSongList(html):
            link = self.base_url + href
            songLyric = self.getMusicDetail(title, link)
            print(f'下载歌曲{title}中...')
            # The song id is the last path segment of the detail link.
            songId = href.split('/')[-1]
            if self.downMusic(title, songId):
                rows.append((title, artist, duration, link, songLyric))
        if rows:
            self._exportExcel(rows)
        print('程序执行完毕')

    def _exportExcel(self, rows):
        """Write (title, artist, duration, link, lyric) rows to '<cate_id>-歌词.xlsx'."""
        print('导出数据中...')
        workbook = xlsxwriter.Workbook(f'{self.cate_id}-歌词.xlsx')
        worksheet = workbook.add_worksheet()
        bold = workbook.add_format({'bold': 1})
        worksheet.write_row('A1', ['序号', '歌名', '歌手', '时长', '链接', '歌词'], bold)
        # The index column keeps the 0-based numbering.
        worksheet.write_column('A2', list(range(len(rows))))
        for column, values in zip('BCDEF', zip(*rows)):
            worksheet.write_column(f'{column}2', list(values))
        workbook.close()

    def downMusic(self,songName,songId):
        """Download one song to 'music/<songName>.mp3'.

        Args:
            songName: Song title, used (sanitised) as the file name.
            songId: Song id sent as the 'mid' parameter of the playUrl API.

        Returns:
            True only when the audio file was actually written, otherwise False.
        """
        url = 'https://www.kuwo.cn/api/v1/www/music/playUrl'
        params = {
            'mid':songId,
            'type':'music',
            'httpsStatus':1,
            'reqId':'b53f83a1-75e2-11ec-9e65-9b1b46a0f773'
        }
        response = self._get(url, params=params, headers=self.headers)
        if response is None or response.status_code != 200:
            return False
        try:
            result = json.loads(response.content.decode('utf-8'))
        except ValueError:
            # The body was not valid UTF-8 JSON.
            return False
        if not isinstance(result, dict) or result.get('code') != 200:
            return False
        data = result.get('data')
        audioUrl = data.get('url') if isinstance(data, dict) else None
        if not audioUrl:
            return False
        audio = self._get(audioUrl)
        if audio is None or audio.status_code != 200:
            return False
        self.checkDir('music')
        with open(f'music/{self._safeFileName(songName)}.mp3','wb') as file:
            file.write(audio.content)
        return True

    def getMusicDetail(self,songName,link):
        """Return the lyrics of one song as a single ','-joined string.

        The detail page is read from 'detail/<songName>-歌词.html' when that
        file exists; otherwise it is fetched from *link* and stored there.
        The file is keyed by title only, so songs sharing a title share it.

        Returns:
            The joined lyric lines, or '' when the page could not be fetched
            or holds no lyric block.
        """
        # Set even on a cache hit: downMusic reuses these headers afterwards.
        self.headers['Referer'] = link
        self.checkDir('detail')
        detailPath = f'detail/{self._safeFileName(songName)}-歌词.html'
        if os.path.exists(detailPath):
            with open(detailPath, 'r', encoding='utf-8') as file:
                html = file.read()
        else:
            response = self._get(link, headers=self.headers)
            if response is None:
                return ''
            if response.status_code != 200:
                print(response.reason)
                return ''
            html = response.content.decode('utf-8')
            with open(detailPath, 'w', encoding='utf-8') as file:
                file.write(html)
        return self._parseLyric(html)
# Playlist id to scrape. To ask for it interactively instead, use:
#     cate_id = input('请输入歌曲分类id')
cate_id = '3301373706'
# Build the scraper for that playlist and run the whole pipeline.
music = KuwoMusic(cate_id=cate_id)
music.saveMusicHtml()