目标网站: http://www.kuwo.cn/
爬取需求
1、爬取任意一个歌单页面的html代码
2、用正则解析数据,拿到歌名和歌曲链接
import os
import re

import requests
import xlsxwriter
from fake_useragent import UserAgent


class KuwoMusic:
    """Scrape one Kuwo playlist page and export its songs to an Excel file.

    The playlist page and every song detail page are cached on disk as HTML,
    so re-running the scraper only downloads what is still missing.

    Attributes:
        cate_id: Playlist id, used in the playlist URL and in output file names.
        base_url: Site root that relative song links are appended to.
        url: Full URL of the playlist page.
        headers: HTTP headers sent with every request; ``Referer`` is
            overwritten before each download.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Directory holding the cached song detail pages.
    DETAIL_DIR = 'detail'

    # Characters that cannot appear in a file name on common file systems.
    _UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')

    # Patterns are compiled once here instead of on every call.
    _SONG_ITEM_RE = re.compile(
        r'<li class="song_item flex_c" data-v-1344465b>.*?</li>'
    )
    _SONG_NAME_RE = re.compile(
        r'<div class="song_name flex_c" .*?<a title="(.*?)" href="(.*?)"'
        r'.*?data-v-1344465b>(.*?)</a>.*?</div>'
    )
    _SONG_ARTIST_RE = re.compile(
        r'<div class="song_artist" .*?<span title="(.*?)" data-v-1344465b>'
        r'(.*?)</span></div>'
    )
    _SONG_TIME_RE = re.compile(
        r'<div class="song_time" data-v-1344465b.*?<span data-v-1344465b>'
        r'(.*?)</span></div>'
    )
    _LYRIC_BLOCK_RE = re.compile(
        r'<div id="lyric" class="lyric" .*? data-v-34783d0c>.*?'
        r'<div data-v-34783d0c>(.*?)</div></div>'
    )
    _LYRIC_LINE_RE = re.compile(r'<p data-v-34783d0c.*?>(.*?)</p>')

    def __init__(self, cate_id):
        """Store the playlist id and build the URL and request headers.

        Args:
            cate_id: Playlist id as a string (it is concatenated into the URL).
        """
        self.cate_id = cate_id
        self.base_url = 'https://www.kuwo.cn'
        self.url = self.base_url + '/playlist_detail/' + cate_id
        self.headers = {
            'User-Agent': UserAgent().random,
            'Cookie': (
                'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1642227195; '
                '_ga=GA1.2.1122063446.1642227195; '
                '_gid=GA1.2.280619156.1642227195; '
                'gtoken=j8VE8ybEY8ot; '
                'gid=085db92c-1032-4576-a03c-a8fca605c0c0; '
                'Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1642231516; '
                'kw_token=29VMFISUU6K'
            ),
        }

    def _fetch(self, url):
        """Download *url* with the current headers.

        Returns:
            The response body decoded as UTF-8, or ``None`` when the request
            fails or the server does not answer with status 200. The failure
            reason is printed in both cases.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as err:
            print(err)
            return None
        if response.status_code != 200:
            print(response.reason)
            return None
        return response.content.decode('utf-8')

    def _detail_path(self, songName):
        """Return the cache file path for the detail page of *songName*."""
        # NOTE(review): the cache is keyed by title only, so two different
        # songs sharing a title also share one cached page (and lyrics).
        safe_name = self._UNSAFE_FILENAME_CHARS_RE.sub('_', songName)
        return os.path.join(self.DETAIL_DIR, f'{safe_name}-歌词.html')

    def saveMusicHtml(self):
        """Ensure the playlist page is cached on disk, then parse it.

        The page is downloaded only when ``<cate_id>-music.html`` does not
        exist yet. If the download fails nothing is parsed, because there is
        no page file to read.
        """
        self.headers['Referer'] = self.url
        page_path = f'{self.cate_id}-music.html'
        if not os.path.exists(page_path):
            html = self._fetch(self.url)
            if html is None:
                return
            with open(page_path, 'w', encoding='utf-8') as file:
                file.write(html)
        self.getMusicInfo()

    def getMusicInfo(self):
        """Parse the cached playlist page and write ``<cate_id>-歌词.xlsx``.

        Each song contributes one row: index, title, artist, duration, link
        and lyrics. List items that do not match the expected markup are
        skipped instead of aborting the whole export.

        Raises:
            FileNotFoundError: If the playlist page has not been cached yet.
        """
        with open(f'{self.cate_id}-music.html', 'r', encoding='utf-8') as file:
            html = file.read()

        rows = []
        for item in self._SONG_ITEM_RE.findall(html):
            name = self._SONG_NAME_RE.search(item)
            artist = self._SONG_ARTIST_RE.search(item)
            duration = self._SONG_TIME_RE.search(item)
            if not (name and artist and duration):
                continue
            title = name.group(1)
            link = self.base_url + name.group(2)
            lyrics = self.getMusicDetail(title, link)
            # The index column counts from 0, in page order.
            rows.append(
                [len(rows), title, artist.group(1), duration.group(1), link, lyrics]
            )

        headings = ['序号', '歌名', '歌手', '时长', '链接', '歌词']
        # The context manager closes the workbook even if a write fails.
        with xlsxwriter.Workbook(f'{self.cate_id}-歌词.xlsx') as workbook:
            worksheet = workbook.add_worksheet()
            bold = workbook.add_format({'bold': 1})
            worksheet.write_row('A1', headings, bold)
            for row_number, row in enumerate(rows, start=1):
                worksheet.write_row(row_number, 0, row)

    def getMusicDetail(self, songName, link):
        """Return the lyrics of one song, using the on-disk cache when possible.

        Args:
            songName: Song title; used to name the cached detail page.
            link: Absolute URL of the song detail page.

        Returns:
            The lyric lines joined with ``','``, or ``None`` when the page
            cannot be downloaded or contains no lyric block.
        """
        self.headers['Referer'] = link
        cache_path = self._detail_path(songName)
        if os.path.exists(cache_path):
            # Cached pages are served without touching the network.
            with open(cache_path, 'r', encoding='utf-8') as file:
                html = file.read()
        else:
            html = self._fetch(link)
            if html is None:
                return None
            os.makedirs(self.DETAIL_DIR, exist_ok=True)
            with open(cache_path, 'w', encoding='utf-8') as file:
                file.write(html)

        lyric_block = self._LYRIC_BLOCK_RE.search(html)
        if lyric_block is None:
            return None
        return ','.join(self._LYRIC_LINE_RE.findall(lyric_block.group(1)))


if __name__ == '__main__':
    # Playlist id to scrape; replace with input() to choose one interactively.
    cate_id = '1191296579'
    music = KuwoMusic(cate_id)
    music.saveMusicHtml()
