目标网站: http://www.kuwo.cn/
    爬取需求
    1、爬取任意一个歌单页面的html代码
    2、用正则解析数据,拿到歌名和歌曲链接

    1. import requests
    2. import re
    3. from fake_useragent import UserAgent
    4. import os
    5. import xlsxwriter
    class KuwoMusic:
        """Scrape one Kuwo playlist page and export its songs to an Excel file.

        The playlist page is cached as ``<cate_id>-music.html`` in the working
        directory and every song detail page as ``detail/<title>-歌词.html``, so
        repeated runs do not download the same pages again. Parsing is done
        with regular expressions written against the page markup.
        """

        # Seconds to wait for the server before giving up on a request.
        REQUEST_TIMEOUT = 10
        # Directory holding the cached song detail pages.
        DETAIL_DIR = 'detail'

        # One playlist entry (<li>) and the fields extracted from it.
        _SONG_ITEM_RE = re.compile('<li class="song_item flex_c" data-v-1344465b>.*?</li>')
        _SONG_NAME_RE = re.compile('<div class="song_name flex_c" .*?<a title="(.*?)" href="(.*?)".*?data-v-1344465b>(.*?)</a>.*?</div>')
        _SONG_ARTIST_RE = re.compile('<div class="song_artist" .*?<span title="(.*?)" data-v-1344465b>(.*?)</span></div>')
        _SONG_TIME_RE = re.compile('<div class="song_time" data-v-1344465b.*?<span data-v-1344465b>(.*?)</span></div>')
        # Lyric container on a song detail page and the lines inside it.
        _LYRIC_BLOCK_RE = re.compile('<div id="lyric" class="lyric" .*? data-v-34783d0c>.*?<div data-v-34783d0c>(.*?)</div></div>')
        _LYRIC_LINE_RE = re.compile('<p data-v-34783d0c.*?>(.*?)</p>')

        def __init__(self, cate_id):
            """Prepare the URLs and request headers for playlist ``cate_id``.

            Args:
                cate_id: Playlist id as it appears in the ``/playlist_detail/``
                    URL. It is formatted into the URL and into file names.
            """
            self.cate_id = cate_id
            self.base_url = 'https://www.kuwo.cn'
            self.url = f'{self.base_url}/playlist_detail/{cate_id}'
            self.headers = {
                'User-Agent': UserAgent().random,
                # NOTE(review): hard-coded cookie, presumably copied from a
                # browser session and likely to expire - confirm and refresh.
                'Cookie':'Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1642227195; _ga=GA1.2.1122063446.1642227195; _gid=GA1.2.280619156.1642227195; gtoken=j8VE8ybEY8ot; gid=085db92c-1032-4576-a03c-a8fca605c0c0; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1642231516; kw_token=29VMFISUU6K',
            }

        def saveMusicHtml(self):
            """Download the playlist page unless it is cached, then parse it.

            When the download does not return HTTP 200 the reason is printed
            and parsing is skipped, because no page file was written.
            """
            self.headers['Referer'] = self.url
            page_path = f'{self.cate_id}-music.html'
            if not os.path.exists(page_path):
                response = requests.get(
                    self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                if response.status_code != 200:
                    print(response.reason)
                    return
                with open(page_path, 'w', encoding='utf-8') as file:
                    file.write(response.content.decode('utf-8'))
            self.getMusicInfo()

        def getMusicInfo(self):
            """Parse the cached playlist page and write ``<cate_id>-歌词.xlsx``.

            Each row holds index, title, artist, duration, link and lyrics.
            List items whose markup does not match the expected patterns are
            reported and skipped.

            Raises:
                FileNotFoundError: If the playlist page has not been cached.
            """
            with open(f'{self.cate_id}-music.html', 'r', encoding='utf-8') as file:
                html = file.read()

            rows = []
            for position, item in enumerate(self._SONG_ITEM_RE.findall(html)):
                name = self._SONG_NAME_RE.search(item)
                artist = self._SONG_ARTIST_RE.search(item)
                duration = self._SONG_TIME_RE.search(item)
                if not (name and artist and duration):
                    print(f'Skipping unparsable song item #{position}')
                    continue
                title = name.group(1)
                link = self.base_url + name.group(2)
                lyrics = self.getMusicDetail(title, link)
                rows.append((title, artist.group(1), duration.group(1), link, lyrics))

            headings = ['序号', '歌名', '歌手', '时长', '链接', '歌词']
            # The context manager closes the workbook even if a write fails.
            with xlsxwriter.Workbook(f'{self.cate_id}-歌词.xlsx') as workbook:
                worksheet = workbook.add_worksheet()
                bold = workbook.add_format({'bold': 1})
                worksheet.write_row('A1', headings, bold)
                # Data starts on the second sheet row; the index column is 0-based.
                for index, row in enumerate(rows):
                    worksheet.write_row(index + 1, 0, (index, *row))

        def getMusicDetail(self, songName, link):
            """Return the lyrics of one song as a comma-joined string.

            The cached detail page is used when present; otherwise the page is
            downloaded from ``link`` and cached for later runs.

            Args:
                songName: Song title, used to name the cache file.
                link: Absolute URL of the song detail page.

            Returns:
                The lyric lines joined with ``,``, or ``None`` when the download
                fails or the page contains no lyric block.
            """
            cache_path = self._detail_cache_path(songName)
            if os.path.exists(cache_path):
                # Cache hit: no network access is needed.
                with open(cache_path, 'r', encoding='utf-8') as file:
                    html = file.read()
            else:
                self.headers['Referer'] = link
                response = requests.get(
                    link, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                if response.status_code != 200:
                    print(response.reason)
                    return None
                html = response.content.decode('utf-8')
                os.makedirs(self.DETAIL_DIR, exist_ok=True)
                with open(cache_path, 'w', encoding='utf-8') as file:
                    file.write(html)
            return self._parse_lyrics(html)

        def _detail_cache_path(self, songName):
            """Return the cache file path for ``songName``."""
            # Titles may contain characters that are path separators or are
            # rejected by some file systems; replace them with underscores.
            safe_name = re.sub(r'[\\/:*?"<>|]', '_', songName)
            return os.path.join(self.DETAIL_DIR, f'{safe_name}-歌词.html')

        @classmethod
        def _parse_lyrics(cls, html):
            """Extract the lyric lines from a detail page, or return ``None``."""
            block = cls._LYRIC_BLOCK_RE.search(html)
            if block is None:
                return None
            return ','.join(cls._LYRIC_LINE_RE.findall(block.group(1)))
    if __name__ == '__main__':
        # Run the scraper only when this file is executed as a script, so that
        # importing the module does not start any network requests.
        # Replace the literal below with input() to prompt for a playlist id.
        cate_id = '1191296579'
        music = KuwoMusic(cate_id)
        music.saveMusicHtml()