爬取所有的歌手信息(artists.py)

  1. """
  2. 获取所有的歌手信息
  3. """
  4. import requests
  5. from bs4 import BeautifulSoup
  6. import threading
  7. from queue import Queue
  8. from music_163 import sql
  9. class Producer_artists(threading.Thread):
  10. headers = {
  11. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
  12. 'origin': 'https://music.163.com',
  13. 'referer': 'https://music.163.com/'
  14. }
  15. def __init__(self, artists_sort_queue, *args, **kwargs):
  16. super(Producer_artists, self).__init__(*args, **kwargs)
  17. self.artists_sort_queue = artists_sort_queue
  18. self.db = sql.Db('localhost', 'root', 'netis', 'test')
  19. # self.zhangjie_queue = zhangjie_queue
  20. def run(self):
  21. while True:
  22. if self.artists_sort_queue.empty():
  23. print("所有歌手信息都已生产完成,生产者停止")
  24. break
  25. group_id, initial = self.artists_sort_queue.get()
  26. self.save_artist(group_id, initial)
  27. def save_artist(self, group_id, initial):
  28. # group_id = 1001 华语男歌手
  29. # initial 69-95
  30. db = self.db.get_instance()
  31. cursor = db.cursor()
  32. params = {'id': group_id, 'initial': initial}
  33. r = requests.get('http://music.163.com/discover/artist/cat', headers=self.headers, params=params,
  34. )
  35. # 网页解析
  36. soup = BeautifulSoup(r.content.decode(), 'lxml')
  37. artists = soup.find_all('a', class_='nm nm-icn f-thide s-fc0')
  38. for artist in artists:
  39. artist_id = artist['href'].replace('/artist?id=', '').strip()
  40. artist_name = artist['title'].replace('的音乐', '')
  41. try:
  42. sqll = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"
  43. cursor.execute(sqll, (artist_id, artist_name, group_id))
  44. db.commit()
  45. except Exception as e:
  46. # 打印错误日志
  47. print(e)
  48. if __name__ == '__main__':
  49. artists_sort_Queue = Queue(maxsize=0)
  50. urls = ['1001', '1002', '1003', '2001', '2002', '2003', '6001', '6002', '6003', '7001', '7002', '7003', '4001',
  51. '4002', '4003']
  52. for index in urls:
  53. for i in range(65, 91):
  54. qu = index, i
  55. artists_sort_Queue.put(qu)
  56. for i in range(50):
  57. mulu = Producer_artists(artists_sort_Queue)
  58. mulu.start()

根据歌手信息爬取所有的专辑信息(album_by_artist.py)

  1. """
  2. 根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID
  3. """
  4. import requests
  5. from bs4 import BeautifulSoup
  6. import time
  7. from music_163 import sql
  8. import threading
  9. from queue import Queue
  10. class Producer_Album(threading.Thread):
  11. headers = {
  12. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
  13. 'origin': 'https://music.163.com',
  14. 'referer': 'https://music.163.com/'
  15. }
  16. def __init__(self, album_queue, *args, **kwargs):
  17. super(Producer_Album, self).__init__(*args, **kwargs)
  18. self.album_queue = album_queue
  19. self.db = sql.Db('localhost', 'root', 'netis', 'test')
  20. # self.zhangjie_queue = zhangjie_queue
  21. def run(self):
  22. while True:
  23. if self.album_queue.empty():
  24. print("所有歌手信息都已生产完成,生产者停止")
  25. break
  26. artist_id = self.album_queue.get()
  27. self.save_albums(artist_id)
  28. def save_albums(self, artist_id):
  29. db = self.db.get_instance()
  30. cursor = db.cursor()
  31. params = {'id': artist_id, 'limit': '200'}
  32. # 获取歌手个人主页
  33. # 'https://music.163.com/#/artist/album?id=12429072'
  34. r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params)
  35. # proxies=get_random_proxy()
  36. # 网页解析
  37. soup = BeautifulSoup(r.content.decode(), 'lxml')
  38. # 获取所有专辑
  39. ul = soup.find_all('a', class_="tit s-fc0")
  40. times = [tt.get_text() for tt in soup.find_all('span', class_="s-fc3")]
  41. album_ids = [albums['href'].replace('/album?id=', '') for albums in ul]
  42. album_names = [albums.get_text() for albums in ul]
  43. albums = (zip(album_ids, album_names, times))
  44. for albume_id, album_name, album_time in albums:
  45. print(albume_id, album_name, album_time)
  46. try:
  47. sqll = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`, `ALBUM_NAME`, `ALBUM_TIME`) VALUES (%s,%s,%s,%s)"
  48. cursor.execute(sqll, (albume_id, artist_id, album_name, album_time))
  49. db.commit()
  50. except Exception as e:
  51. # 打印错误日志
  52. print(str(albume_id) + ': ' + str(e))
  53. if __name__ == '__main__':
  54. album_Queue = Queue(maxsize=0)
  55. artists = sql.get_all_artist()
  56. for artist_ids in artists:
  57. album_Queue.put(artist_ids['ARTIST_ID'])
  58. for i in range(200):
  59. mulu = Producer_Album(album_Queue)
  60. mulu.start()

根据专辑信息爬取所有的歌曲信息(music_by_album.py)

  1. """
  2. 根据专辑 ID 获取到所有的音乐 ID
  3. """
  4. import requests
  5. from bs4 import BeautifulSoup
  6. from music_163 import sql
  7. import threading
  8. from queue import Queue
  9. class Producer_Music(threading.Thread):
  10. headers = {
  11. "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
  12. 'origin': 'https://music.163.com',
  13. 'referer': 'https://music.163.com/'
  14. }
  15. def __init__(self, music_queue, *args, **kwargs):
  16. super(Producer_Music, self).__init__(*args, **kwargs)
  17. self.music_queue = music_queue
  18. self.db = sql.Db('localhost', 'root', 'netis', 'test')
  19. def run(self):
  20. while True:
  21. if self.music_queue.empty():
  22. print("所有音乐信息都已生产完成,生产者停止")
  23. break
  24. album_id = self.music_queue.get()
  25. self.save_music(album_id)
  26. def save_music(self, album_id):
  27. db = self.db.get_instance()
  28. cursor = db.cursor()
  29. params = {'id': album_id}
  30. # 获取专辑对应的页面
  31. r = requests.get('http://music.163.com/album', headers=self.headers, params=params)
  32. # 网页解析
  33. soup = BeautifulSoup(r.content.decode(), 'html.parser')
  34. body = soup.body
  35. musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐
  36. for music in musics:
  37. music = music.find('a')
  38. music_id = music['href'].replace('/song?id=', '')
  39. music_name = music.getText()
  40. print(music_id, music_name, album_id)
  41. try:
  42. sqll = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)"
  43. cursor.execute(sqll, (music_id, music_name, album_id))
  44. db.commit()
  45. except Exception as e:
  46. # 打印错误日志
  47. print(str(album_id) + ': ' + str(e))
  48. if __name__ == '__main__':
  49. music_Queue = Queue(maxsize=0)
  50. albums = sql.get_all_album()
  51. for album_ids in albums:
  52. music_Queue.put(album_ids[0])
  53. for i in range(400):
  54. mulu = Producer_Music(music_Queue)
  55. mulu.start()

根据歌曲信息爬取其评论信息(comments_by_music.py)

  1. """
  2. 根据歌曲 ID 获得所有的歌曲所对应的评论信息
  3. """
  4. import requests
  5. from music_163 import sql
  6. import time
  7. import threading
  8. import pymysql.cursors
  9. from jsonpath import jsonpath
  10. import pprint
  11. import math
  12. from queue import Queue
  13. import re
  14. from itertools import chain
  15. class Producer_urls(threading.Thread):
  16. headers = {
  17. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  18. 'Accept-Encoding': 'gzip, deflate',
  19. 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
  20. 'Connection': 'keep-alive',
  21. 'Host': 'music.163.com',
  22. 'Upgrade-Insecure-Requests': '1',
  23. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
  24. }
  25. def __init__(self, ids_queue, music_queue, *args, **kwargs):
  26. super(Producer_urls, self).__init__(*args, **kwargs)
  27. self.ids_queue = ids_queue
  28. self.music_queue = music_queue
  29. def run(self):
  30. while True:
  31. if self.ids_queue.empty():
  32. print("所有url都已生产完成,生产者停止")
  33. break
  34. music_ids = self.ids_queue.get()
  35. # print(music_ids)
  36. self.producer_urls(music_ids)
  37. def producer_urls(self, music_ids):
  38. index_url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_{}?offset=0&limit=20'.format(music_ids)
  39. req = requests.get(url=index_url, headers=self.headers).json()
  40. total = math.floor(jsonpath(req, '$..total')[0] / 20)
  41. for ind in range(0, total):
  42. req_urls = "http://music.163.com/api/v1/resource/comments/R_SO_4_{}?offset={}&limit=20".format(
  43. music_ids,
  44. (ind * 20))
  45. self.music_queue.put(req_urls)
  46. class Producer_Comments(threading.Thread):
  47. url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_862102137?csrf_token=' # 歌评url
  48. headers = {
  49. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  50. 'Accept-Encoding': 'gzip, deflate',
  51. 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
  52. 'Connection': 'keep-alive',
  53. 'Host': 'music.163.com',
  54. 'Upgrade-Insecure-Requests': '1',
  55. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
  56. }
  57. def __init__(self, ids_queue, music_queue, *args, **kwargs):
  58. super(Producer_Comments, self).__init__(*args, **kwargs)
  59. self.ids_queue = ids_queue
  60. self.music_queue = music_queue
  61. self.db = sql.Db('localhost', 'root', 'netis', 'test')
  62. def run(self):
  63. while True:
  64. if self.ids_queue.empty() and self.music_queue.empty():
  65. print("所有歌手信息都已生产完成,生产者停止")
  66. break
  67. req_url = self.music_queue.get()
  68. try:
  69. self.get_comments(req_url)
  70. except Exception as e:
  71. print(e)
  72. def get_comments(self, req_url):
  73. db = self.db.get_instance()
  74. cursor = db.cursor()
  75. r = requests.get(url=req_url, headers=self.headers).json()
  76. music_ids = re.search(r'.*R_SO_4_(\d*)\?.*', req_url).group(1)
  77. userIds = jsonpath(r, "$['comments'][*]['user']['userId']")
  78. if userIds:
  79. nicknames = jsonpath(r, "$['comments'][*]['user']['nickname']")
  80. # avatarUrls = jsonpath(r, '$..avatarUrl')
  81. contents = jsonpath(r, "$['comments'][*]['content']")
  82. likedCoutns = jsonpath(r, "$['comments'][*]['likedCount']")
  83. ll = zip(userIds, nicknames, contents, likedCoutns)
  84. for userId, nickname, content, likedCount in ll:
  85. try:
  86. sqll = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `USER_ID`, `NICK_NAME` ,`DETAILS`) VALUES (%s,%s,%s,%s,%s)"
  87. cursor.execute(sqll, (music_ids, content, userId, nickname, likedCount))
  88. db.commit()
  89. except Exception as e:
  90. # 打印错误日志
  91. print(str(req_url) + ': ' + str(e))
  92. else:
  93. print(req_url + "不存在")
  94. if __name__ == '__main__':
  95. music_Queue = Queue(maxsize=0)
  96. ids_Queue = Queue(maxsize=0)
  97. musics_id = sql.get_all_music()
  98. for music_id in musics_id:
  99. ids_Queue.put(music_id[0])
  100. # print(ids_Queue.get())
  101. for i in range(10):
  102. mulu = Producer_urls(ids_Queue, music_Queue)
  103. mulu.start()
  104. for i in range(10):
  105. mulu = Producer_Comments(ids_Queue, music_Queue)
  106. mulu.start()

数据库相关的语句(sql.py)

  1. """
  2. 一般 Python 用于连接 MySQL 的工具:pymysql
  3. """
  4. import pymysql.cursors
  5. import threading
  6. class Db(object):
  7. def __init__(self, host=None, username=None, password=None, dbname=None):
  8. self.pool = {}
  9. self.host = host
  10. self.username = username
  11. self.password = password
  12. self.dbname = dbname
  13. self.charset = 'utf8mb4'
  14. self.cursorclass = pymysql.cursors.DictCursor
  15. def get_instance(self, ):
  16. name = threading.current_thread().name
  17. if name not in self.pool:
  18. conn = pymysql.connect(self.host, self.username, self.password, self.dbname)
  19. self.pool[name] = conn
  20. return self.pool[name]
  21. def insert_artist(self, artist_id, artist_name, group_id, dbb):
  22. with self.get_instance() as db:
  23. sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"
  24. db.execute(sql, (artist_id, artist_name, group_id))
  25. dbb.commit()
  26. connection = pymysql.connect(host='localhost',
  27. user='root',
  28. password='netis',
  29. db='test',
  30. charset='utf8mb4',
  31. cursorclass=pymysql.cursors.DictCursor)
  32. connection0 = pymysql.connect(host='localhost',
  33. user='root',
  34. password='netis',
  35. db='test',
  36. charset='utf8mb4',
  37. cursorclass=pymysql.cursors.SSCursor)
  38. # 保存评论
  39. def insert_comments(self, music_id, comments, detail, connection0):
  40. with connection0.cursor() as cursor:
  41. sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)"
  42. cursor.execute(sql, (music_id, comments, detail))
  43. connection0.commit()
  44. # 保存音乐
  45. def insert_music(music_id, music_name, album_id):
  46. with connection.cursor() as cursor:
  47. sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)"
  48. cursor.execute(sql, (music_id, music_name, album_id))
  49. connection.commit()
  50. # 保存专辑
  51. def insert_album(album_id, artist_id):
  52. with connection.cursor() as cursor:
  53. sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)"
  54. cursor.execute(sql, (album_id, artist_id))
  55. connection.commit()
  56. # 保存歌手
  57. def insert_artist(self, artist_id, artist_name, group_id):
  58. with self.connection.cursor() as cursor:
  59. sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"
  60. cursor.execute(sql, (artist_id, artist_name, group_id))
  61. connection.commit()
  62. # 获取所有歌手的 ID
  63. def get_all_artist():
  64. with connection.cursor() as cursor:
  65. sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID"
  66. cursor.execute(sql, ())
  67. return cursor.fetchall()
  68. # 获取所有专辑的 ID
  69. def get_all_album():
  70. with connection0.cursor() as cursor:
  71. sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID"
  72. cursor.execute(sql, ())
  73. return cursor.fetchall()
  74. # 获取所有音乐的 ID
  75. def get_all_music():
  76. with connection0.cursor() as cursor:
  77. sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID"
  78. cursor.execute(sql, ())
  79. return cursor.fetchall()
  80. # 获取前一半音乐的 ID
  81. def get_before_music():
  82. with connection.cursor() as cursor:
  83. sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000"
  84. cursor.execute(sql, ())
  85. return cursor.fetchall()
  86. # 获取后一半音乐的 ID
  87. def get_after_music():
  88. with connection.cursor() as cursor:
  89. sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429"
  90. cursor.execute(sql, ())
  91. return cursor.fetchall()
  92. def dis_connect():
  93. connection.close()

mysql.sql

-- Use utf8mb4 for this session and disable foreign-key checks while the
-- tables are dropped and recreated.
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for albums
-- One row per album inserted by the album crawler (album_by_artist.py).
-- NOTE(review): the table default charset is latin1, while every text
-- column overrides it with utf8mb4.
-- ----------------------------
DROP TABLE IF EXISTS `albums`;
CREATE TABLE `albums`  (
  -- Surrogate primary key.
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  -- Album id taken from the album link on the artist's album page.
  `ALBUM_ID` int(11) NOT NULL,
  -- Id of the artist whose album page was crawled.
  `ARTIST_ID` int(11) NULL DEFAULT NULL,
  -- Text of the album link.
  `ALBUM_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  -- Text scraped from the page next to the album, stored as a plain string.
  `ALBUM_TIME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 446526 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Table structure for artists
-- One row per artist inserted by the artist crawler (artists.py).
-- ----------------------------
DROP TABLE IF EXISTS `artists`;
CREATE TABLE `artists`  (
  -- Surrogate primary key.
  `ID` int(100) NOT NULL AUTO_INCREMENT,
  -- Title of the artist link with the '的音乐' suffix removed.
  `ARTIST_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  -- Artist id taken from the artist link.
  `ARTIST_ID` int(100) NULL DEFAULT NULL,
  -- Category id (the crawler's group_id, e.g. 1001) the artist was found under.
  `ARTIST_SORT_ID` int(100) NULL DEFAULT NULL,
  PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 33870 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Table structure for comments
-- One row per comment inserted by the comment crawler (comments_by_music.py).
-- ----------------------------
DROP TABLE IF EXISTS `comments`;
CREATE TABLE `comments`  (
  -- Surrogate primary key.
  `ID` int(100) NOT NULL AUTO_INCREMENT,
  -- Song id parsed from the comment-API URL.
  `MUSIC_ID` int(100) NULL DEFAULT NULL,
  -- Comment text (the `content` field of the API response).
  `COMMENTS` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  -- Id of the commenting user, stored as text.
  `USER_ID` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  -- Nickname of the commenting user.
  `NICK_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  -- The crawler stores the comment's like count (`likedCount`) here.
  `DETAILS` int(100) NULL DEFAULT NULL,
  PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 52092 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Table structure for musics
-- One row per song inserted by the song crawler (music_by_album.py).
-- ----------------------------
DROP TABLE IF EXISTS `musics`;
CREATE TABLE `musics`  (
  -- Song id taken from the song link on the album page.
  `MUSIC_ID` int(20) NOT NULL,
  -- Text of the song link.  The crawler stores titles untruncated, so the
  -- column is as wide as the other name columns; varchar(20) could not hold
  -- longer titles.
  `MUSIC_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
  -- Surrogate primary key.
  `ID` int(20) NOT NULL AUTO_INCREMENT,
  -- Id of the album whose page listed the song.
  `ALBUM_ID` int(11) NULL DEFAULT NULL,
  PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 19661 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;

-- Re-enable foreign-key checks now that all tables exist.
SET FOREIGN_KEY_CHECKS = 1;