爬取所有的歌手信息(artists.py)
"""获取所有的歌手信息"""import requestsfrom bs4 import BeautifulSoupimport threadingfrom queue import Queuefrom music_163 import sqlclass Producer_artists(threading.Thread):headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",'origin': 'https://music.163.com','referer': 'https://music.163.com/'}def __init__(self, artists_sort_queue, *args, **kwargs):super(Producer_artists, self).__init__(*args, **kwargs)self.artists_sort_queue = artists_sort_queueself.db = sql.Db('localhost', 'root', 'netis', 'test')# self.zhangjie_queue = zhangjie_queuedef run(self):while True:if self.artists_sort_queue.empty():print("所有歌手信息都已生产完成,生产者停止")breakgroup_id, initial = self.artists_sort_queue.get()self.save_artist(group_id, initial)def save_artist(self, group_id, initial):# group_id = 1001 华语男歌手# initial 69-95db = self.db.get_instance()cursor = db.cursor()params = {'id': group_id, 'initial': initial}r = requests.get('http://music.163.com/discover/artist/cat', headers=self.headers, params=params,)# 网页解析soup = BeautifulSoup(r.content.decode(), 'lxml')artists = soup.find_all('a', class_='nm nm-icn f-thide s-fc0')for artist in artists:artist_id = artist['href'].replace('/artist?id=', '').strip()artist_name = artist['title'].replace('的音乐', '')try:sqll = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"cursor.execute(sqll, (artist_id, artist_name, group_id))db.commit()except Exception as e:# 打印错误日志print(e)if __name__ == '__main__':artists_sort_Queue = Queue(maxsize=0)urls = ['1001', '1002', '1003', '2001', '2002', '2003', '6001', '6002', '6003', '7001', '7002', '7003', '4001','4002', '4003']for index in urls:for i in range(65, 91):qu = index, iartists_sort_Queue.put(qu)for i in range(50):mulu = Producer_artists(artists_sort_Queue)mulu.start()
根据歌手信息爬取所有的专辑信息(album_by _artist.py)
"""根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID"""import requestsfrom bs4 import BeautifulSoupimport timefrom music_163 import sqlimport threadingfrom queue import Queueclass Producer_Album(threading.Thread):headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",'origin': 'https://music.163.com','referer': 'https://music.163.com/'}def __init__(self, album_queue, *args, **kwargs):super(Producer_Album, self).__init__(*args, **kwargs)self.album_queue = album_queueself.db = sql.Db('localhost', 'root', 'netis', 'test')# self.zhangjie_queue = zhangjie_queuedef run(self):while True:if self.album_queue.empty():print("所有歌手信息都已生产完成,生产者停止")breakartist_id = self.album_queue.get()self.save_albums(artist_id)def save_albums(self, artist_id):db = self.db.get_instance()cursor = db.cursor()params = {'id': artist_id, 'limit': '200'}# 获取歌手个人主页# 'https://music.163.com/#/artist/album?id=12429072'r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params)# proxies=get_random_proxy()# 网页解析soup = BeautifulSoup(r.content.decode(), 'lxml')# 获取所有专辑ul = soup.find_all('a', class_="tit s-fc0")times = [tt.get_text() for tt in soup.find_all('span', class_="s-fc3")]album_ids = [albums['href'].replace('/album?id=', '') for albums in ul]album_names = [albums.get_text() for albums in ul]albums = (zip(album_ids, album_names, times))for albume_id, album_name, album_time in albums:print(albume_id, album_name, album_time)try:sqll = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`, `ALBUM_NAME`, `ALBUM_TIME`) VALUES (%s,%s,%s,%s)"cursor.execute(sqll, (albume_id, artist_id, album_name, album_time))db.commit()except Exception as e:# 打印错误日志print(str(albume_id) + ': ' + str(e))if __name__ == '__main__':album_Queue = Queue(maxsize=0)artists = sql.get_all_artist()for artist_ids in artists:album_Queue.put(artist_ids['ARTIST_ID'])for i in range(200):mulu = Producer_Album(album_Queue)mulu.start()
根据专辑信息爬取所有的歌曲信息(music_by _album.py)
"""根据专辑 ID 获取到所有的音乐 ID"""import requestsfrom bs4 import BeautifulSoupfrom music_163 import sqlimport threadingfrom queue import Queueclass Producer_Music(threading.Thread):headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",'origin': 'https://music.163.com','referer': 'https://music.163.com/'}def __init__(self, music_queue, *args, **kwargs):super(Producer_Music, self).__init__(*args, **kwargs)self.music_queue = music_queueself.db = sql.Db('localhost', 'root', 'netis', 'test')def run(self):while True:if self.music_queue.empty():print("所有音乐信息都已生产完成,生产者停止")breakalbum_id = self.music_queue.get()self.save_music(album_id)def save_music(self, album_id):db = self.db.get_instance()cursor = db.cursor()params = {'id': album_id}# 获取专辑对应的页面r = requests.get('http://music.163.com/album', headers=self.headers, params=params)# 网页解析soup = BeautifulSoup(r.content.decode(), 'html.parser')body = soup.bodymusics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐for music in musics:music = music.find('a')music_id = music['href'].replace('/song?id=', '')music_name = music.getText()print(music_id, music_name, album_id)try:sqll = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)"cursor.execute(sqll, (music_id, music_name, album_id))db.commit()except Exception as e:# 打印错误日志print(str(album_id) + ': ' + str(e))if __name__ == '__main__':music_Queue = Queue(maxsize=0)albums = sql.get_all_album()for album_ids in albums:music_Queue.put(album_ids[0])for i in range(400):mulu = Producer_Music(music_Queue)mulu.start()
根据歌曲信息爬取其评论条数(comments_by _music.py)
"""根据歌曲 ID 获得所有的歌曲所对应的评论信息"""import requestsfrom music_163 import sqlimport timeimport threadingimport pymysql.cursorsfrom jsonpath import jsonpathimport pprintimport mathfrom queue import Queueimport refrom itertools import chainclass Producer_urls(threading.Thread):headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9','Accept-Encoding': 'gzip, deflate','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8','Connection': 'keep-alive','Host': 'music.163.com','Upgrade-Insecure-Requests': '1','User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}def __init__(self, ids_queue, music_queue, *args, **kwargs):super(Producer_urls, self).__init__(*args, **kwargs)self.ids_queue = ids_queueself.music_queue = music_queuedef run(self):while True:if self.ids_queue.empty():print("所有url都已生产完成,生产者停止")breakmusic_ids = self.ids_queue.get()# print(music_ids)self.producer_urls(music_ids)def producer_urls(self, music_ids):index_url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_{}?offset=0&limit=20'.format(music_ids)req = requests.get(url=index_url, headers=self.headers).json()total = math.floor(jsonpath(req, '$..total')[0] / 20)for ind in range(0, total):req_urls = "http://music.163.com/api/v1/resource/comments/R_SO_4_{}?offset={}&limit=20".format(music_ids,(ind * 20))self.music_queue.put(req_urls)class Producer_Comments(threading.Thread):url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_862102137?csrf_token=' # 歌评urlheaders = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9','Accept-Encoding': 'gzip, deflate','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8','Connection': 'keep-alive','Host': 'music.163.com','Upgrade-Insecure-Requests': '1','User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}def __init__(self, ids_queue, music_queue, *args, **kwargs):super(Producer_Comments, self).__init__(*args, **kwargs)self.ids_queue = ids_queueself.music_queue = music_queueself.db = sql.Db('localhost', 'root', 'netis', 'test')def run(self):while True:if self.ids_queue.empty() and self.music_queue.empty():print("所有歌手信息都已生产完成,生产者停止")breakreq_url = self.music_queue.get()try:self.get_comments(req_url)except Exception as e:print(e)def get_comments(self, req_url):db = self.db.get_instance()cursor = db.cursor()r = requests.get(url=req_url, headers=self.headers).json()music_ids = re.search(r'.*R_SO_4_(\d*)\?.*', req_url).group(1)userIds = jsonpath(r, "$['comments'][*]['user']['userId']")if userIds:nicknames = jsonpath(r, "$['comments'][*]['user']['nickname']")# avatarUrls = jsonpath(r, '$..avatarUrl')contents = jsonpath(r, "$['comments'][*]['content']")likedCoutns = jsonpath(r, "$['comments'][*]['likedCount']")ll = zip(userIds, nicknames, contents, likedCoutns)for userId, nickname, content, likedCount in ll:try:sqll = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `USER_ID`, `NICK_NAME` ,`DETAILS`) VALUES (%s,%s,%s,%s,%s)"cursor.execute(sqll, (music_ids, content, userId, nickname, likedCount))db.commit()except Exception as e:# 打印错误日志print(str(req_url) + ': ' + str(e))else:print(req_url + "不存在")if __name__ == '__main__':music_Queue = Queue(maxsize=0)ids_Queue = Queue(maxsize=0)musics_id = sql.get_all_music()for music_id in musics_id:ids_Queue.put(music_id[0])# print(ids_Queue.get())for i in range(10):mulu = Producer_urls(ids_Queue, music_Queue)mulu.start()for i in range(10):mulu = Producer_Comments(ids_Queue, music_Queue)mulu.start()
数据库相关的语句(sql.py)
"""一般 Python 用于连接 MySQL 的工具:pymysql"""import pymysql.cursorsimport threadingclass Db(object):def __init__(self, host=None, username=None, password=None, dbname=None):self.pool = {}self.host = hostself.username = usernameself.password = passwordself.dbname = dbnameself.charset = 'utf8mb4'self.cursorclass = pymysql.cursors.DictCursordef get_instance(self, ):name = threading.current_thread().nameif name not in self.pool:conn = pymysql.connect(self.host, self.username, self.password, self.dbname)self.pool[name] = connreturn self.pool[name]def insert_artist(self, artist_id, artist_name, group_id, dbb):with self.get_instance() as db:sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"db.execute(sql, (artist_id, artist_name, group_id))dbb.commit()connection = pymysql.connect(host='localhost',user='root',password='netis',db='test',charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)connection0 = pymysql.connect(host='localhost',user='root',password='netis',db='test',charset='utf8mb4',cursorclass=pymysql.cursors.SSCursor)# 保存评论def insert_comments(self, music_id, comments, detail, connection0):with connection0.cursor() as cursor:sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)"cursor.execute(sql, (music_id, comments, detail))connection0.commit()# 保存音乐def insert_music(music_id, music_name, album_id):with connection.cursor() as cursor:sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)"cursor.execute(sql, (music_id, music_name, album_id))connection.commit()# 保存专辑def insert_album(album_id, artist_id):with connection.cursor() as cursor:sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)"cursor.execute(sql, (album_id, artist_id))connection.commit()# 保存歌手def insert_artist(self, artist_id, artist_name, group_id):with self.connection.cursor() as cursor:sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"cursor.execute(sql, (artist_id, artist_name, group_id))connection.commit()# 获取所有歌手的 IDdef get_all_artist():with connection.cursor() as cursor:sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID"cursor.execute(sql, ())return cursor.fetchall()# 获取所有专辑的 IDdef get_all_album():with connection0.cursor() as cursor:sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID"cursor.execute(sql, ())return cursor.fetchall()# 获取所有音乐的 IDdef get_all_music():with connection0.cursor() as cursor:sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID"cursor.execute(sql, ())return cursor.fetchall()# 获取前一半音乐的 IDdef get_before_music():with connection.cursor() as cursor:sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000"cursor.execute(sql, ())return cursor.fetchall()# 获取后一半音乐的 IDdef get_after_music():with connection.cursor() as cursor:sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429"cursor.execute(sql, ())return cursor.fetchall()def dis_connect():connection.close()
mysql.sql
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for albums
-- ----------------------------
DROP TABLE IF EXISTS `albums`;
CREATE TABLE `albums` (
`ID` int(11) NOT NULL AUTO_INCREMENT,
`ALBUM_ID` int(11) NOT NULL,
`ARTIST_ID` int(11) NULL DEFAULT NULL,
`ALBUM_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`ALBUM_TIME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 446526 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for artists
-- ----------------------------
DROP TABLE IF EXISTS `artists`;
CREATE TABLE `artists` (
`ID` int(100) NOT NULL AUTO_INCREMENT,
`ARTIST_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`ARTIST_ID` int(100) NULL DEFAULT NULL,
`ARTIST_SORT_ID` int(100) NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 33870 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for comments
-- ----------------------------
DROP TABLE IF EXISTS `comments`;
CREATE TABLE `comments` (
`ID` int(100) NOT NULL AUTO_INCREMENT,
`MUSIC_ID` int(100) NULL DEFAULT NULL,
`COMMENTS` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`USER_ID` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`NICK_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`DETAILS` int(100) NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 52092 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for musics
-- ----------------------------
DROP TABLE IF EXISTS `musics`;
CREATE TABLE `musics` (
`MUSIC_ID` int(20) NOT NULL,
`MUSIC_NAME` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`ID` int(20) NOT NULL AUTO_INCREMENT,
`ALBUM_ID` int(11) NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 19661 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
