爬取所有的歌手信息(artists.py)
"""
获取所有的歌手信息
"""
import requests
from bs4 import BeautifulSoup
import threading
from queue import Queue
from music_163 import sql
class Producer_artists(threading.Thread):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
'origin': 'https://music.163.com',
'referer': 'https://music.163.com/'
}
def __init__(self, artists_sort_queue, *args, **kwargs):
super(Producer_artists, self).__init__(*args, **kwargs)
self.artists_sort_queue = artists_sort_queue
self.db = sql.Db('localhost', 'root', 'netis', 'test')
# self.zhangjie_queue = zhangjie_queue
def run(self):
while True:
if self.artists_sort_queue.empty():
print("所有歌手信息都已生产完成,生产者停止")
break
group_id, initial = self.artists_sort_queue.get()
self.save_artist(group_id, initial)
def save_artist(self, group_id, initial):
# group_id = 1001 华语男歌手
# initial 69-95
db = self.db.get_instance()
cursor = db.cursor()
params = {'id': group_id, 'initial': initial}
r = requests.get('http://music.163.com/discover/artist/cat', headers=self.headers, params=params,
)
# 网页解析
soup = BeautifulSoup(r.content.decode(), 'lxml')
artists = soup.find_all('a', class_='nm nm-icn f-thide s-fc0')
for artist in artists:
artist_id = artist['href'].replace('/artist?id=', '').strip()
artist_name = artist['title'].replace('的音乐', '')
try:
sqll = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"
cursor.execute(sqll, (artist_id, artist_name, group_id))
db.commit()
except Exception as e:
# 打印错误日志
print(e)
if __name__ == '__main__':
artists_sort_Queue = Queue(maxsize=0)
urls = ['1001', '1002', '1003', '2001', '2002', '2003', '6001', '6002', '6003', '7001', '7002', '7003', '4001',
'4002', '4003']
for index in urls:
for i in range(65, 91):
qu = index, i
artists_sort_Queue.put(qu)
for i in range(50):
mulu = Producer_artists(artists_sort_Queue)
mulu.start()
根据歌手信息爬取所有的专辑信息(album_by _artist.py)
"""
根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID
"""
import requests
from bs4 import BeautifulSoup
import time
from music_163 import sql
import threading
from queue import Queue
class Producer_Album(threading.Thread):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
'origin': 'https://music.163.com',
'referer': 'https://music.163.com/'
}
def __init__(self, album_queue, *args, **kwargs):
super(Producer_Album, self).__init__(*args, **kwargs)
self.album_queue = album_queue
self.db = sql.Db('localhost', 'root', 'netis', 'test')
# self.zhangjie_queue = zhangjie_queue
def run(self):
while True:
if self.album_queue.empty():
print("所有歌手信息都已生产完成,生产者停止")
break
artist_id = self.album_queue.get()
self.save_albums(artist_id)
def save_albums(self, artist_id):
db = self.db.get_instance()
cursor = db.cursor()
params = {'id': artist_id, 'limit': '200'}
# 获取歌手个人主页
# 'https://music.163.com/#/artist/album?id=12429072'
r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params)
# proxies=get_random_proxy()
# 网页解析
soup = BeautifulSoup(r.content.decode(), 'lxml')
# 获取所有专辑
ul = soup.find_all('a', class_="tit s-fc0")
times = [tt.get_text() for tt in soup.find_all('span', class_="s-fc3")]
album_ids = [albums['href'].replace('/album?id=', '') for albums in ul]
album_names = [albums.get_text() for albums in ul]
albums = (zip(album_ids, album_names, times))
for albume_id, album_name, album_time in albums:
print(albume_id, album_name, album_time)
try:
sqll = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`, `ALBUM_NAME`, `ALBUM_TIME`) VALUES (%s,%s,%s,%s)"
cursor.execute(sqll, (albume_id, artist_id, album_name, album_time))
db.commit()
except Exception as e:
# 打印错误日志
print(str(albume_id) + ': ' + str(e))
if __name__ == '__main__':
album_Queue = Queue(maxsize=0)
artists = sql.get_all_artist()
for artist_ids in artists:
album_Queue.put(artist_ids['ARTIST_ID'])
for i in range(200):
mulu = Producer_Album(album_Queue)
mulu.start()
根据专辑信息爬取所有的歌曲信息(music_by _album.py)
"""
根据专辑 ID 获取到所有的音乐 ID
"""
import requests
from bs4 import BeautifulSoup
from music_163 import sql
import threading
from queue import Queue
class Producer_Music(threading.Thread):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
'origin': 'https://music.163.com',
'referer': 'https://music.163.com/'
}
def __init__(self, music_queue, *args, **kwargs):
super(Producer_Music, self).__init__(*args, **kwargs)
self.music_queue = music_queue
self.db = sql.Db('localhost', 'root', 'netis', 'test')
def run(self):
while True:
if self.music_queue.empty():
print("所有音乐信息都已生产完成,生产者停止")
break
album_id = self.music_queue.get()
self.save_music(album_id)
def save_music(self, album_id):
db = self.db.get_instance()
cursor = db.cursor()
params = {'id': album_id}
# 获取专辑对应的页面
r = requests.get('http://music.163.com/album', headers=self.headers, params=params)
# 网页解析
soup = BeautifulSoup(r.content.decode(), 'html.parser')
body = soup.body
musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐
for music in musics:
music = music.find('a')
music_id = music['href'].replace('/song?id=', '')
music_name = music.getText()
print(music_id, music_name, album_id)
try:
sqll = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)"
cursor.execute(sqll, (music_id, music_name, album_id))
db.commit()
except Exception as e:
# 打印错误日志
print(str(album_id) + ': ' + str(e))
if __name__ == '__main__':
music_Queue = Queue(maxsize=0)
albums = sql.get_all_album()
for album_ids in albums:
music_Queue.put(album_ids[0])
for i in range(400):
mulu = Producer_Music(music_Queue)
mulu.start()
根据歌曲信息爬取其评论条数(comments_by _music.py)
"""
根据歌曲 ID 获得所有的歌曲所对应的评论信息
"""
import requests
from music_163 import sql
import time
import threading
import pymysql.cursors
from jsonpath import jsonpath
import pprint
import math
from queue import Queue
import re
from itertools import chain
class Producer_urls(threading.Thread):
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive',
'Host': 'music.163.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}
def __init__(self, ids_queue, music_queue, *args, **kwargs):
super(Producer_urls, self).__init__(*args, **kwargs)
self.ids_queue = ids_queue
self.music_queue = music_queue
def run(self):
while True:
if self.ids_queue.empty():
print("所有url都已生产完成,生产者停止")
break
music_ids = self.ids_queue.get()
# print(music_ids)
self.producer_urls(music_ids)
def producer_urls(self, music_ids):
index_url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_{}?offset=0&limit=20'.format(music_ids)
req = requests.get(url=index_url, headers=self.headers).json()
total = math.floor(jsonpath(req, '$..total')[0] / 20)
for ind in range(0, total):
req_urls = "http://music.163.com/api/v1/resource/comments/R_SO_4_{}?offset={}&limit=20".format(
music_ids,
(ind * 20))
self.music_queue.put(req_urls)
class Producer_Comments(threading.Thread):
url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_862102137?csrf_token=' # 歌评url
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive',
'Host': 'music.163.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}
def __init__(self, ids_queue, music_queue, *args, **kwargs):
super(Producer_Comments, self).__init__(*args, **kwargs)
self.ids_queue = ids_queue
self.music_queue = music_queue
self.db = sql.Db('localhost', 'root', 'netis', 'test')
def run(self):
while True:
if self.ids_queue.empty() and self.music_queue.empty():
print("所有歌手信息都已生产完成,生产者停止")
break
req_url = self.music_queue.get()
try:
self.get_comments(req_url)
except Exception as e:
print(e)
def get_comments(self, req_url):
db = self.db.get_instance()
cursor = db.cursor()
r = requests.get(url=req_url, headers=self.headers).json()
music_ids = re.search(r'.*R_SO_4_(\d*)\?.*', req_url).group(1)
userIds = jsonpath(r, "$['comments'][*]['user']['userId']")
if userIds:
nicknames = jsonpath(r, "$['comments'][*]['user']['nickname']")
# avatarUrls = jsonpath(r, '$..avatarUrl')
contents = jsonpath(r, "$['comments'][*]['content']")
likedCoutns = jsonpath(r, "$['comments'][*]['likedCount']")
ll = zip(userIds, nicknames, contents, likedCoutns)
for userId, nickname, content, likedCount in ll:
try:
sqll = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `USER_ID`, `NICK_NAME` ,`DETAILS`) VALUES (%s,%s,%s,%s,%s)"
cursor.execute(sqll, (music_ids, content, userId, nickname, likedCount))
db.commit()
except Exception as e:
# 打印错误日志
print(str(req_url) + ': ' + str(e))
else:
print(req_url + "不存在")
if __name__ == '__main__':
music_Queue = Queue(maxsize=0)
ids_Queue = Queue(maxsize=0)
musics_id = sql.get_all_music()
for music_id in musics_id:
ids_Queue.put(music_id[0])
# print(ids_Queue.get())
for i in range(10):
mulu = Producer_urls(ids_Queue, music_Queue)
mulu.start()
for i in range(10):
mulu = Producer_Comments(ids_Queue, music_Queue)
mulu.start()
数据库相关的语句(sql.py)
"""
一般 Python 用于连接 MySQL 的工具:pymysql
"""
import pymysql.cursors
import threading
class Db(object):
def __init__(self, host=None, username=None, password=None, dbname=None):
self.pool = {}
self.host = host
self.username = username
self.password = password
self.dbname = dbname
self.charset = 'utf8mb4'
self.cursorclass = pymysql.cursors.DictCursor
def get_instance(self, ):
name = threading.current_thread().name
if name not in self.pool:
conn = pymysql.connect(self.host, self.username, self.password, self.dbname)
self.pool[name] = conn
return self.pool[name]
def insert_artist(self, artist_id, artist_name, group_id, dbb):
with self.get_instance() as db:
sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"
db.execute(sql, (artist_id, artist_name, group_id))
dbb.commit()
connection = pymysql.connect(host='localhost',
user='root',
password='netis',
db='test',
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
connection0 = pymysql.connect(host='localhost',
user='root',
password='netis',
db='test',
charset='utf8mb4',
cursorclass=pymysql.cursors.SSCursor)
# 保存评论
def insert_comments(self, music_id, comments, detail, connection0):
with connection0.cursor() as cursor:
sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)"
cursor.execute(sql, (music_id, comments, detail))
connection0.commit()
# 保存音乐
def insert_music(music_id, music_name, album_id):
with connection.cursor() as cursor:
sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)"
cursor.execute(sql, (music_id, music_name, album_id))
connection.commit()
# 保存专辑
def insert_album(album_id, artist_id):
with connection.cursor() as cursor:
sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)"
cursor.execute(sql, (album_id, artist_id))
connection.commit()
# 保存歌手
def insert_artist(self, artist_id, artist_name, group_id):
with self.connection.cursor() as cursor:
sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME` , `ARTIST_SORT_ID`) VALUES (%s, %s, %s)"
cursor.execute(sql, (artist_id, artist_name, group_id))
connection.commit()
# 获取所有歌手的 ID
def get_all_artist():
with connection.cursor() as cursor:
sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID"
cursor.execute(sql, ())
return cursor.fetchall()
# 获取所有专辑的 ID
def get_all_album():
with connection0.cursor() as cursor:
sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID"
cursor.execute(sql, ())
return cursor.fetchall()
# 获取所有音乐的 ID
def get_all_music():
with connection0.cursor() as cursor:
sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID"
cursor.execute(sql, ())
return cursor.fetchall()
# 获取前一半音乐的 ID
def get_before_music():
with connection.cursor() as cursor:
sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000"
cursor.execute(sql, ())
return cursor.fetchall()
# 获取后一半音乐的 ID
def get_after_music():
with connection.cursor() as cursor:
sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429"
cursor.execute(sql, ())
return cursor.fetchall()
def dis_connect():
connection.close()
mysql.sql
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for albums
-- ----------------------------
DROP TABLE IF EXISTS `albums`;
CREATE TABLE `albums` (
`ID` int(11) NOT NULL AUTO_INCREMENT,
`ALBUM_ID` int(11) NOT NULL,
`ARTIST_ID` int(11) NULL DEFAULT NULL,
`ALBUM_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`ALBUM_TIME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 446526 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for artists
-- ----------------------------
DROP TABLE IF EXISTS `artists`;
CREATE TABLE `artists` (
`ID` int(100) NOT NULL AUTO_INCREMENT,
`ARTIST_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`ARTIST_ID` int(100) NULL DEFAULT NULL,
`ARTIST_SORT_ID` int(100) NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 33870 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for comments
-- ----------------------------
DROP TABLE IF EXISTS `comments`;
CREATE TABLE `comments` (
`ID` int(100) NOT NULL AUTO_INCREMENT,
`MUSIC_ID` int(100) NULL DEFAULT NULL,
`COMMENTS` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`USER_ID` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`NICK_NAME` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`DETAILS` int(100) NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 52092 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for musics
-- ----------------------------
DROP TABLE IF EXISTS `musics`;
CREATE TABLE `musics` (
`MUSIC_ID` int(20) NOT NULL,
`MUSIC_NAME` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
`ID` int(20) NOT NULL AUTO_INCREMENT,
`ALBUM_ID` int(11) NULL DEFAULT NULL,
PRIMARY KEY (`ID`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 19661 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;