官方 GitHub 仓库地址:https://github.com/NicolasHug/Surprise
我的实践代码:https://github.com/SeafyLiang/machine_learning_study/blob/master/实践项目/协同过滤/userCF_itemCF.py
pip3 install scikit-surprise -i https://pypi.tuna.tsinghua.edu.cn/simple
以KNN实现UserCF和ItemCF
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    :   userCF_itemCF.py
@Modify Time      @Author       @Version    @Description
------------      -------       --------    -----------
2022/3/29 21:51   SeafyLiang    1.0         UserCF and ItemCF implemented with surprise
"""
import pandas as pd
from surprise import Dataset
from surprise import KNNBasic
from surprise import Reader

# Load the user rating records: a tab-separated file without a header row,
# one (user, item, rating, timestamp) record per line.
# NOTE(review): presumably the MovieLens 100K `u.data` file -- confirm.
ratings = pd.read_csv('u.data', sep='\t', names=['UserID', 'ItemID', 'rating', 'timestamp'])

# Convert the DataFrame into a surprise Dataset; ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
ratingDataSet = Dataset.load_from_df(ratings[['UserID', 'ItemID', 'rating']], reader)

# KNNBasic arguments (quoted from the surprise documentation):
#   k (int): The (max) number of neighbors to take into account for
#       aggregation. Default is 40.
#   min_k (int): The minimum number of neighbors to take into account for
#       aggregation. If there are not enough neighbors, the prediction is
#       set to the global mean of all ratings. Default is 1.
#   sim_options (dict): A dictionary of options for the similarity measure.
#   verbose (bool): Whether to print trace messages of bias estimation,
#       similarity, etc. Default is True.


# UserCF implemented with KNN
def userCF_method(uid=196):
    """Print predicted ratings for every item the user has not rated yet.

    A user-based ``KNNBasic`` model is trained on the full rating set, then
    asked to estimate the rating of each item that does not appear in the
    user's rating history. The estimates are printed as a DataFrame sorted
    from highest to lowest predicted rating.

    Args:
        uid: Raw user id (as it appears in the ``UserID`` column) to
            recommend for. Defaults to ``196``.
    """
    userCF = KNNBasic(k=40, min_k=3, sim_options={'user_based': True})
    # Train on every available rating.
    userCF.fit(ratingDataSet.build_full_trainset())

    # Items the user has already rated. A set gives O(1) membership tests,
    # instead of scanning an array once per candidate item.
    watchedItemIDs = set(ratings.loc[ratings['UserID'] == uid, 'ItemID'])
    # Every distinct item id, in order of first appearance.
    allItemIDs = ratings['ItemID'].drop_duplicates().values

    # Keep only unseen items, then predict the user's rating for each one.
    userCF_itemIDs = [itemID for itemID in allItemIDs if itemID not in watchedItemIDs]
    userCF_ratings = [userCF.predict(uid, itemID).est for itemID in userCF_itemIDs]

    result = pd.DataFrame({'userCF_itemID': userCF_itemIDs,
                           'userCF_rating': userCF_ratings})
    # Highest predicted rating first.
    result = result.sort_values(by='userCF_rating', ascending=False)
    print(result)
    # Sample output for uid=196:
    #       userCF_itemID  userCF_rating
    # 1000           1189       5.000000
    # 1397           1293       5.000000
    # 232              64       4.643135
    # 1200           1367       4.578842
    # 1454           1191       4.567010
    # ...             ...            ...
    # 1487           1408       1.000000
    # 1266           1432       1.000000
    # 930             777       1.000000
    # 1134            437       1.000000
    # 873             314       1.000000


# ItemCF implemented with KNN
def itemCF_method(itemID=110, n_neighbors=10):
    """Print the raw ids of the items most similar to a given item.

    An item-based ``KNNBasic`` model is trained on the full rating set and
    queried for the nearest neighbours of ``itemID``.

    Args:
        itemID: Raw item id (as it appears in the ``ItemID`` column) whose
            neighbours are wanted. Defaults to ``110``.
        n_neighbors: Number of similar items to retrieve. Defaults to ``10``.

    Raises:
        ValueError: Propagated from ``trainset.to_inner_iid`` when ``itemID``
            is not part of the training data.
    """
    itemCF = KNNBasic(k=40, min_k=3, sim_options={'user_based': False})
    itemCF.fit(ratingDataSet.build_full_trainset())

    # The Dataset re-encodes item ids internally, so translate the raw id to
    # the model's inner id before querying neighbours.
    item_inner_id = itemCF.trainset.to_inner_iid(itemID)
    # Inner ids of the items closest to the target item.
    item_inner_neighbors = itemCF.get_neighbors(item_inner_id, k=n_neighbors)
    # Translate the inner ids back to the ids used in the data file.
    itemID_neighbors = [itemCF.trainset.to_raw_iid(inner_id)
                        for inner_id in item_inner_neighbors]
    print(itemID_neighbors)
    # Sample output for itemID=110, n_neighbors=10:
    # [979, 919, 1211, 339, 872, 695, 903, 1115, 960, 869]


if __name__ == '__main__':
    userCF_method()
    itemCF_method()
