官网github地址:https://github.com/NicolasHug/Surprise
我的实践代码:https://github.com/SeafyLiang/machine_learning_study/blob/master/实践项目/协同过滤/userCF_itemCF.py

  1. pip3 install scikit-surprise -i https://pypi.tuna.tsinghua.edu.cn/simple

以KNN实现UserCF和ItemCF

  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. """
  4. @File : userCF_itemCF.py
  5. @Modify Time @Author @Version @Description
  6. ------------ ------- -------- -----------
  7. 2022/3/29 21:51 SeafyLiang 1.0 surprise实现UserCF和ItemCF
  8. """
  9. import pandas as pd
  10. from surprise import Reader
  11. from surprise import Dataset
  12. from surprise import KNNBasic
  13. # 读取用户评价矩阵
  14. ratings = pd.read_csv('u.data', sep='\t', names=['UserID', 'ItemID', 'rating', 'timestamp'])
  15. # df转DataSet
  16. reader = Reader(rating_scale=(1, 5))
  17. ratingDataSet = Dataset.load_from_df(ratings[['UserID', 'ItemID', 'rating']], reader)
  18. """
  19. Args:
  20. k(int): The (max) number of neighbors to take into account for
  21. aggregation (see :ref:`this note <actual_k_note>`). Default is
  22. ``40``.
  23. min_k(int): The minimum number of neighbors to take into account for
  24. aggregation. If there are not enough neighbors, the prediction is
  25. set to the global mean of all ratings. Default is ``1``.
  26. sim_options(dict): A dictionary of options for the similarity
  27. measure. See :ref:`similarity_measures_configuration` for accepted
  28. options.
  29. verbose(bool): Whether to print trace messages of bias estimation,
  30. similarity, etc. Default is True.
  31. """
  32. # 使用knn实现UserCF
  33. def userCF_method():
  34. userCF = KNNBasic(
  35. k=40, min_k=3,
  36. sim_options={'user_based': True}
  37. )
  38. # 训练userCF
  39. userCF.fit(
  40. ratingDataSet.build_full_trainset()
  41. )
  42. # 预测
  43. # 目标用户id
  44. uid = 196
  45. # 该用户看过的所有电影ID
  46. watchedItemIDs = ratings[ratings['UserID'] == uid]['ItemID'].drop_duplicates().values
  47. # 所有电影ID
  48. allItemIDs = ratings['ItemID'].drop_duplicates().values
  49. # 保存用户和电影之间的评分
  50. userCF_itemIDs = []
  51. userCF_ratings = []
  52. # 遍历所有电影,拿到每部电影的ID
  53. for itemID in allItemIDs:
  54. # 如果还没看过这部电影
  55. if itemID not in watchedItemIDs:
  56. userCF_itemIDs.append(itemID)
  57. # 调用userCF模型的预测方法,预测用户对电影的评分
  58. userCF_ratings.append(userCF.predict(uid, itemID).est)
  59. # 结果转df
  60. result = pd.DataFrame({
  61. 'userCF_itemID': userCF_itemIDs,
  62. 'userCF_rating': userCF_ratings
  63. })
  64. # 结果按评分倒序排序
  65. result.sort_values(by='userCF_rating', inplace=True, ascending=False)
  66. print(result)
  67. """
  68. userCF_itemID userCF_rating
  69. 1000 1189 5.000000
  70. 1397 1293 5.000000
  71. 232 64 4.643135
  72. 1200 1367 4.578842
  73. 1454 1191 4.567010
  74. ... ... ...
  75. 1487 1408 1.000000
  76. 1266 1432 1.000000
  77. 930 777 1.000000
  78. 1134 437 1.000000
  79. 873 314 1.000000
  80. """
  81. # 使用knn实现ItemCF
  82. def itemCF_method():
  83. itemCF = KNNBasic(
  84. k=40, min_k=3,
  85. sim_options={'user_based': False}
  86. )
  87. itemCF.fit(ratingDataSet.build_full_trainset())
  88. # 目标物品ID
  89. itemID = 110
  90. # DataSet会对数据集中的物品ID重新编码,需要先找出模型中的id,inner_id
  91. item_inner_id = itemCF.trainset.to_inner_iid(itemID)
  92. # 使用inner_id进行相似物品计算,找出与该物品最接近的10个物品
  93. item_inner_neighbors = itemCF.get_neighbors(item_inner_id, k=10)
  94. # 把inner_id转换为数据集中的ID
  95. itemID_neighbors = [
  96. itemCF.trainset.to_raw_iid(inner_id)
  97. for inner_id in item_inner_neighbors
  98. ]
  99. print(itemID_neighbors)
  100. """
  101. [979, 919, 1211, 339, 872, 695, 903, 1115, 960, 869]
  102. """
  103. if __name__ == '__main__':
  104. userCF_method()
  105. itemCF_method()