PUBG吃鸡排名预测案例

因数据集过大,此处不演示运行结果,仅给出完整流程与代码

大致流程

  1. 数据清洗
  2. 模型构建
  3. 模型评估

案例分析:

  1. 获取数据
  2. 数据处理
    去除na,去除开挂人群(无武器击杀,击杀过多,爆头率过高)
  3. 确定特征值与目标值
    x:数据集的特征值
    y:数据集的目标值(标签)
  4. 实例化估计器,进行训练
  5. 使用predict进行预测,计算score(决定系数R²)和mae(平均绝对误差)
  # PUBG final-placement regression: load the data, drop NA rows and suspected
  # cheaters, pick the feature columns, standardize, fit a Ridge model, and
  # report R^2 and mean absolute error on a held-out split.
  import numpy as np
  import pandas as pd
  from sklearn.linear_model import Ridge
  from sklearn.metrics import mean_absolute_error
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import StandardScaler

  # --- Load data ---
  file_path = "./data/train_V2.csv"
  df = pd.read_csv(file_path)
  print(df.head())

  # --- Basic cleaning ---
  # Drop every row that contains at least one NA. `how` must be the string
  # "any"; the builtin function `any` is not a valid option.
  df_new = df.dropna(axis=0, how="any")

  # --- Remove suspected cheaters ---
  df_part = df_new
  # Work on a copy so the helper column added below does not leak into df_new.
  df_part_distance = df_part.copy()

  # 1) Kills without ever picking up a weapon.
  df_part_distance["weapon"] = (
      (df_part_distance["kills"] > 0)
      & (df_part_distance["weaponsAcquired"] == 0)
  )
  df_part_weapon0 = df_part_distance.drop(
      df_part_distance[df_part_distance["weapon"]].index
  )

  # 2) Implausibly high kill count (> 30). The labels must come from the frame
  # being dropped from: labels that were already removed in step 1 would make
  # DataFrame.drop raise KeyError.
  df_part_kills = df_part_weapon0.drop(
      df_part_weapon0[df_part_weapon0["kills"] > 30].index
  )

  # 3) Implausibly high headshot rate: 100% headshots with at least 9 kills.
  # NOTE(review): the column is spelled "headshotKills" here to match the
  # camelCase naming of the other columns -- confirm against the CSV header.
  # Players with 0 kills produce 0/0 = NaN, which is treated as a rate of 0.
  df_part_kills["head_rate"] = (
      df_part_kills["headshotKills"] / df_part_kills["kills"]
  ).fillna(0)
  df_part_head = df_part_kills.drop(
      df_part_kills[
          (df_part_kills["head_rate"] == 1) & (df_part_kills["kills"] >= 9)
      ].index
  )

  # --- Select the feature columns ---
  df_select = df_part_head[
      [
          "assists",
          "boosts",
          "DBNOs",
          "heals",
          "kills",
          "matchDuration",
          "rankPoints",
          "revives",
          "rideDistance",
          "teamKills",
          "vehicleDestroys",
          "walkDistance",
          "winPoints",
      ]
  ]

  # --- Features x and target y ---
  x = df_select
  y = df_part_head[["winPlacePerc"]]
  x_train, x_test, y_train, y_test = train_test_split(
      x, y, random_state=22, test_size=0.2
  )

  # --- Standardize ---
  # Fit the scaler on the training split only and reuse its statistics for the
  # test split, so no information from the test data leaks into preprocessing.
  transfer = StandardScaler()
  x_train = transfer.fit_transform(x_train)
  x_test = transfer.transform(x_test)

  # --- Instantiate the estimator and train ---
  # The `normalize` argument is omitted: it was removed from Ridge in
  # scikit-learn 1.2, and the features are already standardized above.
  estimator = Ridge(
      alpha=1.0,
      fit_intercept=True,
      copy_X=True,
      max_iter=None,
      tol=1e-3,
      solver="auto",
      random_state=None,
  )
  estimator.fit(x_train, y_train)

  # --- Predict ---
  y_pre = estimator.predict(x_test)

  # --- Evaluate ---
  # For a regressor, score() is the coefficient of determination R^2,
  # not a classification accuracy.
  score = estimator.score(x_test, y_test)
  # MAE (Mean Absolute Error): the mean of the absolute differences between
  # the predicted and the true values.
  mae = mean_absolute_error(y_test, y_pre)
  print("R^2 score:", score)
  print("MAE:", mae)