PUBG吃鸡排名预测案例
因数据集过大不予演示
大致流程
- 数据清洗
- 模型构建
- 模型评估
案例分析:
- 获取数据
- 数据处理
  去除 NA;去除开挂人群(无武器击杀、击杀过多、爆头率过高)
- 确认 x:数据集的特征值;y:数据集的目标值(标签)
- 实例化估计器,进行训练
- 进行predict预测,计算score,mae
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
# 获取数据
from sklearn.preprocessing import StandardScaler
# Read the training CSV into a DataFrame and print its first rows as a quick
# sanity check of the columns.
file_path = "./data/train_V2.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))
# Basic data cleaning
# Drop every row that contains at least one missing value.
# `how` must be the string "any"; passing the builtin `any` makes pandas
# raise "invalid how option".
df_new = df.dropna(axis=0, how="any")
# Filter out rows that look like cheaters. Three heuristics are applied in
# sequence; each step drops rows from the result of the previous one.
df_part = df_new

# 1) Kills without ever acquiring a weapon.
# Work on a copy so the helper column added below does not modify df_new.
df_part_distance = df_part.copy()
df_part_distance["weapon"] = (
    (df_part_distance["kills"] > 0) & (df_part_distance["weaponsAcquired"] == 0)
)
df_part_weapon0 = df_part_distance.drop(
    df_part_distance[df_part_distance["weapon"]].index
)

# 2) Too many kills (more than 30 in one match).
# The index must come from df_part_weapon0 itself: labels taken from the
# unfiltered frame may already have been dropped in step 1, which would
# raise KeyError.
df_part_kills = df_part_weapon0.drop(
    df_part_weapon0[df_part_weapon0["kills"] > 30].index
)

# 3) Headshot rate too high (100% headshots with at least 9 kills).
# NOTE(review): the column is assumed to be "headshotKills", as in the
# public PUBG train_V2 schema - confirm against the CSV header.
df_part_kills["head_rate"] = df_part_kills["headshotKills"] / df_part_kills["kills"]
# Rows with 0 kills produce 0/0 = NaN; treat them as a 0 headshot rate.
df_part_kills["head_rate"] = df_part_kills["head_rate"].fillna(0)
df_part_head = df_part_kills.drop(
    df_part_kills[
        (df_part_kills["head_rate"] == 1) & (df_part_kills["kills"] >= 9)
    ].index
)
# Keep only the columns used as model features.
df_select = df_part_head[
    [
        "assists",
        "boosts",
        "DBNOs",
        "heals",
        "kills",
        "matchDuration",
        "rankPoints",
        "revives",
        "rideDistance",
        "teamKills",
        "vehicleDestroys",
        "walkDistance",
        "winPoints",
    ]
]
# Feature matrix x and target y (kept as a one-column DataFrame).
x = df_select
y = df_part_head[["winPlacePerc"]]
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. A DataFrame has no .data/.target attributes (those belong to
# sklearn's bundled datasets), so x and y are passed directly.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=22, test_size=0.2
)
# Standardise the features.
transfer = StandardScaler()
# Fit the scaler on the training data only ...
x_train = transfer.fit_transform(x_train)
# ... and reuse those statistics for the test data. Calling fit_transform
# here would re-fit on the test set and scale it inconsistently with the
# data the model is trained on.
x_test = transfer.transform(x_test)
# Instantiate the estimator and train it.
# The `normalize` argument is intentionally not passed: it was removed from
# Ridge in scikit-learn 1.2 (passing it raises TypeError there), and leaving
# it out matches the previous normalize=False behaviour.
estimator = Ridge(
    alpha=1.0,
    fit_intercept=True,
    copy_X=True,
    max_iter=None,
    tol=1e-3,
    solver="auto",
    random_state=None,
)
estimator.fit(x_train, y_train)
# Predictions for the held-out rows
y_pre = estimator.predict(x_test)
# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy.
score = estimator.score(x_test, y_test)
# MAE (Mean Absolute Error): the mean of the absolute differences between
# the predicted and the true values.
mae = mean_absolute_error(y_test, y_pre)