import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error  # MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
import seaborn as sns

# Plot the distribution of the '年龄' (age) column of `patients` using 5 bins.
# The call used to sit behind an inline comment on the same physical line and
# therefore never executed.
# NOTE: seaborn deprecated `distplot` in v0.11 in favour of `histplot` /
# `displot`; the original call is kept so the figure is unchanged on the
# currently installed version. Migrate when seaborn is upgraded.
sns.distplot(patients['年龄'], bins=5)
# 独热编码 (one-hot encoding)
def get(data: pd.DataFrame, name: str) -> pd.DataFrame:
    """Return *data* with one-hot indicator columns for column *name* appended.

    The indicator columns are produced by ``pd.get_dummies`` and are named
    ``"<name>_<category>"``. The source column itself is kept, and the input
    frame is not modified; a new frame is returned.

    Args:
        data: Frame containing the categorical column.
        name: Label of the column to encode; also used as the prefix of the
            generated indicator columns.

    Returns:
        A new frame holding every original column followed by the indicators.

    Raises:
        KeyError: If *name* is not a column of *data*.
    """
    indicators = pd.get_dummies(data[name], prefix=name)
    return pd.concat((data, indicators), axis=1)
# 生成特征部分和标签部分 (build the feature matrix and the label vector)
# Feature columns: every column of `patients` except '病人ID' (patient ID)
# and '生存时长' (survival duration).
x_col = [col for col in patients.columns if col not in ['病人ID', '生存时长']]

# Feature matrix. This assignment used to be hidden behind an inline comment
# on the same physical line, which left `X` undefined.
X = patients[x_col]
X  # bare expression: shows the frame when run as a notebook cell

# NOTE(review): the label assignment below was already commented out in the
# original cell. The later train/test split needs `y`, so it is presumably
# defined elsewhere -- confirm, or re-enable this line.
# y = patients['生存时长']
# 测试集和训练集 (test set and training set)
# Hold out 25% of the samples for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit a linear regression on the training portion only. These statements used
# to be swallowed by an inline comment on the same physical line, so `lr` was
# never created.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Show the fitted coefficients.
print(lr.coef_)
# 预测 (prediction)
# Predict the target for the held-out samples with the fitted model.
y_pred = lr.predict(X_test)

# Bare expressions kept from the notebook cell; when run as a cell only the
# last one (`y_pred`) is displayed.
y_test
y_pred
# 检验结果 (evaluate the results)
# Root-mean-squared error of the held-out predictions: the square root of the
# mean squared difference between `y_test` and `y_pred`.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
# 模型性能报告 (model performance report)
# Report regression metrics for the held-out set.
# The original call, `classification_report(y_test, predict)`, could not run:
# `classification_report` is never imported, `predict` is not defined (the
# predictions live in `y_pred`), and a classification metric cannot score the
# continuous output of `lr.predict`.
print(f"RMSE: {RMSE:.4f}")
print(f"R^2:  {r2_score(y_test, y_pred):.4f}")
