1. import numpy as np
    2. import pandas as pd
    3. from sklearn.model_selection import train_test_split
    4. from sklearn.linear_model import LinearRegression
    5. from sklearn.linear_model import LassoCV
    6. from sklearn.linear_model import RidgeCV
    7. from sklearn.metrics import mean_squared_error #MSE
    8. import warnings
    9. warnings.filterwarnings("ignore")
    import seaborn as sns

    # Age distribution: 5-bin density histogram with a KDE curve overlaid.
    # `sns.distplot` is deprecated since seaborn 0.11; `histplot` with
    # `kde=True, stat="density"` is its documented replacement.
    # NOTE(review): `patients` is not defined in this section; presumably a
    # DataFrame loaded earlier. Confirm.
    sns.histplot(patients['年龄'], bins=5, kde=True, stat="density")

    独热编码

    def get(data, name):
        """Return ``data`` with one-hot columns for column ``name`` appended.

        ``data[name]`` is encoded with ``pd.get_dummies`` using ``name`` as
        the prefix, so the new columns are called ``<name>_<value>``. The
        source column is kept, and the input frame itself is not modified.

        Args:
            data: DataFrame containing the column ``name``.
            name: Label of the column to one-hot encode.

        Returns:
            A new DataFrame: the original columns followed by the dummies.
        """
        dummies = pd.get_dummies(data[name], prefix=name)
        return pd.concat((data, dummies), axis=1)

    生成特征部分和标签部分

    # Feature columns: every column except the patient ID and the target.
    x_col = [x for x in patients.columns if x not in ['病人ID', '生存时长']]
    # Feature matrix.
    X = patients[x_col]
    # Label vector (survival duration). It must be assigned here because the
    # train/test split that follows consumes `y`.
    y = patients['生存时长']

    测试集和训练集

    # Hold out 25% of the rows for testing; the fixed seed keeps the split
    # reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    # Fit a linear regression model on the training split.
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    # Show the fitted model coefficients.
    print(lr.coef_)

    预测

    # Predict the target for the held-out feature rows.
    y_pred = lr.predict(X_test)
    # Notebook-style inspection of the actual and predicted values.
    y_test
    y_pred

    检验结果

    # Root-mean-squared error of the predictions on the test split.
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE:", RMSE)

    模型性能报告

    # The model is a regressor, so report regression metrics.
    # `classification_report` only applies to discrete class labels; it was
    # also not imported in the visible section, and its `predict` argument
    # was undefined (the predictions are stored in `y_pred`).
    # R^2 (coefficient of determination) on the test split.
    print("R^2:", lr.score(X_test, y_test))
    # Mean absolute error on the test split.
    print("MAE:", np.mean(np.abs(np.asarray(y_test) - np.asarray(y_pred))))