使用比较简单,直接调用库就好了。
教程目录 - sklearn中文文档
sklearn 快速入门教程 - 郭峰g - 博客园
Python之Sklearn使用入门教程python脚本之家
代码 D:\00000py\1\mcm 2020b_q3_0008own.py
# 定义模型 训练
models_str = ['LinearRegression', # 普通最小二乘法 (线性回归)
'MLPRegressor', # 多层感知器(MLP) (神经网络) # 训练异常 可能本身有问题
'DecisionTree', # 决策树
'SVR', # 支持向量机
'GBDT', # 梯度提升决策树
'lightGBM', # 轻型梯度提升机
'RandomForest' # 随机森林
]
2020B 0027:XGBoost LightGBM 随机森林 SVM BP神经网络
2020B 0089:随机森林(用于确定重要性)
2020B 0036:随机森林(用于确定重要性/回归) GBR(GBDT)
2020B 0008:各模型对比 LR LightGBM GBDT SVR DT(DecisionTree) 神经网络
2020B 0116:随机森林 决策树 SVM GBDT
2020B 0031:。。。
随机森林
回归 预测
文章 D:\00000MCM\0 研究生数学建模竞赛历年真题和优秀论文集锦\研究生数学建模-优秀论文\2020年优秀论文\B题\B20102470089.pdf
回归预测
见后面代码
选择重要变量 重要性排序
importances = models[6].feature_importances_ # 获取重要性 选择定义的模型
print(importances)
print(sorted(importances, reverse=True)) # 排序
其他
回归 预测
'LinearRegression', # 普通最小二乘法 (线性回归)
'MLPRegressor', # 多层感知器(MLP) (神经网络) # 训练异常 可能本身有问题
'DecisionTree', # 决策树
'SVR', # 支持向量机
'GBDT', # 梯度提升决策树
'lightGBM', # 轻型梯度提升机
代码
D:\00000py\1\mcm 2020b_q3_0008own.py
"""Compare several regression models on spreadsheet data.

Reads samples from a hard-coded .xlsx file, min-max normalizes the
feature columns per column, trains each model on a chronological
70/30 train/test split, plots predictions against the original
target, prints RMSE / MAE / R2 / prediction-time for every model,
and finally prints the RandomForest feature importances.
"""
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score  # cross validation
from sklearn.datasets import load_iris
import time
import csv
import re
from openpyxl import load_workbook  # .xlsx files need a dedicated reader
import xlrd
import warnings

# silence library warnings
warnings.filterwarnings('ignore')

# render Chinese labels correctly in matplotlib
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
# render the minus sign correctly
from matplotlib import rcParams
rcParams['axes.unicode_minus'] = False


def normalization(data):
    """Min-max normalize over the WHOLE array at once.

    NOTE: normalization/normalization1 scale across all values together,
    which is usually not appropriate; normalization2 works per column.
    """
    _range = np.max(data) - np.min(data)
    return (data - np.min(data)) / _range


def standardization(data):
    """Z-score standardize each column (zero mean, unit variance)."""
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    return (data - mu) / sigma


def normalization1(data):
    """Scale the whole array into [-1, 1] by its max absolute value."""
    _range = np.max(abs(data))
    return data / _range


def normalization2(data):
    """Min-max normalize each column independently.

    Returns (normalized data, per-column ranges, per-column minimums).
    NOTE(review): a constant column yields range 0 and a divide-by-zero;
    the input is assumed to have no constant columns — confirm upstream.
    """
    minVals = data.min(0)
    maxVals = data.max(0)
    ranges = maxVals - minVals
    m = data.shape[0]
    normData = data - np.tile(minVals, (m, 1))
    normData = normData / np.tile(ranges, (m, 1))
    return normData, ranges, minVals


def draw(x_train_label, x_test_label, x_label, train_y, test_y, original_y,
         picture_path, model_name):
    """Plot train/test predictions against the original target and save.

    Parameters:
        x_train_label / x_test_label / x_label: x-axis sample indices.
        train_y / test_y: predicted values on the train / test split.
        original_y: the full original target series.
        picture_path: file path for the saved figure.
        model_name: figure title.
    """
    plt.title(model_name, fontsize=22)
    plt.xlabel('样本编号')
    plt.ylabel('RON 损失值')
    colors1 = '#FFA500'  # train curve: orange
    colors2 = '#FF4500'  # test curve: orange-red
    colors3 = '#1E90FF'  # original data: light blue
    plt.plot(x_train_label, train_y, c=colors1, linewidth=0.9, label='训练集')
    plt.plot(x_test_label, test_y, c=colors2, linewidth=0.9, label='测试集')
    plt.plot(x_label, original_y, c=colors3, linewidth=0.9, label='原始数据')
    plt.legend()
    # re-running may not refresh an already-open saved image
    plt.savefig(picture_path, dpi=300)
    plt.show()


def p_words(string):
    """Return the first decimal number (as text) found in *string*.

    Raises IndexError when the string contains no decimal number.
    """
    string_list = re.findall(r"\d+\.\d+", string)
    return string_list[0]


plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Read the spreadsheet into a 2-D ndarray, skipping the header row.
data = xlrd.open_workbook(r'D:\00000MCM\0 codes\2020B\q3forml.xlsx')
table = data.sheet_by_index(0)  # worksheet 1 (index 0)
all_data = [table.row_values(i) for i in range(1, table.nrows)]
all_data = np.array(all_data)

# Split columns: index / features / target.
data_label = all_data[:, 0]   # column 0: sample index (1..n), used for plots
target = all_data[:, -1]      # last column: regression target
# per-column min-max normalization of the feature columns
data, _, _ = normalization2(all_data[:, 1:-1])
print(data[0])
print(len(data[0]))
print(target[0])
print()

# Chronological 70/30 train/test split (no shuffling).
train_size = int(len(data) * 0.7)
x_train = data[:train_size]
x_test = data[train_size:]
y_train = target[:train_size]
y_test = target[train_size:]
x_train_label = data_label[:train_size]
x_test_label = data_label[train_size:]
x_label = data_label

# Model names (parallel to `models` below).
models_str = ['LinearRegression',  # ordinary least squares
              'MLPRegressor',      # multilayer perceptron; may train poorly
                                   # (target y is not normalized)
              'DecisionTree',
              'SVR',
              'GBDT',              # gradient-boosted decision trees
              'lightGBM',
              'RandomForest']
# NOTE(review): LinearRegression(normalize=True) was deprecated in
# scikit-learn 1.0 and removed in 1.2; this script targets an older
# sklearn — confirm the environment before upgrading.
models = [LinearRegression(normalize=True),
          MLPRegressor(alpha=0.01),
          DecisionTreeRegressor(),
          SVR(),
          GradientBoostingRegressor(),
          lgb.LGBMRegressor(objective='regression', num_leaves=31,
                            learning_rate=0.05, n_estimators=20),
          RandomForestRegressor()]

MSE_lists = []       # holds RMSE values (sqrt applied below) despite the name
MAE_lists = []
R2_lists = []
Run_Time_lists = []
cv_jiaocha_flag = False  # cross-validation switch (True/False)
n_folds = 5              # folds for cross-validation
Cross_val_lists = []

for name, model in zip(models_str, models):
    print('开始训练模型:' + name)
    model.fit(x_train, y_train)
    # NOTE(review): the timer starts AFTER fit(), so Run_Time measures
    # prediction time only, not training time.
    startTime = time.time()
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    stopTime = time.time()
    if cv_jiaocha_flag:
        Cross_val = cross_val_score(model, x_train, y_train, cv=n_folds)
    if name == 'LinearRegression':
        # report the fitted linear coefficients
        print('截距: ', end='')
        print(model.intercept_)
        print('回归系数: ', end='')
        print(model.coef_)
    # save the prediction plot per model
    save_path = '.\\datasets\\ml\\' + name + '.jpg'
    draw(x_train_label, x_test_label, x_label, y_train_pred, y_test_pred,
         target, save_path, name)
    # Metrics on the held-out test split; MSE here is actually RMSE.
    MSE = mean_squared_error(y_test, y_test_pred) ** 0.5
    MAE = mean_absolute_error(y_test, y_test_pred)
    R2 = r2_score(y_test, y_test_pred)
    Run_Time = stopTime - startTime
    MSE_lists.append(MSE)
    MAE_lists.append(MAE)
    R2_lists.append(R2)
    Run_Time_lists.append(Run_Time)
    if cv_jiaocha_flag:
        Cross_val_lists.append(Cross_val)
    print('The rmse of prediction is:', MSE)
    print('The mae of prediction is:', MAE)
    print('The r2 of prediction is:', R2)
    print('The Run_Time of prediction is:', Run_Time)
    print()

# Summary across all models.
print('models: ', end='')
print(models_str)
print('MSE_lists: ', end='')
print(MSE_lists)
print('MAE_lists: ', end='')
print(MAE_lists)
print('R2_lists: ', end='')
print(R2_lists)
print('Run_Time_lists: ', end='')
print(Run_Time_lists)
if cv_jiaocha_flag:
    print('Cross_val_lists: ', end='')
    print(Cross_val_lists)
print('finish')

# Feature importances from the fitted RandomForest (looked up by name
# instead of a hard-coded index).
importances = models[models_str.index('RandomForest')].feature_importances_
print(importances)
print(sorted(importances, reverse=True))
