使用比较简单,直接调用库就好了。

教程目录 - sklearn中文文档
sklearn 快速入门教程 - 郭峰g - 博客园
Python之Sklearn使用入门教程python脚本之家

代码 D:\00000py\1\mcm 2020b_q3_0008own.py
# 定义模型 训练
models_str = ['LinearRegression', # 普通最小二乘法 (线性回归)
'MLPRegressor', # 多层感知器(MLP) (神经网络) # 训练异常 可能本身有问题
'DecisionTree', # 决策树
'SVR', # 支持向量机
'GBDT', # 梯度提升决策树
'lightGBM', # 轻型梯度提升机
'RandomForest' # 随机森林
]

2020B 0027:XGBoost LightGBM 随机森林 SVM BP神经网络
2020B 0089:随机森林(用于确定重要性)
2020B 0036:随机森林(用于确定重要性/回归) GBR(GBDT
2020B 0008:各模型对比 LR LightGBM GBDT SVR DT(DecisionTree) 神经网络
2020B 0116:随机森林 决策树 SVM GBDT
2020B 0031:。。。

随机森林

回归 预测

文章 D:\00000MCM\0 研究生数学建模竞赛历年真题和优秀论文集锦\研究生数学建模-优秀论文\2020年优秀论文\B题\B20102470089.pdf

回归预测
见后面代码

选择重要变量 重要性排序

  1. importances = models[6].feature_importances_ # 获取重要性 选择定义的模型
  2. print(importances)
  3. print(sorted(importances, reverse=True)) # 排序

其他

回归 预测

'LinearRegression', # 普通最小二乘法 (线性回归)
'MLPRegressor', # 多层感知器(MLP) (神经网络) # 训练异常 可能本身有问题
'DecisionTree', # 决策树
'SVR', # 支持向量机
'GBDT', # 梯度提升决策树
'lightGBM', # 轻型梯度提升机

代码

D:\00000py\1\mcm 2020b_q3_0008own.py

  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. import lightgbm as lgb
  4. from sklearn.linear_model import LinearRegression
  5. from sklearn.svm import SVR
  6. from sklearn.neural_network import MLPRegressor
  7. from sklearn.tree import DecisionTreeRegressor
  8. from sklearn.ensemble import RandomForestRegressor
  9. from sklearn.ensemble import GradientBoostingRegressor
  10. from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
  11. from sklearn.model_selection import cross_val_score # 交叉验证
  12. from sklearn.datasets import load_iris
  13. import time
  14. import csv
  15. import re
  16. from openpyxl import load_workbook # 读取xlsx 类型的文件需要专门的读取程序
  17. import xlrd
  18. import warnings
  19. # filter warnings
  20. warnings.filterwarnings('ignore')
  21. # 正常显示中文
  22. from pylab import mpl
  23. mpl.rcParams['font.sans-serif'] = ['SimHei']
  24. # 正常显示符号
  25. from matplotlib import rcParams
  26. rcParams['axes.unicode_minus']=False
  27. def normalization(data): # normalization 和normalization1 是对所有数据进行归一化,不太合理。而normalization2 可以对列进行归一化
  28. _range = np.max(data) - np.min(data)
  29. return (data - np.min(data)) / _range
  30. def standardization(data):
  31. mu = np.mean(data, axis=0)
  32. sigma = np.std(data, axis=0)
  33. return (data - mu) / sigma
  34. # 如果归一化后的范围是[-1, 1]的话,可以将normalization()函数改为:
  35. def normalization1(data):
  36. _range = np.max(abs(data))
  37. return data / _range
  38. def normalization2(data):
  39. minVals = data.min(0)
  40. maxVals = data.max(0)
  41. ranges = maxVals - minVals
  42. m = data.shape[0]
  43. normData = data - np.tile(minVals, (m, 1))
  44. normData = normData/np.tile(ranges, (m, 1))
  45. return normData, ranges, minVals
  46. def draw(x_train_label, x_test_label, x_label, train_y, test_y, original_y, picture_path, model_name):
  47. plt.title(model_name, fontsize=22)
  48. plt.xlabel('样本编号')
  49. plt.ylabel('RON 损失值')
  50. # plt.xlim(xmax=9, xmin=0)
  51. # plt.ylim(ymax=9, ymin=0)
  52. # 画两条(0-9)的坐标轴并设置轴标签x,y
  53. colors1 = '#FFA500' # 点的颜色 '#C0504D'ori '训练集' 橘黄
  54. colors2 = '#FF4500' #'#2894FF' '#00EEEE'ori '测试集' 橘红
  55. colors3 = '#1E90FF' # '#FF6600'ori '原始数据' 浅蓝
  56. # 画散点图
  57. area1 = np.pi * 2 ** 2 # 点面积
  58. area2 = np.pi * 3 ** 2 # 点面积
  59. area3 = np.pi * 4 ** 2 # 点面积
  60. # plt.scatter(x_train_label, train_y, marker='^', s=area2, c=colors1, alpha=1, label='训练集')
  61. # plt.scatter(x_test_label, test_y, marker='*', s=area3, c=colors2, alpha=1, label='测试集')
  62. # plt.scatter(x_label, original_y, marker='o', s=area1, c=colors3, alpha=1, label='原始数据')
  63. # # # plt.plot([0, 9.5], [9.5, 0], linewidth='0.5', color='#000000')
  64. # 画折线图
  65. plt.plot(x_train_label, train_y, c=colors1, linewidth=0.9, label='训练集')
  66. plt.plot(x_test_label, test_y, c=colors2, linewidth=0.9, label='测试集')
  67. plt.plot(x_label, original_y, c=colors3, linewidth=0.9, label='原始数据')
  68. plt.legend()
  69. plt.savefig(picture_path, dpi=300) # 重新运行 保存的图像可能不刷新
  70. plt.show()
  71. def p_words(string):
  72. string_list = re.findall(r"\d+\.\d+", string)
  73. return string_list[0]
  74. plt.rcParams['font.sans-serif'] = ['SimHei']
  75. plt.rcParams['axes.unicode_minus'] = False
  76. # matplotlib 画图中中文显示会有问题,需要这两行设置默认字体
  77. # 读取数据
  78. # all_data = []
  79. # with open("fina_30_samples.csv", "r", encoding="utf-8") as f: # 数据已放附录
  80. # # with open("325 个样本数据888.csv", "r", encoding="utf-8") as f:
  81. # # 遇到编码问题,可以考虑把xlsx 文件转化为utf-8 格式的csv 文件
  82. # f_csv = csv.reader(f)
  83. # for row in f_csv:
  84. # row = [float(p_words(item)) for item in row]
  85. # # 在进行正则匹配时,一定要先把csv 里面存储的数据转化为数值格式,并设置小数点
  86. # all_data.append(row)
  87. # 读取数据 转为ndarray
  88. data = xlrd.open_workbook(r'D:\00000MCM\0 codes\2020B\q3forml.xlsx') # datas q2xxhg
  89. table = data.sheet_by_index(0) #按索引获取工作表,0就是工作表1
  90. all_data = []
  91. for i in range(1, table.nrows): #table.nrows表示总行数 去除第一行
  92. line=table.row_values(i) #读取每行数据,保存在line里面,line是list
  93. all_data.append(line) #将line加入到resArray中,resArray是二维list
  94. all_data = np.array(all_data) #将resArray从二维list变成数组
  95. # 分割数据
  96. data_label = all_data[:, 0] # 第0列 数据标签 1-n 用于画图
  97. target = all_data[:, -1] # 最后一列 目标值
  98. # target = normalization(all_data[:, -1])
  99. data, _, _ = normalization2(all_data[:, 1:-1]) # normalization 和normalization1 是对所有数据进行归一化,不太合理。而normalization2 可以对列进行归一化
  100. print(data[0])
  101. print(len(data[0]))
  102. print(target[0])
  103. print()
  104. # 分割训练集 测试集
  105. train_size = int(len(data)*0.7)
  106. x_train = data[:train_size]
  107. x_test = data[train_size:]
  108. y_train = target[:train_size]
  109. y_test = target[train_size:]
  110. x_train_label = data_label[:train_size]
  111. x_test_label = data_label[train_size:]
  112. # x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=20)
  113. # x_train_label, x_test_label = train_test_split(data_label, test_size=0.3, random_state=20)
  114. x_label = data_label
  115. # 定义模型 训练
  116. models_str = ['LinearRegression', # 普通最小二乘法 (线性回归)
  117. 'MLPRegressor', # 多层感知器(MLP) (神经网络) # 训练异常 可能是输出y数据没有归一化 可能本身有问题
  118. 'DecisionTree', # 决策树
  119. 'SVR', # 支持向量机
  120. 'GBDT', # 梯度提升决策树
  121. 'lightGBM', # 轻型梯度提升机
  122. 'RandomForest' # 随机森林
  123. ]
  124. models = [LinearRegression(normalize=True),
  125. MLPRegressor(alpha=0.01), # MLPRegressor(hidden_layer_sizes=50, solver='sgd', alpha=0.0001, learning_rate_init=0.0003)
  126. DecisionTreeRegressor(),
  127. SVR(),
  128. GradientBoostingRegressor(),
  129. lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20),
  130. RandomForestRegressor()
  131. ]
  132. MSE_lists = []
  133. MAE_lists = []
  134. R2_lists = []
  135. Run_Time_lists = []
  136. cv_jiaocha_flag = False # 交叉验证开关 True False
  137. n_folds = 5 # 交叉验证
  138. Cross_val_lists = []
  139. for name, model in zip(models_str, models):
  140. print('开始训练模型:' + name)
  141. model = model # 建立模型
  142. model.fit(x_train, y_train) # 训练
  143. startTime = time.time()
  144. y_train_pred = model.predict(x_train) # 预测
  145. y_test_pred = model.predict(x_test) # 预测
  146. stopTime = time.time()
  147. if cv_jiaocha_flag: # 交叉验证
  148. Cross_val = cross_val_score(model, x_train, y_train, cv=n_folds) # 交叉验证
  149. if name == 'LinearRegression': # 输出线性回归系数
  150. print('截距: ',end=''); print(model.intercept_) #截距
  151. print('回归系数: ',end=''); print(model.coef_) #回归系数
  152. # 可视化 图片保存
  153. # save_path = ".\\\\画图数据\\" + name + ".tif"
  154. save_path = '.\\datasets\\ml\\' + name + '.jpg' # 重新运行 保存的图像可能不刷新
  155. draw(x_train_label, x_test_label, x_label, y_train_pred, y_test_pred, target, save_path, name)
  156. # 指标评估
  157. MSE = mean_squared_error(y_test, y_test_pred) ** 0.5
  158. MAE = mean_absolute_error(y_test, y_test_pred)
  159. R2 = r2_score(y_test, y_test_pred)
  160. Run_Time = stopTime - startTime
  161. MSE_lists.append(MSE)
  162. MAE_lists.append(MAE)
  163. R2_lists.append(R2)
  164. Run_Time_lists.append(Run_Time)
  165. if cv_jiaocha_flag: # 交叉验证
  166. Cross_val_lists.append(Cross_val)
  167. print('The rmse of prediction is:', MSE)
  168. print('The mae of prediction is:', MAE)
  169. print('The r2 of prediction is:', R2)
  170. print('The Run_Time of prediction is:', Run_Time)
  171. print()
  172. print('models: ', end='')
  173. print(models_str)
  174. print('MSE_lists: ', end='')
  175. print(MSE_lists)
  176. print('MAE_lists: ', end='')
  177. print(MAE_lists)
  178. print('R2_lists: ', end='')
  179. print(R2_lists)
  180. print('Run_Time_lists: ', end='')
  181. print(Run_Time_lists)
  182. if cv_jiaocha_flag: # 交叉验证
  183. print('Cross_val_lists: ', end='')
  184. print(Cross_val_lists)
  185. print('finish')
  186. importances = models[6].feature_importances_ # 获取重要性 选择定义的模型
  187. print(importances)
  188. print(sorted(importances, reverse=True)) # 排序