Usage is fairly simple: just call the library directly.
References:
教程目录 - sklearn中文文档 (tutorial index, sklearn Chinese documentation)
sklearn 快速入门教程 - 郭峰g - 博客园 (sklearn quick-start tutorial, cnblogs)
Python之Sklearn使用入门教程 - 脚本之家 (Sklearn getting-started tutorial)
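Every sklearn regressor follows the same construct / fit / predict pattern. A minimal sketch (the data here is a made-up toy example, not from the competition):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
X = np.random.rand(100, 5)  # 100 samples, 5 features (toy data)
y = X @ np.array([1.0, 2.0, 0.0, 0.0, 3.0])  # synthetic linear target
model = RandomForestRegressor()  # construct
model.fit(X, y)  # fit on the training data
print(model.predict(X[:3]))  # predict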
Code: D:\00000py\1\mcm 2020b_q3_0008own.py
# Define the models / training
models_str = ['LinearRegression',  # ordinary least squares (linear regression)
              'MLPRegressor',  # multi-layer perceptron (neural network); training can misbehave, see the note in the full code
              'DecisionTree',  # decision tree
              'SVR',  # support vector regression
              'GBDT',  # gradient boosted decision trees
              'lightGBM',  # LightGBM
              'RandomForest'  # random forest
              ]
2020B 0027: XGBoost, LightGBM, random forest, SVM, BP neural network
2020B 0089: random forest (used to rank feature importance)
2020B 0036: random forest (importance ranking / regression), GBR (GBDT)
2020B 0008: model comparison: LR, LightGBM, GBDT, SVR, DT (DecisionTree), neural network
2020B 0116: random forest, decision tree, SVM, GBDT
2020B 0031: ...
Random forest
Regression & prediction
Paper: D:\00000MCM\0 研究生数学建模竞赛历年真题和优秀论文集锦\研究生数学建模-优秀论文\2020年优秀论文\B题\B20102470089.pdf
Regression prediction: see the code later in these notes.
Selecting important variables / ranking by importance:
importances = models[6].feature_importances_  # importances from the chosen model (index 6 = RandomForest)
print(importances)
print(sorted(importances, reverse=True))  # sorted in descending order
Other
Regression & prediction
'LinearRegression' (ordinary least squares), 'MLPRegressor' (MLP neural network; training can misbehave), 'DecisionTree' (decision tree), 'SVR' (support vector regression), 'GBDT' (gradient boosted decision trees), 'lightGBM' (LightGBM)
Code
D:\00000py\1\mcm 2020b_q3_0008own.py
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.datasets import load_iris
import time
import csv
import re
from openpyxl import load_workbook  # reading .xlsx files requires a dedicated reader
import xlrd
import warnings
# filter warnings
warnings.filterwarnings('ignore')
# display Chinese characters correctly in plots
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
# display the minus sign correctly
from matplotlib import rcParams
rcParams['axes.unicode_minus'] = False
def normalization(data):  # normalization and normalization1 rescale over the whole array, which is questionable; normalization2 normalizes each column separately
_range = np.max(data) - np.min(data)
return (data - np.min(data)) / _range
def standardization(data):  # z-score standardization (per column)
mu = np.mean(data, axis=0)
sigma = np.std(data, axis=0)
return (data - mu) / sigma
# if the normalized range should be [-1, 1], normalization() can be changed to:
def normalization1(data):
_range = np.max(abs(data))
return data / _range
def normalization2(data):  # per-column min-max normalization; also returns the ranges and minima
minVals = data.min(0)
maxVals = data.max(0)
ranges = maxVals - minVals
m = data.shape[0]
normData = data - np.tile(minVals, (m, 1))
normData = normData/np.tile(ranges, (m, 1))
return normData, ranges, minVals
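# Aside (not in the original code): sklearn's MinMaxScaler does the same per-column
# min-max scaling as normalization2 above, e.g.:
# from sklearn.preprocessing import MinMaxScaler
# normData = MinMaxScaler().fit_transform(data)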
def draw(x_train_label, x_test_label, x_label, train_y, test_y, original_y, picture_path, model_name):
    plt.title(model_name, fontsize=22)
    plt.xlabel('样本编号')  # "sample index"
    plt.ylabel('RON 损失值')  # "RON loss"
    # plt.xlim(xmax=9, xmin=0)
    # plt.ylim(ymax=9, ymin=0)
    # draw two (0-9) axes and set the x and y axis labels
    colors1 = '#FFA500'  # training-set color, orange ('#C0504D' originally)
    colors2 = '#FF4500'  # test-set color, orange-red ('#2894FF' / '#00EEEE' originally)
    colors3 = '#1E90FF'  # original-data color, light blue ('#FF6600' originally)
    # scatter-plot version (disabled)
    area1 = np.pi * 2 ** 2  # marker area
    area2 = np.pi * 3 ** 2  # marker area
    area3 = np.pi * 4 ** 2  # marker area
    # plt.scatter(x_train_label, train_y, marker='^', s=area2, c=colors1, alpha=1, label='训练集')
    # plt.scatter(x_test_label, test_y, marker='*', s=area3, c=colors2, alpha=1, label='测试集')
    # plt.scatter(x_label, original_y, marker='o', s=area1, c=colors3, alpha=1, label='原始数据')
    # plt.plot([0, 9.5], [9.5, 0], linewidth='0.5', color='#000000')
    # line plot; legend labels: 训练集 = training set, 测试集 = test set, 原始数据 = original data
    plt.plot(x_train_label, train_y, c=colors1, linewidth=0.9, label='训练集')
    plt.plot(x_test_label, test_y, c=colors2, linewidth=0.9, label='测试集')
    plt.plot(x_label, original_y, c=colors3, linewidth=0.9, label='原始数据')
    plt.legend()
    plt.savefig(picture_path, dpi=300)  # on re-runs the saved image may not refresh
    plt.show()
def p_words(string):  # extract the first decimal number found in a string
    string_list = re.findall(r"\d+\.\d+", string)
    return string_list[0]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# matplotlib has trouble displaying Chinese; these two lines set a default font
# Read data (csv variant, kept for reference)
# all_data = []
# with open("fina_30_samples.csv", "r", encoding="utf-8") as f:  # the data is in the appendix
# # with open("325 个样本数据888.csv", "r", encoding="utf-8") as f:
# # if you hit encoding problems, convert the xlsx file to a utf-8 csv
#     f_csv = csv.reader(f)
#     for row in f_csv:
#         row = [float(p_words(item)) for item in row]
#         # for the regex matching, the values stored in the csv must first be in numeric format with a decimal point
#         all_data.append(row)
# Read data and convert to an ndarray
data = xlrd.open_workbook(r'D:\00000MCM\0 codes\2020B\q3forml.xlsx')  # datas q2xxhg
table = data.sheet_by_index(0)  # get a worksheet by index; 0 is the first worksheet
all_data = []
for i in range(1, table.nrows):  # table.nrows is the total row count; start at 1 to skip the header row
    line = table.row_values(i)  # read each row into line, a list
    all_data.append(line)  # append line to all_data, a 2-D list
all_data = np.array(all_data)  # turn the 2-D list into an ndarray
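# Version note (an assumption about the environment): xlrd >= 2.0 dropped .xlsx support,
# so the open_workbook call above needs xlrd 1.2.x; alternatively, the already-imported
# openpyxl can read the same sheet:
# wb = load_workbook(r'D:\00000MCM\0 codes\2020B\q3forml.xlsx', data_only=True)
# ws = wb.worksheets[0]
# all_data = np.array([list(r) for r in ws.iter_rows(min_row=2, values_only=True)], dtype=float)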
# Split the data
data_label = all_data[:, 0]  # column 0: sample labels 1..n, used for plotting
target = all_data[:, -1]  # last column: the target values
# target = normalization(all_data[:, -1])
data, _, _ = normalization2(all_data[:, 1:-1])  # normalization/normalization1 rescale globally; normalization2 works per column
print(data[0])
print(len(data[0]))
print(target[0])
print()
# Split into training and test sets (sequential 70% / 30%, no shuffling)
train_size = int(len(data)*0.7)
x_train = data[:train_size]
x_test = data[train_size:]
y_train = target[:train_size]
y_test = target[train_size:]
x_train_label = data_label[:train_size]
x_test_label = data_label[train_size:]
# x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=20)
# x_train_label, x_test_label = train_test_split(data_label, test_size=0.3, random_state=20)
x_label = data_label
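# Alternative (a sketch): passing all three arrays to train_test_split in a single call
# keeps them aligned without relying on a shared random_state across separate calls:
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test, x_train_label, x_test_label = \
#     train_test_split(data, target, data_label, test_size=0.3, random_state=20)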
# Define the models / training
models_str = ['LinearRegression',  # ordinary least squares (linear regression)
              'MLPRegressor',  # multi-layer perceptron (neural network); training can misbehave, possibly because the target y is not normalized
              'DecisionTree',  # decision tree
              'SVR',  # support vector regression
              'GBDT',  # gradient boosted decision trees
              'lightGBM',  # LightGBM
              'RandomForest'  # random forest
              ]
models = [LinearRegression(),  # the normalize= parameter was removed in sklearn 1.2; the features are already normalized above
MLPRegressor(alpha=0.01), # MLPRegressor(hidden_layer_sizes=50, solver='sgd', alpha=0.0001, learning_rate_init=0.0003)
DecisionTreeRegressor(),
SVR(),
GradientBoostingRegressor(),
lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20),
RandomForestRegressor()
]
MSE_lists = []
MAE_lists = []
R2_lists = []
Run_Time_lists = []
cv_jiaocha_flag = False  # cross-validation switch: True / False
n_folds = 5  # number of cross-validation folds
Cross_val_lists = []
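# Note: cross_val_score returns an array of n_folds scores (R^2 by default for
# regressors); a typical summary is Cross_val.mean() and Cross_val.std().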
for name, model in zip(models_str, models):
    print('Training model: ' + name)
    model.fit(x_train, y_train)  # train
    startTime = time.time()
    y_train_pred = model.predict(x_train)  # predict on the training set
    y_test_pred = model.predict(x_test)  # predict on the test set
    stopTime = time.time()
    if cv_jiaocha_flag:  # cross-validation
        Cross_val = cross_val_score(model, x_train, y_train, cv=n_folds)  # n_folds-fold CV scores
    if name == 'LinearRegression':  # print the fitted linear-regression coefficients
        print('intercept: ', end=''); print(model.intercept_)
        print('coefficients: ', end=''); print(model.coef_)
    # Visualization; save the figure
    # save_path = ".\\画图数据\\" + name + ".tif"
    save_path = '.\\datasets\\ml\\' + name + '.jpg'  # on re-runs the saved image may not refresh
draw(x_train_label, x_test_label, x_label, y_train_pred, y_test_pred, target, save_path, name)
    # Evaluation metrics
    MSE = mean_squared_error(y_test, y_test_pred) ** 0.5  # square root of MSE, i.e. the RMSE (despite the variable name)
MAE = mean_absolute_error(y_test, y_test_pred)
R2 = r2_score(y_test, y_test_pred)
Run_Time = stopTime - startTime
MSE_lists.append(MSE)
MAE_lists.append(MAE)
R2_lists.append(R2)
Run_Time_lists.append(Run_Time)
    if cv_jiaocha_flag:  # cross-validation
Cross_val_lists.append(Cross_val)
print('The rmse of prediction is:', MSE)
print('The mae of prediction is:', MAE)
print('The r2 of prediction is:', R2)
print('The Run_Time of prediction is:', Run_Time)
print()
print('models: ', end='')
print(models_str)
print('MSE_lists: ', end='')
print(MSE_lists)
print('MAE_lists: ', end='')
print(MAE_lists)
print('R2_lists: ', end='')
print(R2_lists)
print('Run_Time_lists: ', end='')
print(Run_Time_lists)
if cv_jiaocha_flag:  # cross-validation
print('Cross_val_lists: ', end='')
print(Cross_val_lists)
print('finish')
importances = models[6].feature_importances_  # feature importances from models[6], the RandomForestRegressor
print(importances)
print(sorted(importances, reverse=True))  # descending order (note: this loses the feature indices)
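sorted() discards which column each importance value belongs to; np.argsort keeps the feature indices, which is what an importance ranking actually needs. A short follow-up sketch using the arrays defined above:

order = np.argsort(importances)[::-1]  # feature (column) indices, most important first
print(order)
print(importances[order])  # the importance values in ranked order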