使用场景-当训练数据不呈线性关系时,多项式回归通过给数据升维来进行拟合。

测试用例

  1. import numpy as np
  2. from matplotlib import pyplot as plt
  3. x = np.random.uniform(-3, 3, size=100)
  4. X = x.reshape(-1, 1)
  5. y = 0.5 * x**2 + x + 2 + np.random.normal(0, 1, size=100)
  6. plt.scatter(x, y)

image.png

使用线性模型拟合

  1. from sklearn.linear_model import LinearRegression
  2. lin1 = LinearRegression()
  3. lin1.fit(X, y)
  4. y_p = lin1.predict(X)
  5. plt.scatter(x, y)
  6. plt.plot(x, y_p)

image.png

将数据升维

  1. X2 = np.hstack([X, X**2])
  2. lin2 = LinearRegression()
  3. lin2.fit(X2, y)
  4. plt.scatter(x, y)
  5. plt.plot(np.sort(x), lin2.predict(X2)[np.argsort(x)], color='r')

image.png

使用PolynomialFeatures封装多项式回归

  1. from sklearn.preprocessing import PolynomialFeatures
  2. def PolynomiaRegression(X, y, degree):
  3. poly = PolynomialFeatures(degree=degree)
  4. X2 = poly.fit_transform(X)
  5. lin3 = LinearRegression()
  6. lin3.fit(X2, y)
  7. return lin3
  8. poly_reg = PolynomiaRegression(X, y, 2)
  9. poly = PolynomialFeatures(degree=2)
  10. X3 = poly.fit_transform(X)
  11. plt.scatter(x, y)
  12. plt.plot(np.sort(x), poly_reg.predict(X3)[np.argsort(x)], color='r')

image.png

Pipeline-管道的使用

  1. from sklearn.pipeline import Pipeline
  2. from sklearn.preprocessing import StandardScaler
  3. def PolynomiaRegression(degree):
  4. return Pipeline([
  5. ('poly', PolynomialFeatures(degree=degree)),
  6. ('std', StandardScaler()),
  7. ('line', LinearRegression())
  8. ])
  9. poly_reg = PolynomiaRegression(2)
  10. poly_reg.fit(X, y)
  11. plt.scatter(x, y)
  12. plt.plot(np.sort(x), poly_reg.predict(X)[np.argsort(x)], color='r')

image.png

过拟合和欠拟合

过拟合:训练数据集误差较小,测试集误差较大。
欠拟合:训练集和测试集的误差都大。

  1. # 引入均方误差
  2. from sklearn.metrics import mean_squared_error
  3. from sklearn.model_selection import train_test_split
  4. # 简单线性回归模型训练和预测结果
  5. lin1 = LinearRegression()
  6. lin1.fit(X_train, y_train)
  7. # 训练集误差
  8. print("训练集误差:", mean_squared_error(y_train, lin1.predict(X_train)))
  9. print("测试集误差:", mean_squared_error(y_test, lin1.predict(X_test)))

image.png

  1. # 多项式回归模型
  2. poly_reg = PolynomiaRegression(2)
  3. poly_reg.fit(X_train, y_train)
  4. print("训练集误差:", mean_squared_error(y_train, poly_reg.predict(X_train)))
  5. print("测试集误差:", mean_squared_error(y_test, poly_reg.predict(X_test)))

image.png

  1. # 多项式过拟合模型
  2. poly_reg20 = PolynomiaRegression(30)
  3. poly_reg20.fit(X, y)
  4. plt.scatter(x, y)
  5. plt.plot(np.sort(x), poly_reg20.predict(X)[np.argsort(x)], color='r')

image.png

  1. # 过拟合拟合的误差
  2. poly_reg30 = PolynomiaRegression(30)
  3. poly_reg30.fit(X_train, y_train)
  4. print("训练集误差:", mean_squared_error(y_train, poly_reg30.predict(X_train)))
  5. print("测试集误差:", mean_squared_error(y_test, poly_reg30.predict(X_test)))

image.png

交叉验证-网格搜索

在机器学习中将数据分成训练集、验证集、测试集:训练集+验证集用来建立模型,测试集用来评价模型的好坏,不参与模型的建立。不直接用训练集+测试集建立模型的原因,是防止在建立模型的时候对测试数据过拟合。

  1. from sklearn import datasets
  2. from sklearn.neighbors import KNeighborsClassifier
  3. from sklearn.model_selection import cross_val_score
  4. knn_clf = KNeighborsClassifier()
  5. cross_val_score(knn_clf, X_digit_train, y_digit_train,cv=5)

image.png

  1. from sklearn.model_selection import GridSearchCV
  2. param_grid = [
  3. {
  4. 'weights': ['distance'],
  5. 'n_neighbors': [i for i in range(1, 11)],
  6. 'p': [i for i in range(1, 5)],
  7. },
  8. ]
  9. grid = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=4, cv=5)
  10. grid.fit(X_digit_train, y_digit_train)
  11. #最佳参数
  12. grid.best_params_
  13. best_knn = grid.best_estimator_
  14. best_knn.fit(X_digit_train, y_digit_train)
  15. best_knn.score(X_digit_test, y_digit_test)

image.png

正则化-L1-L2

正则化的目的主要是解决模型的过拟合问题,限制参数的大小。
L1:可以对特征进行选择,L2:将参数尽可能的减小,不会去掉特征。

L2

image.png

  1. from sklearn.linear_model import Ridge
  2. def RidgeRegression(degree, alpha):
  3. return Pipeline([
  4. ('poly', PolynomialFeatures(degree=degree)),
  5. ('std', StandardScaler()),
  6. ('ridge', Ridge(alpha=alpha))
  7. ])
  8. # 多项式过拟合模型-将alpha=0不加入正则项
  9. ridge = RidgeRegression(degree=50, alpha=0)
  10. ridge.fit(X, y)
  11. plt.scatter(x, y)
  12. plt.plot(np.sort(x), ridge.predict(X)[np.argsort(x)], color='r')

image.png

  1. ridge = RidgeRegression(degree=50, alpha=0.1)
  2. ridge.fit(X, y)
  3. plt.scatter(x, y)
  4. plt.plot(np.sort(x), ridge.predict(X)[np.argsort(x)], color='r')

image.png

  1. ridge = RidgeRegression(degree=50, alpha=10)
  2. ridge.fit(X, y)
  3. plt.scatter(x, y)
  4. plt.plot(np.sort(x), ridge.predict(X)[np.argsort(x)], color='r')

image.png

  1. # 当alpha过大时参数都是0误差最小
  2. ridge = RidgeRegression(degree=50, alpha=1000000000)
  3. ridge.fit(X, y)
  4. plt.scatter(x, y)
  5. plt.plot(np.sort(x), ridge.predict(X)[np.argsort(x)], color='r')

image.png

L1

image.png

  1. from sklearn.linear_model import Lasso
  2. def LassoRegression(degree, alpha):
  3. return Pipeline([
  4. ('poly', PolynomialFeatures(degree=degree)),
  5. ('std', StandardScaler()),
  6. ('lasso', Lasso(alpha=alpha))
  7. ])
  8. lasso = LassoRegression(degree=50, alpha=0.01)
  9. lasso.fit(X, y)
  10. plt.scatter(x, y)
  11. plt.plot(np.sort(x), lasso.predict(X)[np.argsort(x)], color='r')

image.png

  1. lasso = LassoRegression(degree=50, alpha=0.1)
  2. lasso.fit(X, y)
  3. plt.scatter(x, y)
  4. plt.plot(np.sort(x), lasso.predict(X)[np.argsort(x)], color='r')

image.png

  1. lasso = LassoRegression(degree=50, alpha=10)
  2. lasso.fit(X, y)
  3. plt.scatter(x, y)
  4. plt.plot(np.sort(x), lasso.predict(X)[np.argsort(x)], color='r')

image.png