准备数据

  import matplotlib.pyplot as plt
  import numpy as np

  # Fix the seed so the generated sample is reproducible.
  np.random.seed(666)

  # 100 points drawn uniformly from [-3, 3); X holds the same values as a
  # single-column 2-D array, the layout used for fitting later on.
  x = np.random.uniform(-3.0, 3.0, size=100)
  X = x[:, np.newaxis]

  # Quadratic ground truth 0.5*x^2 + x + 2 plus standard-normal noise.
  noise = np.random.normal(0, 1, size=100)
  y = 0.5 * x**2 + x + 2 + noise

  plt.scatter(x, y)
  plt.show()

image.png

线性回归建模

  from sklearn.linear_model import LinearRegression
  from sklearn.metrics import mean_squared_error

  # Fit a straight line to the data and predict on the training inputs once.
  lin_reg = LinearRegression()
  lin_reg.fit(X, y)
  y_predict = lin_reg.predict(X)

  # Print the metrics explicitly: a bare expression is only echoed when it is
  # the last line of a notebook cell and is silently discarded otherwise.
  print(lin_reg.score(X, y))  # R^2 on the training data: 0.49537078118650091
  print(mean_squared_error(y, y_predict))  # 3.0750025765636577

  # Order the points by x so the fitted line is drawn left to right.
  order = np.argsort(x)
  plt.scatter(x, y)
  plt.plot(x[order], y_predict[order], color='r')
  plt.show()

image.png
很多样本点没有落在拟合出的直线上,模型无法刻画数据的真实趋势,这种情况称为欠拟合。

多项式回归建模

  from sklearn.pipeline import Pipeline
  from sklearn.preprocessing import PolynomialFeatures
  from sklearn.preprocessing import StandardScaler


  def PolynomialRegression(degree):
      """Build an unfitted polynomial-regression pipeline.

      The pipeline expands the input into polynomial features up to
      ``degree``, standardizes them, and fits a linear regression on the
      result.

      Args:
          degree: Maximum polynomial degree passed to ``PolynomialFeatures``.

      Returns:
          A ``Pipeline`` with the steps ``poly``, ``std_scaler`` and
          ``lin_reg``.
      """
      return Pipeline([
          ("poly", PolynomialFeatures(degree=degree)),
          ("std_scaler", StandardScaler()),
          ("lin_reg", LinearRegression()),
      ])


  # Fit a degree-2 polynomial and predict on the training inputs.
  poly2_reg = PolynomialRegression(degree=2)
  poly2_reg.fit(X, y)
  y2_predict = poly2_reg.predict(X)

  # Training error; printed so the value is shown even when this is not the
  # last line of a notebook cell.
  print(mean_squared_error(y, y2_predict))  # 1.0987392142417856

  # Order the points by x so the fitted curve is drawn left to right.
  order = np.argsort(x)
  plt.scatter(x, y)
  plt.plot(x[order], y2_predict[order], color='r')
  plt.show()

image.png

调参

  # Fit a degree-10 polynomial and predict on the training inputs.
  poly10_reg = PolynomialRegression(degree=10)
  poly10_reg.fit(X, y)
  y10_predict = poly10_reg.predict(X)

  # Training error; printed so the value is shown even when this is not the
  # last line of a notebook cell.
  print(mean_squared_error(y, y10_predict))  # 1.0508466763764164

  # Order the points by x so the fitted curve is drawn left to right.
  order = np.argsort(x)
  plt.scatter(x, y)
  plt.plot(x[order], y10_predict[order], color='r')
  plt.show()

image.png
从图中可以看出,10 阶多项式拟合出的曲线已经有点过拟合。

  # Fit a degree-100 polynomial and predict on the training inputs.
  poly100_reg = PolynomialRegression(degree=100)
  poly100_reg.fit(X, y)
  y100_predict = poly100_reg.predict(X)

  # Training error; printed so the value is shown even when this is not the
  # last line of a notebook cell.
  print(mean_squared_error(y, y100_predict))  # 0.68743577834336944

  # Order the points by x so the fitted curve is drawn left to right.
  order = np.argsort(x)
  plt.scatter(x, y)
  plt.plot(x[order], y100_predict[order], color='r')
  plt.show()

image.png

总结

机器学习,主要解决的是过拟合问题:随着多项式阶数升高,训练集上的均方误差不断减小(3.08 → 1.10 → 1.05 → 0.69),但这并不代表模型对新数据的预测能力更好。