准备数据

import numpy as np
import matplotlib.pyplot as plt
# Fix the seed so the sample (and every metric quoted later) is reproducible.
np.random.seed(666)
# 100 feature values drawn uniformly from the interval [-3, 3).
x = np.random.uniform(low=-3.0, high=3.0, size=100)
# Column-vector view of x: one row per sample, a single feature column.
X = x.reshape(-1, 1)
# Targets follow a quadratic, 0.5*x^2 + x + 2, plus standard-normal noise.
gaussian_noise = np.random.normal(loc=0, scale=1, size=100)
y = 0.5 * x**2 + x + 2 + gaussian_noise
# Quick look at the raw sample.
plt.scatter(x, y)
plt.show()

线性回归建模

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: fit a straight line to data that was generated from a quadratic.
lin_reg = LinearRegression()
lin_reg.fit(X, y)
# Coefficient of determination (R^2) on the training data. It is printed so
# the value is visible when this runs as a plain script, not only in a REPL.
print(lin_reg.score(X, y))  # original author reported 0.49537078118650091

# Predict once and reuse the result for both the plot and the error metric.
y_predict = lin_reg.predict(X)

# Draw the fitted line left to right: order the samples by x and reorder the
# predictions the same way so each prediction stays paired with its x.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y_predict[order], color='r')
plt.show()

# Mean squared error on the training data.
print(mean_squared_error(y, y_predict))  # original author reported 3.0750025765636577

很多样本点没有落在拟合出的直线上，说明模型过于简单、无法刻画数据的真实规律，这种情况称为欠拟合（underfitting）。

多项式回归建模

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
# Pipeline configuration / initialisation
def PolynomialRegression(degree: int):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline chains three named steps:

    * ``"poly"``       -- expand the input into polynomial terms up to
      ``degree``;
    * ``"std_scaler"`` -- standardise the expanded features;
    * ``"lin_reg"``    -- ordinary least-squares fit on the scaled features.

    Args:
        degree: Maximum polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        A new ``Pipeline`` instance; call ``fit`` on it before ``predict``.
    """
    expand_features = PolynomialFeatures(degree=degree)
    standardise = StandardScaler()
    regressor = LinearRegression()
    return Pipeline(steps=[
        ("poly", expand_features),
        ("std_scaler", standardise),
        ("lin_reg", regressor),
    ])
# Fit a degree-2 polynomial, the same degree as the curve that generated y.
poly2_reg = PolynomialRegression(degree=2)
poly2_reg.fit(X, y)
# Predictions on the training inputs.
y2_predict = poly2_reg.predict(X)
# Training-set mean squared error, printed so it is visible when this runs
# as a plain script.
print(mean_squared_error(y, y2_predict))  # original author reported 1.0987392142417856
# Visualise: order the samples by x once and reuse that order for both axes
# so the fitted curve is drawn left to right.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y2_predict[order], color='r')
plt.show()

调参

# Raise the degree to 10 to see what extra model capacity does to the fit.
poly10_reg = PolynomialRegression(degree=10)
poly10_reg.fit(X, y)
y10_predict = poly10_reg.predict(X)
# Training-set mean squared error, printed so it is visible when this runs
# as a plain script.
print(mean_squared_error(y, y10_predict))  # original author reported 1.0508466763764164
# Order the samples by x once and reuse that order for both axes so the
# fitted curve is drawn left to right.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y10_predict[order], color='r')
plt.show()

虽然训练集上的均方误差比 2 阶时略小（约 1.05 对 1.10），但从图中可以看出，曲线已经开始迎合样本中的噪声，有点过拟合。

# Push the degree to 100 to make the effect of excess capacity obvious.
# NOTE(review): powers up to x**100 are numerically extreme, so the exact
# error value may vary between library versions -- confirm before quoting it.
poly100_reg = PolynomialRegression(degree=100)
poly100_reg.fit(X, y)
y100_predict = poly100_reg.predict(X)
# Training-set mean squared error, printed so it is visible when this runs
# as a plain script.
print(mean_squared_error(y, y100_predict))  # original author reported 0.68743577834336944
# Order the samples by x once and reuse that order for both axes. The line
# only connects predictions at the training points themselves.
order = np.argsort(x)
plt.scatter(x, y)
plt.plot(x[order], y100_predict[order], color='r')
plt.show()

总结

在机器学习中，我们主要要解决的是过拟合问题：随着模型复杂度升高，训练集上的误差会不断减小（上例中均方误差从约 3.08 降到约 0.69），但这并不代表模型对新数据的预测能力更好。

Python 全栈开发与分析电子书(上)-V2022

03 过拟合与欠拟合

准备数据

线性回归建模

多项式回归建模

调参

总结