自实现PCA

PCA模型封装

  1. import numpy as np
  2. class PCA:
  3. def __init__(self, n_components):
  4. """初始化PCA"""
  5. assert n_components >= 1, "n_components must be valid"
  6. self.n_components = n_components
  7. self.components_ = None
  8. def fit(self, X, eta=0.01, n_iters=1e4):
  9. """获得数据集X的前n个主成分"""
  10. assert self.n_components <= X.shape[1], \
  11. "n_components must not be greater than the feature number of X"
  12. def demean(X):
  13. return X - np.mean(X, axis=0)
  14. def f(w, X):
  15. return np.sum((X.dot(w) ** 2)) / len(X)
  16. def df(w, X):
  17. return X.T.dot(X.dot(w)) * 2. / len(X)
  18. def direction(w):
  19. return w / np.linalg.norm(w)
  20. def first_component(X, initial_w, eta=0.01, n_iters=1e4, epsilon=1e-8):
  21. w = direction(initial_w)
  22. cur_iter = 0
  23. while cur_iter < n_iters:
  24. gradient = df(w, X)
  25. last_w = w
  26. w = w + eta * gradient
  27. w = direction(w)
  28. if (abs(f(w, X) - f(last_w, X)) < epsilon):
  29. break
  30. cur_iter += 1
  31. return w
  32. X_pca = demean(X)
  33. self.components_ = np.empty(shape=(self.n_components, X.shape[1]))
  34. for i in range(self.n_components):
  35. initial_w = np.random.random(X_pca.shape[1])
  36. w = first_component(X_pca, initial_w, eta, n_iters)
  37. self.components_[i,:] = w
  38. X_pca = X_pca - X_pca.dot(w).reshape(-1, 1) * w
  39. return self
  40. def transform(self, X):
  41. """将给定的X,映射到各个主成分分量中"""
  42. assert X.shape[1] == self.components_.shape[1]
  43. return X.dot(self.components_.T)
  44. def inverse_transform(self, X):
  45. """将给定的X,反向映射回原来的特征空间"""
  46. assert X.shape[1] == self.components_.shape[0]
  47. return X.dot(self.components_)
  48. def __repr__(self):
  49. return "PCA(n_components=%d)" % self.n_components

使用

  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. # 准备数据
  4. X = np.empty((100, 2))
  5. X[:,0] = np.random.uniform(0., 100., size=100)
  6. X[:,1] = 0.75 * X[:,0] + 3. + np.random.normal(0, 10., size=100)
  7. # 求解2个主成分
  8. from playML.PCA import PCA
  9. pca = PCA(n_components=2)
  10. pca.fit(X)
  11. pca.components_ # array([[ 0.76676948, 0.64192256], [-0.64191827, 0.76677307]])
  12. # 求解第一主成分
  13. pca = PCA(n_components=1)
  14. pca.fit(X)
  15. # 将数据降维(高维转为低维)
  16. X_reduction = pca.transform(X) # X_reduction.shape : (100, 1)
  17. # 将数据还原(低维转为高维)
  18. X_restore = pca.inverse_transform(X_reduction) # X_restore.shape : (100, 2)
  19. # 可视化
  20. # # 蓝色代表原数据,红色代表降维后在第一维的数据
  21. plt.scatter(X[:,0], X[:,1], color='b', alpha=0.5)
  22. plt.scatter(X_restore[:,0], X_restore[:,1], color='r', alpha=0.5)
  23. plt.show()

image.png

scikit-learn实现PCA

  1. from sklearn.decomposition import PCA
  2. # 降维
  3. pca = PCA(n_components=1)
  4. pca.fit(X)
  5. # 求解第一主成分
  6. pca.components_ # array([[-0.77670058, -0.62987 ]])
  7. # 将数据降维(高维转为低维)
  8. X_reduction = pca.transform(X)
  9. # 将数据还原(低维转为高维)
  10. X_restore = pca.inverse_transform(X_reduction)
  11. # 可视化
  12. # # 蓝色代表原数据,红色代表降维后在第一维的数据
  13. plt.scatter(X[:,0], X[:,1], color='b', alpha=0.5)
  14. plt.scatter(X_restore[:,0], X_restore[:,1], color='r', alpha=0.5)
  15. plt.show()

image.png