
Supervised learning

Regression problems

Predict a continuous quantity, e.g. house price prediction.

Data files: kc_train.csv, kc_test.csv

**Import Some Packages**

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
```
**Some Utilities**

```python
def prepare_for_training(data, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
    num_examples = data.shape[0]
    data_processed = np.copy(data)

    # Data preprocessing: z-score normalization
    features_mean = 0
    features_deviation = 0
    data_normalized = data_processed
    if normalize_data:
        (data_normalized,
         features_mean,
         features_deviation) = normalize(data_processed)
        data_processed = data_normalized

    # Add sinusoidal features
    if sinusoid_degree > 0:
        sinusoids = generate_sinusoids(data_normalized, sinusoid_degree)
        data_processed = np.concatenate((data_processed, sinusoids), axis=1)

    # Add polynomial features
    if polynomial_degree > 0:
        polynomials = generate_polynomials(data_normalized, polynomial_degree, normalize_data)
        data_processed = np.concatenate((data_processed, polynomials), axis=1)

    # Prepend a column of ones (the bias term)
    data_processed = np.hstack((np.ones((num_examples, 1)), data_processed))

    return data_processed, features_mean, features_deviation


# Feature normalization: subtract the mean, divide by the standard deviation
def normalize(features):
    features_normalized = np.copy(features).astype(float)
    features_mean = np.mean(features, 0)
    features_deviation = np.std(features, 0)
    if features.shape[0] > 1:
        features_normalized -= features_mean
    # Avoid division by zero for constant features
    features_deviation[features_deviation == 0] = 1
    features_normalized /= features_deviation
    return features_normalized, features_mean, features_deviation


def generate_sinusoids(dataset, sinusoid_degree):
    """sin(x), sin(2x), ..., sin(sinusoid_degree * x)"""
    num_examples = dataset.shape[0]
    sinusoids = np.empty((num_examples, 0))
    for degree in range(1, sinusoid_degree + 1):
        sinusoid_features = np.sin(degree * dataset)
        sinusoids = np.concatenate((sinusoids, sinusoid_features), axis=1)
    return sinusoids


def generate_polynomials(dataset, polynomial_degree, normalize_data=False):
    """x1, x2, x1^2, x1*x2, x2^2, x1^3, x1^2*x2, ..."""
    features_split = np.array_split(dataset, 2, axis=1)
    dataset_1 = features_split[0]
    dataset_2 = features_split[1]
    (num_examples_1, num_features_1) = dataset_1.shape
    (num_examples_2, num_features_2) = dataset_2.shape
    if num_examples_1 != num_examples_2:
        raise ValueError("Can not generate polynomials for two sets with a different number of rows")
    if num_features_1 == 0 and num_features_2 == 0:
        raise ValueError("Can not generate polynomials for two sets with no columns")
    if num_features_1 == 0:
        dataset_1 = dataset_2
    elif num_features_2 == 0:
        dataset_2 = dataset_1
    num_features = num_features_1 if num_features_1 < num_features_2 else num_features_2
    dataset_1 = dataset_1[:, :num_features]
    dataset_2 = dataset_2[:, :num_features]
    polynomials = np.empty((num_examples_1, 0))
    for i in range(1, polynomial_degree + 1):
        for j in range(i + 1):
            polynomial_feature = (dataset_1 ** (i - j)) * (dataset_2 ** j)
            polynomials = np.concatenate((polynomials, polynomial_feature), axis=1)
    if normalize_data:
        polynomials = normalize(polynomials)[0]
    return polynomials
```
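As a quick sanity check (a minimal sketch, assuming the cells above have been run; the toy matrix is illustrative, not from the original data), `prepare_for_training` with default arguments z-score normalizes each column and prepends a bias column of ones:

```python
demo = np.array([[1.0, 10.0],
                 [2.0, 20.0],
                 [3.0, 30.0]])
processed, mean, deviation = prepare_for_training(demo)
print(processed)   # first column is all ones, remaining columns are normalized
print(mean)        # per-column means: [ 2. 20.]
print(deviation)   # per-column standard deviations
```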
```python
class LinearRegression:
    def __init__(self, data, labels, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
        (data_processed,
         features_mean,
         features_deviation) = prepare_for_training(data, polynomial_degree, sinusoid_degree, normalize_data)
        self.data = data_processed
        self.labels = labels
        self.features_mean = features_mean
        self.features_deviation = features_deviation
        self.polynomial_degree = polynomial_degree
        self.sinusoid_degree = sinusoid_degree
        self.normalize_data = normalize_data
        num_features = self.data.shape[1]
        self.theta = np.zeros((num_features, 1))

    def train(self, alpha, num_iterations=500):
        cost_history = self.gradient_descent(alpha, num_iterations)
        return self.theta, cost_history

    def gradient_descent(self, alpha, num_iterations):
        # Run batch gradient descent, recording the cost at every step
        cost_history = []
        for _ in range(num_iterations):
            self.theta = self.gradient_step(alpha)
            cost_history.append(self.cost_function(self.data, self.labels))
        return cost_history

    def gradient_step(self, alpha):
        # One batch update over all training examples
        num_examples = self.data.shape[0]
        prediction = LinearRegression.hypothesis(self.data, self.theta)
        delta = prediction - self.labels
        theta = self.theta
        theta = theta - alpha * (1 / num_examples) * (np.dot(delta.T, self.data)).T
        return theta

    def cost_function(self, data, labels):
        # Halved mean squared error
        num_examples = data.shape[0]
        prediction = LinearRegression.hypothesis(data, self.theta)
        delta = prediction - labels
        cost = (1 / 2) * np.dot(delta.T, delta) / num_examples
        return cost[0][0]

    @staticmethod
    def hypothesis(data, theta):
        predictions = np.dot(data, theta)
        return predictions

    def get_cost(self, data, labels):
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusoid_degree,
                                              self.normalize_data)[0]
        return self.cost_function(data_processed, labels)

    def predict(self, data):
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusoid_degree,
                                              self.normalize_data)[0]
        prediction = LinearRegression.hypothesis(data_processed, self.theta)
        return prediction
```
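For reference, `gradient_step` and `cost_function` above implement the standard batch gradient descent update and the halved mean squared error cost, for $m$ examples, design matrix $X$, and targets $y$:

$$
\theta \leftarrow \theta - \frac{\alpha}{m}\, X^{\top}(X\theta - y),
\qquad
J(\theta) = \frac{1}{2m}\,(X\theta - y)^{\top}(X\theta - y)
$$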
```python
# Assumption: the CSV files have no header row (column names are assigned manually below)
train_data = pd.read_csv("./input/kc_train.csv", header=None)
test_data = pd.read_csv("./input/kc_test.csv", header=None)
train_data.columns = ['销售日期', '销售价格', '卧室数', '浴室数', '房屋面积',
                      '停车面积', '楼层数', '房屋评分', '建筑面积', '地下室面积',
                      '建筑年份', '修复年份', '纬度', '经度']
test_data.columns = ['销售日期', '卧室数', '浴室数', '房屋面积',
                     '停车面积', '楼层数', '房屋评分', '建筑面积', '地下室面积',
                     '建筑年份', '修复年份', '纬度', '经度']

input_param_name = ["房屋面积"]
output_param_name = ["销售价格"]
x_train = train_data[input_param_name].values
y_train = train_data[output_param_name].values

# plt.scatter(x_train, y_train, label="Train data")
# plt.xlabel(input_param_name)
# plt.ylabel(output_param_name)
# plt.title("happy")
# plt.legend()
# plt.show()

num_iterations = 500
learning_rate = 0.01
linearRegression = LinearRegression(x_train, y_train)
theta, cost_history = linearRegression.train(learning_rate, num_iterations)
print("Initial cost:", cost_history[0])
print("Final cost:", cost_history[-1])
print("Learned theta:", theta.flatten())
plt.scatter(range(1, num_iterations + 1), cost_history, label="cost")
plt.xlabel("iteration")
plt.ylabel("cost")
plt.legend()
plt.show()
```
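`test_data` is loaded above but never used. A minimal sketch of scoring it with the trained model (same single input feature; the test file has no `销售价格` column, so there is nothing to compare against):

```python
x_test = test_data[input_param_name].values
y_pred = linearRegression.predict(x_test)
print(y_pred[:5])  # predicted sale prices for the first five test houses
```

Note one design quirk of the class: `predict` calls `prepare_for_training` on the data it is given, so it recomputes normalization statistics from that data rather than reusing the training `features_mean` and `features_deviation` stored on the model.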

Classification problems

Predict a discrete quantity, e.g. tumor classification.
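By analogy with the `LinearRegression` class above, here is a minimal sketch of a binary classifier: logistic regression trained by batch gradient descent. All names and the toy data are illustrative, not from the original notes.

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

class LogisticRegression:
    def __init__(self, data, labels):
        num_examples = data.shape[0]
        # Prepend a bias column of ones, as prepare_for_training does
        self.data = np.hstack((np.ones((num_examples, 1)), data))
        self.labels = labels
        self.theta = np.zeros((self.data.shape[1], 1))

    def train(self, alpha, num_iterations=500):
        # Batch update: theta := theta - alpha/m * X^T (sigmoid(X theta) - y)
        num_examples = self.data.shape[0]
        for _ in range(num_iterations):
            predictions = sigmoid(np.dot(self.data, self.theta))
            delta = predictions - self.labels
            self.theta -= alpha * (1 / num_examples) * np.dot(self.data.T, delta)
        return self.theta

    def predict(self, data):
        num_examples = data.shape[0]
        data = np.hstack((np.ones((num_examples, 1)), data))
        # Threshold the predicted probability at 0.5 to get a discrete class
        return (sigmoid(np.dot(data, self.theta)) >= 0.5).astype(int)

# Toy example: classify tumors by size (0 = benign, 1 = malignant)
x = np.array([[1.0], [2.0], [3.0], [4.0]])
y = np.array([[0], [0], [1], [1]])
model = LogisticRegression(x, y)
model.train(alpha=0.1, num_iterations=1000)
print(model.predict(x))
```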