监督学习
回归问题(regression)
预测连续量 如:房价预测
#%% md
# **Import Some Packages**
#%%
import numpy as np
import pandas as pd

#%% md
# **Some Utilities**
#%%
def prepare_for_training(data, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
    """Preprocess a raw feature matrix for training.

    Optionally standardizes the columns, appends sinusoid and polynomial
    features, then prepends a bias column of ones.

    Args:
        data: 2-D array of shape (num_examples, num_features).
        polynomial_degree: if > 0, append polynomial cross features.
        sinusoid_degree: if > 0, append sin(k * x) features for k = 1..degree.
        normalize_data: whether to standardize columns first.

    Returns:
        (data_processed, features_mean, features_deviation); the mean and
        deviation are 0 when normalization is skipped.
    """
    num_examples = data.shape[0]
    data_processed = np.copy(data)

    # Standardize features (zero mean, unit deviation) when requested.
    features_mean = 0
    features_deviation = 0
    data_normalized = data_processed
    if normalize_data:
        (data_normalized,
         features_mean,
         features_deviation) = normalize(data_processed)
        data_processed = data_normalized

    # Append sinusoid features (computed on the normalized data).
    if sinusoid_degree > 0:
        sinusoids = generate_sinusoids(data_normalized, sinusoid_degree)
        data_processed = np.concatenate((data_processed, sinusoids), axis=1)

    # Append polynomial cross features (computed on the normalized data).
    if polynomial_degree > 0:
        polynomials = generate_polynomials(data_normalized, polynomial_degree, normalize_data)
        data_processed = np.concatenate((data_processed, polynomials), axis=1)

    # Prepend a column of ones for the bias/intercept term.
    data_processed = np.hstack((np.ones((num_examples, 1)), data_processed))
    return data_processed, features_mean, features_deviation


def normalize(features):
    """Standardize each column to zero mean and unit deviation.

    Returns (features_normalized, features_mean, features_deviation).
    """
    features_normalized = np.copy(features).astype(float)
    features_mean = np.mean(features, 0)
    features_deviation = np.std(features, 0)
    # With a single example the deviation is 0 everywhere; skip centering.
    if features.shape[0] > 1:
        features_normalized -= features_mean
    # Guard against division by zero for constant columns.
    features_deviation[features_deviation == 0] = 1
    features_normalized /= features_deviation
    return features_normalized, features_mean, features_deviation


def generate_sinusoids(dataset, sinusoid_degree):
    """Generate sinusoid features sin(k * x) for k = 1..sinusoid_degree.

    BUG FIX: the original body left the generating loop commented out and
    fell off the end returning None, which crashed the np.concatenate in
    prepare_for_training whenever sinusoid_degree > 0.
    """
    num_examples = dataset.shape[0]
    sinusoids = np.empty((num_examples, 0))
    for degree in range(1, sinusoid_degree + 1):
        sinusoid_features = np.sin(degree * dataset)
        sinusoids = np.concatenate((sinusoids, sinusoid_features), axis=1)
    return sinusoids


def generate_polynomials(dataset, polynomial_degree, normalize_data=False):
    """Generate polynomial cross features.

    Splits the columns into two halves (x1, x2) and emits
    x1^(i-j) * x2^j for i = 1..polynomial_degree, j = 0..i, i.e.
    x1, x2, x1^2, x1*x2, x2^2, ...
    """
    features_split = np.array_split(dataset, 2, axis=1)
    dataset_1 = features_split[0]
    dataset_2 = features_split[1]

    (num_examples_1, num_features_1) = dataset_1.shape
    (num_examples_2, num_features_2) = dataset_2.shape

    if num_examples_1 != num_examples_2:
        raise ValueError("Can not generate polynomials for two sets with different number")
    if num_features_1 == 0 and num_features_2 == 0:
        raise ValueError("Can not generate polynomials for two sets with no columns ")

    # If one half is empty, reuse the other half for both operands.
    if num_features_1 == 0:
        dataset_1 = dataset_2
    elif num_features_2 == 0:
        dataset_2 = dataset_1

    # BUG FIX: the original compared num_features_1 < num_examples_2
    # (a feature count against an example count). The intent is to trim
    # both halves to the smaller feature count; computing it from the
    # (possibly substituted) datasets also handles the empty-half case.
    num_features = min(dataset_1.shape[1], dataset_2.shape[1])
    dataset_1 = dataset_1[:, :num_features]
    dataset_2 = dataset_2[:, :num_features]

    polynomials = np.empty((num_examples_1, 0))
    for i in range(1, polynomial_degree + 1):
        for j in range(i + 1):
            polynomial_feature = (dataset_1 ** (i - j)) * (dataset_2 ** j)
            polynomials = np.concatenate((polynomials, polynomial_feature), axis=1)

    if normalize_data:
        polynomials = normalize(polynomials)[0]
    return polynomials


#%%
class LinearRegression:
    """Linear regression trained with batch gradient descent."""

    def __init__(self, data, labels, polynomial_degree=0, sinusiod_degree=0, normalize_data=True):
        """Preprocess the data and initialize theta to zeros.

        NOTE(review): "sinusiod_degree" is a typo for "sinusoid_degree";
        kept as-is to preserve the public keyword interface.
        """
        (data_processed,
         features_mean,
         features_deviation) = prepare_for_training(data, polynomial_degree, sinusiod_degree, normalize_data)
        self.data = data_processed
        self.labels = labels
        self.features_mean = features_mean
        self.features_deviation = features_deviation
        self.polynomial_degree = polynomial_degree
        self.sinusiod_degree = sinusiod_degree
        self.normalize_data = normalize_data

        num_features = self.data.shape[1]
        # One weight per processed feature (bias column included), shape (n, 1).
        self.theta = np.zeros((num_features, 1))

    def train(self, alpha, num_iterations=500):
        """Run gradient descent; return (theta, cost_history)."""
        cost_history = self.gradient_descent(alpha, num_iterations)
        return self.theta, cost_history

    def gradient_descent(self, alpha, num_iterations):
        """Iterate gradient steps, recording the cost after each step."""
        cost_history = []
        for _ in range(num_iterations):
            self.theta = self.gradient_step(alpha)
            cost_history.append(self.cost_function(self.data, self.labels))
        return cost_history

    def gradient_step(self, alpha):
        """Return theta after one batch gradient-descent update."""
        num_examples = self.data.shape[0]
        prediction = LinearRegression.hypothesis(self.data, self.theta)
        delta = prediction - self.labels
        # theta := theta - alpha * (1/m) * X^T (X theta - y)
        theta = self.theta - alpha * (1 / num_examples) * (np.dot(delta.T, self.data)).T
        return theta

    def cost_function(self, data, labels):
        """Mean squared error cost J(theta) = (1/2m) * sum((X theta - y)^2)."""
        num_examples = data.shape[0]
        prediction = LinearRegression.hypothesis(data, self.theta)
        delta = prediction - labels
        cost = (1 / 2) * np.dot(delta.T, delta) / num_examples
        return cost[0][0]

    @staticmethod
    def hypothesis(data, theta):
        """Linear model prediction X @ theta."""
        predictions = np.dot(data, theta)
        return predictions

    def get_cost(self, data, labels):
        """Cost on new raw data (preprocessed the same way as training data)."""
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusiod_degree,
                                              self.normalize_data)[0]
        return self.cost_function(data_processed, labels)

    def predict(self, data):
        """Predict labels for new raw data."""
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusiod_degree,
                                              self.normalize_data)[0]
        prediction = LinearRegression.hypothesis(data_processed, self.theta)
        return prediction


#%%
def _main():
    """Train on the KC housing data and plot the loss curve."""
    # Imported lazily so the model code above is usable without a display backend.
    import matplotlib.pyplot as plt

    train_data = pd.read_csv("./input/kc_train.csv")
    test_data = pd.read_csv("./input/kc_test.csv")
    train_data.columns = ['销售日期', '销售价格', '卧室数', '浴室数', '房屋面积',
                          '停车面积', '楼层数', '房屋评分', '建筑面积', '地下室面积',
                          '建筑年份', '修复年份', '纬度', '经度']
    test_data.columns = ['销售日期', '卧室数', '浴室数', '房屋面积',
                         '停车面积', '楼层数', '房屋评分', '建筑面积', '地下室面积',
                         '建筑年份', '修复年份', '纬度', '经度']

    input_param_name = ["房屋面积"]
    output_param_name = ["销售价格"]
    x_train = train_data[input_param_name].values
    y_train = train_data[output_param_name].values

    num_iterations = 500
    learning_rate = 0.01
    linearRegression = LinearRegression(x_train, y_train)
    theta, cost_history = linearRegression.train(learning_rate, num_iterations)
    print("开始时的损失:", cost_history[0], theta)
    print("结束时的损失:", cost_history[-1], theta)

    # BUG FIX: x-axis previously hardcoded 500; use num_iterations so the
    # plot stays consistent if the iteration count changes.
    plt.scatter(range(1, num_iterations + 1), cost_history, label="cost")
    plt.show()


# Guarded so importing this module (e.g. to reuse LinearRegression) does not
# trigger file I/O and plotting; behavior when run as a script is unchanged.
if __name__ == "__main__":
    _main()
分类问题(classification)
预测离散量 如:肿瘤分类
