Data Preprocessing

data_reader

    from csv import reader

    def read_csv(name_of_file_to_be_read):
        # Load a CSV file into a list of rows, skipping empty lines.
        dataset = list()
        with open(name_of_file_to_be_read, 'r') as file:
            every_line_of_file = reader(file)
            for line in every_line_of_file:
                if not line:
                    continue
                dataset.append(line)
        return dataset

    def convert_string_to_float(dataset, column):
        # Convert one column from string to float, skipping the header row.
        dataset = dataset[1:]
        for row in dataset:
            row[column] = float(row[column].strip())
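
A quick usage sketch, assuming a small hypothetical file `sample.csv` with a header row; it shows that the header row is left in place by `convert_string_to_float` and has to be dropped separately (as the rescaling block below does):

    # Create a tiny CSV to exercise the reader (hypothetical example data).
    with open('sample.csv', 'w') as f:
        f.write('height,weight\n175,70\n168,58\n181,85\n')

    data = read_csv('sample.csv')
    for col in range(len(data[0])):
        convert_string_to_float(data, col)
    print(data)   # header strings untouched; the numeric rows are now floats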

rescaling

    import data_reader

    # Load the data and convert every column from string to float.
    dataset = data_reader.read_csv('diabetes.csv')
    for i in range(len(dataset[0])):
        data_reader.convert_string_to_float(dataset, i)
    dataset = dataset[1:]   # drop the header row

    def find_max_and_min_in_dataset(dataset):
        # Collect [max, min] for every column.
        max_and_min = list()
        for i in range(len(dataset[0])):
            col_values = [row[i] for row in dataset]
            value_max = max(col_values)
            value_min = min(col_values)
            max_and_min.append([value_max, value_min])
        return max_and_min

    def max_min_normalization(dataset, max_and_min):
        # Rescale every value into the range [0, 1].
        for row in dataset:
            for i in range(len(row)):
                row[i] = (row[i] - max_and_min[i][1]) / (max_and_min[i][0] - max_and_min[i][1])

    def find_mean_of_dataset(dataset):
        means = list()
        for i in range(len(dataset[0])):
            col_values = [row[i] for row in dataset]
            mean = sum(col_values) / float(len(dataset))
            means.append(mean)
        return means

    def calculate_standard_deviation_of_dataset(dataset, means):
        # Sample standard deviation of every column (divides by n - 1).
        standard_deviations = list()
        for i in range(len(dataset[0])):
            variance = [pow(row[i] - means[i], 2) for row in dataset]
            standard_deviation = pow(sum(variance) / float(len(variance) - 1), 0.5)
            standard_deviations.append(standard_deviation)
        return standard_deviations

    def standardization(dataset, means, standard_deviations):
        # Center each column on its mean and scale by its standard deviation.
        for row in dataset:
            for i in range(len(row)):
                row[i] = (row[i] - means[i]) / standard_deviations[i]

    mean_list = find_mean_of_dataset(dataset)
    standard_deviation_list = calculate_standard_deviation_of_dataset(dataset, mean_list)
    standardization(dataset, mean_list, standard_deviation_list)
    print(dataset)
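
Note that the block above only applies standardization; a minimal sketch of how the unused max_min_normalization would be applied instead, on the freshly converted dataset:

    # Rescale every column to [0, 1] instead of standardizing it.
    max_and_min = find_max_and_min_in_dataset(dataset)
    max_min_normalization(dataset, max_and_min)
    print(dataset[0])   # all values now lie between 0 and 1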

train_test_split

    from random import seed
    from random import randrange

    def train_test_split(dataset, train_proportion = 0.6):
        # Randomly move rows into the training set until it holds the requested
        # proportion; whatever remains in the copy becomes the test set.
        train_set = list()
        train_size = train_proportion * len(dataset)
        dataset_copy = list(dataset)
        while len(train_set) < train_size:
            random_choose = randrange(len(dataset_copy))
            train_set.append(dataset_copy.pop(random_choose))
        return train_set, dataset_copy
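
A quick usage sketch with a toy dataset; which rows land in each split depends on the random seed:

    seed(888)   # make the random split repeatable
    dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
    train, test = train_test_split(dataset, train_proportion = 0.6)
    print(len(train), len(test))   # 6 training rows, 4 test rows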

Model Evaluation

k_fold_cross_validation

    from random import seed
    from random import randrange

    def k_fold(dataset, k = 10):
        # Split the dataset into k folds of equal size, sampling without replacement.
        basket_for_splitted_data = list()
        fold_size = int(len(dataset) / k)
        dataset_copy = list(dataset)
        for i in range(k):
            chosen_fold = list()
            while len(chosen_fold) < fold_size:
                random_choose = randrange(len(dataset_copy))
                chosen_fold.append(dataset_copy.pop(random_choose))
            basket_for_splitted_data.append(chosen_fold)
        return basket_for_splitted_data

    seed(888)
    dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
    k_fold_split = k_fold(dataset, 5)
    print(k_fold_split)
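
The folds can drive a complete cross-validation loop. A minimal sketch, where model_fn is a hypothetical stand-in for any function with the signature model_fn(training_data, testing_data) -> predictions (for example the ZeroR classifier defined later), and the last column of each row is assumed to hold the class label:

    def cross_validate(dataset, model_fn, k = 5):
        # Each fold serves once as the test set; the other k-1 folds form the training set.
        folds = k_fold(dataset, k)
        scores = list()
        for i, test_fold in enumerate(folds):
            train_set = [row for j, fold in enumerate(folds) if j != i for row in fold]
            predictions = model_fn(train_set, test_fold)
            actual = [row[-1] for row in test_fold]
            correct = sum(1 for a, p in zip(actual, predictions) if a == p)
            scores.append(correct / len(actual))
        return scores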

calculate_accuracy

    def calculate_accuracy_of_prediction(actual_data, predicted_data):
        correct_num = 0
        for i in range(len(actual_data)):
            if actual_data[i] == predicted_data[i]:
                correct_num += 1
        return correct_num / len(actual_data)

    actual_data = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    predicted_data = [1, 1, 1, 1, 1, 0, 0, 0, 0, 1]
    print(calculate_accuracy_of_prediction(actual_data, predicted_data))

confusion_matrix

    def confusion_matrix(actual_data, predicted_data):
        # Rows of the matrix are predicted classes, columns are actual classes.
        unique_class_in_data = set(actual_data)
        matrix = [list() for x in range(len(unique_class_in_data))]
        for i in range(len(unique_class_in_data)):
            matrix[i] = [0 for x in range(len(unique_class_in_data))]
        indexing_class = dict()
        for i, class_value in enumerate(unique_class_in_data):
            indexing_class[class_value] = i
        for i in range(len(actual_data)):
            col = indexing_class[actual_data[i]]
            row = indexing_class[predicted_data[i]]
            matrix[row][col] += 1
        return unique_class_in_data, matrix

    def pretty_matrix(unique_class_in_data, matrix):
        print('Actual    ' + ' '.join(str(x) for x in unique_class_in_data))
        print('Predicted ' + '-' * 40)
        for i, x in enumerate(unique_class_in_data):
            print(' {} | {}'.format(x, ' '.join(str(y) for y in matrix[i])))

    actual_data = [0, 2, 0, 0, 0, 1, 1, 5, 2, 1]
    predicted_data = [2, 0, 2, 0, 1, 0, 0, 0, 0, 1]
    unique_class_in_data, matrix = confusion_matrix(actual_data, predicted_data)
    pretty_matrix(unique_class_in_data, matrix)

MAE and RMSE

For regression problems, the simplest ways to measure prediction error are:

Mean Absolute Error: $MAE = \frac{1}{n}\sum_{i=1}^{n}\left|\hat{y}_{i} - y_{i}\right|$

    def calculate_MAE(predicted_data, actual_data):
        sum_of_error = 0
        for i in range(len(predicted_data)):
            sum_of_error += abs(predicted_data[i] - actual_data[i])
        return sum_of_error / len(predicted_data)

    actual_data = range(1, 11)
    predicted_data = [2, 4, 3, 5, 4, 6, 5, 7, 6, 8]
    print(calculate_MAE(predicted_data, actual_data))

Root Mean Squared Error: $RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\hat{y}_{i} - y_{i})^{2}}$

    def calculate_RMSE(predicted_data, actual_data):
        sum_of_error = 0
        for i in range(len(predicted_data)):
            sum_of_error += pow(predicted_data[i] - actual_data[i], 2)
        return pow(sum_of_error / len(predicted_data), 0.5)

    actual_data = range(1, 11)
    predicted_data = [2, 4, 3, 5, 4, 6, 5, 7, 6, 8]
    print(calculate_RMSE(predicted_data, actual_data))

Building Baseline Models with Random Prediction Algorithms

Random Prediction Algorithm

    from random import seed
    from random import randrange

    def Random_Prediction_Algorithm(training_data, testing_data):
        # Collect the class labels seen in training (lists are unhashable,
        # so only the label column goes into the set).
        values = [row[-1] for row in training_data]
        unique_values = list(set(values))
        randomly_predicted_data = list()
        for row in testing_data:
            index = randrange(len(unique_values))
            randomly_predicted_data.append(unique_values[index])
        return randomly_predicted_data

    seed(888)
    training_data = [[0], [1], [2], [3], [2], [1], [0], [0], [1], [2]]
    testing_data = [[None], [None], [None], [None], [None]]
    predictions = Random_Prediction_Algorithm(training_data, testing_data)
    print(predictions)

Output:

    [0, 3, 3, 3, 3]

ZeroR Algorithm Classification

    from random import seed

    def ZeroR_Algorithm_Classification(training_data, testing_data):
        values = [row[-1] for row in training_data]
        highest_counts = max(set(values), key = values.count)
        # Predict the most frequent training class for every test row.
        ZeroR_Prediction = [highest_counts for i in range(len(testing_data))]
        return ZeroR_Prediction

    seed(888)
    training_data = [[0], [1], [2], [3], [2], [1], [0], [0], [1], [2]]
    testing_data = [[None], [None], [None], [None], [None]]
    predictions = ZeroR_Algorithm_Classification(training_data, testing_data)
    print(predictions)

Output:

    [0, 0, 0, 0, 0]

ZeroR Algorithm Regression

    from random import seed

    def ZeroR_Algorithm_Regression(training_data, testing_data):
        # Predict the mean of the training targets for every test row.
        values = [row[-1] for row in training_data]
        prediction = sum(values) / len(values)
        ZeroR_prediction = [prediction for i in range(len(testing_data))]
        return ZeroR_prediction

    seed(888)
    training_data = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
    testing_data = [[None], [None], [None], [None], [None], [None], [None], [None], [None], [None]]
    predictions = ZeroR_Algorithm_Regression(training_data, testing_data)
    print(predictions)

Output:

    [5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5]
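
Putting the pieces together: the accuracy of a baseline predictor is the score any real model has to beat. A small sketch using the functions defined above, with a hypothetical labelled test set (the last column of each test row is its true class):

    from random import seed, randrange   # randrange is used inside Random_Prediction_Algorithm

    seed(888)
    training_data = [[0], [1], [2], [3], [2], [1], [0], [0], [1], [2]]
    testing_data = [[0], [1], [2], [0], [1]]          # hypothetical labelled test rows
    actual = [row[-1] for row in testing_data]

    zero_r = ZeroR_Algorithm_Classification(training_data, testing_data)
    random_guess = Random_Prediction_Algorithm(training_data, testing_data)

    print(calculate_accuracy_of_prediction(actual, zero_r))
    print(calculate_accuracy_of_prediction(actual, random_guess))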

Python and Statistics

Descriptive Statistics and Inferential Statistics

Descriptive statistics

  1. Mean
  2. Median

Statistical inference

Collecting data on the entire population is too expensive, so we use samples to draw inferences about the whole population.

sample → sampling → statistic → estimate → hypothesis test → population (parameter)

Estimate:

  1. Point estimation (sample size: n = 100, $\overline{X} = 175$)
  2. Confidence interval (confidence level: 95%, $\mu = \overline{X} \pm 1.96 \times S_{\overline{X}}$)
    1. $\overline{X}$ is the sample mean and $S_{\overline{X}}$ is the standard error of the mean
    2. Where the "95%" and "1.96" come from: 1.96 is the two-tailed z value that corresponds to a 95% confidence level (a short sketch follows this list)
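
A minimal sketch of both kinds of estimate, assuming a hypothetical sample of measured heights (the 1.96 multiplier is the two-tailed 95% value of the standard normal distribution):

    # Hypothetical sample of 10 heights (cm); the example above uses n = 100.
    sample = [172, 178, 175, 169, 181, 174, 177, 170, 176, 173]
    n = len(sample)

    x_bar = sum(sample) / n                                        # point estimate of mu
    s = (sum((x - x_bar) ** 2 for x in sample) / (n - 1)) ** 0.5   # sample std (divides by n - 1)
    standard_error = s / n ** 0.5                                  # S of the sample mean

    lower = x_bar - 1.96 * standard_error
    upper = x_bar + 1.96 * standard_error
    print(x_bar, (lower, upper))                                   # 95% confidence interval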

Hypothesis test:

  1. State the hypotheses $H_{0}$ and $H_{a}$
    1. $H_{0}$ is the hypothesis you want to reject
    2. $H_{a}$ is the hypothesis you want to accept
    3. Example:
      1. one tail: $H_{0}: \mu \geq 175,\ H_{a}: \mu < 175$
      2. two tails: $H_{0}: \mu = 175,\ H_{a}: \mu \neq 175$
  2. Calculation (a worked sketch follows this list)
    $t = \frac{\overline{X} - 175}{S_{\overline{X}}}$
  3. Draw distribution
  4. What is your confidence?
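
A minimal sketch of the whole procedure for the two-tailed example above, reusing a hypothetical sample of heights; the 2.262 cutoff is the two-tailed 5% critical value from a t-table for df = 9:

    # Hypothetical sample of 10 heights (cm).
    sample = [172, 178, 175, 169, 181, 174, 177, 170, 176, 173]
    n = len(sample)

    x_bar = sum(sample) / n
    s = (sum((x - x_bar) ** 2 for x in sample) / (n - 1)) ** 0.5   # sample std (divides by n - 1)
    standard_error = s / n ** 0.5

    # Step 2: t statistic for H0: mu = 175.
    t = (x_bar - 175) / standard_error

    # Steps 3-4: compare |t| with the critical value of the t distribution
    # (df = n - 1 = 9, two-tailed, 5% significance level -> about 2.262).
    print(t, abs(t) > 2.262)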

N-1 and Bessel's Correction

Population Y:

  1. $Mean(Y) = \mu = \frac{1}{N}\sum_{i=1}^{N}Y_{i}$
  2. $Var(Y) = \sigma^{2} = \frac{1}{N}\sum_{i=1}^{N}(Y_{i} - \mu)^{2}$

Sample X:

  1. $Mean(X) = \overline{X} = \frac{1}{n}\sum_{i=1}^{n}X_{i}$
  2. $Var(X) = S^{2} = \frac{1}{n-1}\sum_{i=1}^{n}(X_{i} - \overline{X})^{2}$ (Bessel's correction: divide by n-1 rather than n to get an unbiased estimate)

Randomly generate a population of 100,000 values

    %matplotlib inline
    import matplotlib.pyplot as plt
    from IPython.core.pylabtools import figsize
    import pandas as pd
    import numpy as np

    np.random.seed(42)
    population_size = 100000
    # 100,000 integers drawn uniformly from 1 to 10.
    Population = pd.Series(np.random.randint(1, 11, population_size))

    Population.head()

    0    7
    1    4
    2    8
    3    5
    4    7
    dtype: int32

Draw 500 random samples

    samples = {}
    sample_size = 30
    num_of_samples = 500
    for i in range(num_of_samples):
        # Each sample is 30 values drawn from the population.
        samples[i] = Population.sample(sample_size).reset_index(drop = True)

    samples = pd.DataFrame(samples)
    samples
(output: a DataFrame of 30 rows × 500 columns; each of the 500 columns holds one sample of 30 values)

    # Delta degrees of freedom:
    # ddof = 0, divided by n; ddof = 1, divided by n-1
    biased_samples = samples.var(ddof = 0).to_frame()
    biased_samples.head()

              0
    0  7.965556
    1  7.782222
    2  4.632222
    3  9.410000
    4  9.560000
    biased_samples = biased_samples.expanding().mean()
    biased_samples.head()

              0
    0  7.965556
    1  7.873889
    2  6.793333
    3  7.447500
    4  7.870000
    biased_samples.columns = ['biased var estimate (divided by n)']
    biased_samples.plot()

    <matplotlib.axes._subplots.AxesSubplot at 0x2660c0b7b70>

(figure: running mean of the biased variance estimates across samples)

    unbiased_samples = samples.var(ddof = 1).to_frame()
    unbiased_samples.head()

              0
    0  8.240230
    1  8.050575
    2  4.791954
    3  9.734483
    4  9.889655
    unbiased_samples = unbiased_samples.expanding().mean()
    unbiased_samples.head()

              0
    0  8.240230
    1  8.145402
    2  7.027586
    3  7.704310
    4  8.141379
    unbiased_samples.columns = ['unbiased var estimate (divided by n-1)']
    unbiased_samples.plot()

    <matplotlib.axes._subplots.AxesSubplot at 0x2660c3e6588>

(figure: running mean of the unbiased variance estimates across samples)

    ax = unbiased_samples.plot()
    biased_samples.plot(ax = ax)
    real_population_variance = pd.Series(Population.var(ddof = 0), index = samples.columns)
    real_population_variance.plot(ax = ax)   # true population variance as a reference line

    <matplotlib.axes._subplots.AxesSubplot at 0x2660c487f98>

(figure: biased vs. unbiased estimates converging toward the true population variance)

Using hypothesis testing to check a correlation

T-test: is the correlation reliable?

Step 1: Set up the null hypothesis of no correlation

Two-tailed test

  1. $H_{0}: \rho = 0$ (the variables are not linearly correlated)
  2. $H_{a}: \rho \neq 0$

Step 2: Compute the t statistic

$t = \frac{r\sqrt{n - 2}}{\sqrt{1 - r^{2}}}$ (the usual t statistic for a Pearson correlation $r$ over $n$ observations; a worked sketch follows Step 4)

Degrees of freedom
When a sample statistic is used to estimate a population parameter, the degrees of freedom are the number of values in the sample that are independent and free to vary. (For the correlation test above, df = n - 2.)

Step 3: Draw the distribution and look up the t-distribution table

Step 4: Make a decision
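
A minimal end-to-end sketch of the four steps with made-up data, using numpy (already imported above); the 2.048 cutoff is the two-tailed 5% critical value from a t-table for df = 28:

    import numpy as np

    np.random.seed(42)

    # Hypothetical data: y depends linearly on x plus noise.
    n = 30
    x = np.random.normal(0, 1, n)
    y = 0.5 * x + np.random.normal(0, 1, n)

    # Step 1: H0: rho = 0, Ha: rho != 0 (two-tailed).
    r = np.corrcoef(x, y)[0, 1]

    # Step 2: t statistic for a Pearson correlation, df = n - 2.
    t = r * np.sqrt(n - 2) / np.sqrt(1 - r ** 2)

    # Steps 3-4: reject H0 only if |t| exceeds the critical value (about 2.048 for df = 28).
    print(r, t, abs(t) > 2.048)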

Limitations

  1. Nonlinear correlation (illustrated by the sketch below)
  2. Outliers
  3. Spurious correlation
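
A quick illustration of the first limitation with made-up data: a relationship that is perfectly deterministic but nonlinear can still give a Pearson correlation of (nearly) zero, so the t-test above would wrongly suggest there is no relationship:

    import numpy as np

    x = np.linspace(-3, 3, 101)   # symmetric around zero
    y = x ** 2                    # y is completely determined by x, but not linearly

    r = np.corrcoef(x, y)[0, 1]
    print(r)                      # close to 0: the linear correlation misses the pattern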