import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

###STEP1###
# Load the data
input_file = r'R:\Ricky\程序\数据仓库与挖掘\实验五_分类算法\adult.data.txt'
X = []
y = []
num_lessthan50k = 0
num_morethan50k = 0
num_threshold = 30000  # cap on the number of samples kept per class

with open(input_file, 'r') as f:
    for line in f.readlines():
        if '?' in line:  # drop any row with missing values
            continue
        data = line[:-1].split(', ')
        # Question 1: select the same number of samples for each target value,
        # i.e. keep equal counts of '<=50K' and '>50K' rows
        if data[-1] == '<=50K' and num_lessthan50k < num_threshold:
            X.append(data)
            num_lessthan50k = num_lessthan50k + 1
        if data[-1] == '>50K' and num_morethan50k < num_threshold:
            X.append(data)
            num_morethan50k = num_morethan50k + 1
        if num_lessthan50k >= num_threshold and num_morethan50k >= num_threshold:
            break

X = np.array(X)
# preview the first sample's fields together with their column indices
for i, item in enumerate(X[0]):
    print(i, item)
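# Sanity check: the sampling above is only balanced when num_threshold does not
# exceed the size of the smaller class; with a threshold this large the loop
# simply keeps every clean row of both classes. Printing the counters shows
# what was actually kept:
print('<=50K rows kept:', num_lessthan50k)
print('>50K rows kept:', num_morethan50k)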
###STEP2###
label_encoder = []
X_encoded = np.empty(X.shape)
# Only X[0] is inspected because every row has the same column types; a single
# sample row is enough to tell whether each column is numeric or a string.
for i, item in enumerate(X[0]):  # enumerate turns ['hello', 'world'] into [(0, 'hello'), (1, 'world')]
    if item.isdigit():  # numeric column: copy the whole column as-is
        X_encoded[:, i] = X[:, i]
    else:
        # Question 2: the raw data contains English strings; convert them to
        # numeric codes with preprocessing.LabelEncoder()
        le = preprocessing.LabelEncoder()
        label_encoder.append(le)
        X_encoded[:, i] = le.fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)
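# Each fitted LabelEncoder remembers its string-to-integer mapping in its
# classes_ attribute; the last encoder in the list is the one fitted on the
# target column, so this shows how '<=50K' and '>50K' were encoded:
print(label_encoder[-1].classes_)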
###STEP3###
# Question 3: build the classifier and train it
# Cross-validation
# from sklearn import cross_validation  # this module was removed from newer
# versions of sklearn, so it can no longer be imported this way
from sklearn.model_selection import train_test_split, cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)
classifier_gaussiannb = GaussianNB()
classifier_gaussiannb.fit(X_train, y_train)
y_test_pred = classifier_gaussiannb.predict(X_test)

# Question 3: compute the F1 score
f1 = cross_val_score(classifier_gaussiannb, X, y, scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")
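# The same cross_val_score call accepts other metrics; for example,
# scoring='accuracy' reports the cross-validated accuracy alongside the F1 score:
accuracy = cross_val_score(classifier_gaussiannb, X, y, scoring='accuracy', cv=5)
print("Accuracy: " + str(round(100 * accuracy.mean(), 2)) + "%")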
###STEP4###
# Build a single example and run it through the same encoding
input_data = ['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States']
count = 0
input_data_encoded = [-1] * len(input_data)
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(input_data[i])
    else:
        # reuse the encoder fitted on this column; transform returns an array,
        # so take its single element
        input_data_encoded[i] = int(label_encoder[count].transform([input_data[i]])[0])
        count = count + 1
input_data_encoded = np.array(input_data_encoded)

# Question 4: classify the individual and print the result
# prediction is a direct call to the predict method
output_class = classifier_gaussiannb.predict(input_data_encoded.reshape(1, -1))  # reshape into a 2-D array with one row, otherwise sklearn raises an error
print(label_encoder[-1].inverse_transform(output_class)[0])
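predict only returns the encoded class. A minimal sketch of how GaussianNB's predict_proba could be used on the same encoded example to see the model's per-class confidence (its columns line up with the target encoder's classes_, since LabelEncoder assigns 0 and 1 in that same order):

proba = classifier_gaussiannb.predict_proba(input_data_encoded.reshape(1, -1))
for cls, p in zip(label_encoder[-1].classes_, proba[0]):
    print(cls, round(float(p), 4))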
import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

###STEP1###
# Load the data
input_file = r'R:\Ricky\程序\数据仓库与挖掘\实验五_分类算法\adult.data.txt'
X = []
y = []
num_lessthan50k = 0
num_morethan50k = 0
num_threshold = 30000  # cap on the number of samples kept per class

with open(input_file, 'r') as f:
    for line in f.readlines():
        if '?' in line:  # drop any row with missing values
            continue
        data = line[:-1].split(', ')
        # Question 1: select the same number of samples for each target value,
        # i.e. keep equal counts of '<=50K' and '>50K' rows
        if data[-1] == '<=50K' and num_lessthan50k < num_threshold:
            X.append(data)
            num_lessthan50k = num_lessthan50k + 1
        if data[-1] == '>50K' and num_morethan50k < num_threshold:
            X.append(data)
            num_morethan50k = num_morethan50k + 1
        if num_lessthan50k >= num_threshold and num_morethan50k >= num_threshold:
            break

X = np.array(X)

###STEP2###
label_encoder = []
X_encoded = np.empty(X.shape)
# Only X[0] is inspected because every row has the same column types; a single
# sample row is enough to tell whether each column is numeric or a string.
for i, item in enumerate(X[0]):  # enumerate turns ['hello', 'world'] into [(0, 'hello'), (1, 'world')]
    if item.isdigit():  # numeric column: copy the whole column as-is
        X_encoded[:, i] = X[:, i]
    else:
        # Question 2: convert the English strings to numeric codes with preprocessing.LabelEncoder()
        le = preprocessing.LabelEncoder()
        label_encoder.append(le)
        X_encoded[:, i] = le.fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)
# Build a KNN classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# The features should be standardized before fitting KNN
stand = StandardScaler()
X = stand.fit_transform(X)
# Build a classifier with 5 neighbours
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=1)

# Cross-validation
from sklearn.model_selection import train_test_split, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)
knn.fit(X_train, y_train)
y_test_pred = knn.predict(X_test)

# Compute the score
f1 = cross_val_score(knn, X, y, scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")
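Strictly speaking, fitting the scaler on the full matrix before cross-validation leaks statistics from the validation folds. A minimal sketch of the leakage-free pattern, assuming sklearn's Pipeline, which re-fits the scaler inside each fold:

from sklearn.pipeline import make_pipeline

knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
f1_pipeline = cross_val_score(knn_pipeline, X, y, scoring='f1_weighted', cv=5)
print("Pipeline F1 score: " + str(round(100 * f1_pipeline.mean(), 2)) + "%")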

Implementing the KNN algorithm from scratch

import numpy as np
import operator
import pandas as pd

# newInput: the new sample to classify (x_test); this classifier handles one sample at a time
# dataset:  the training data (x_train), an array with one training sample per row
# labels:   the class labels of the training set (y_train), shaped like ['A', 'B'] rather than [['A'], ['B']]
# k:        the number of nearest neighbours
# weight:   decision rule, "uniform" for majority voting, "distance" for distance-weighted voting
def KNNClassify(newInput, dataset, labels, k, weight):
    numSamples = dataset.shape[0]
    """step 1: compute the distance from the new sample to every training point
    (Euclidean distance: square root of the sum of squared differences)"""
    diff = np.tile(newInput, (numSamples, 1)) - dataset  # element-wise arithmetic, where numpy arrays shine
    squaredist = diff ** 2
    distance = (squaredist.sum(axis=1)) ** 0.5  # axis=1 sums within each row
    """step 2: sort the distances in ascending order and take the k nearest neighbours"""
    # argsort returns the indices that would sort distance in ascending order
    sortedDistance = distance.argsort()
    # an empty dict holding the class tally of the k nearest neighbours
    classCount = {}
    # tally the classes of the k nearest neighbours
    for i in range(k):
        # the class of the i-th nearest neighbour, looked up via its index in distance
        votelabel = labels[sortedDistance[i]]
        if weight == "uniform":
            # majority voting: add 1 for every neighbour with the same label (key)
            classCount[votelabel] = classCount.get(votelabel, 0) + 1
        elif weight == "distance":
            # distance-weighted voting: closer neighbours contribute more to the same key
            classCount[votelabel] = classCount.get(votelabel, 0) + (1 / distance[sortedDistance[i]])
        else:
            print("Unknown decision rule!")
            print('"uniform": majority voting; "distance": distance-weighted voting')
            break
    # sort the tally in descending order and return the class with the most votes
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    if weight == "uniform":
        print("Vote counts of the %d nearest neighbours:" % k, "\n", classCount)
        print("Predicted class:", sortedClassCount[0][0])
    elif weight == "distance":
        print("Distance-weighted vote counts of the %d nearest neighbours:" % k, "\n", classCount)
        print("Predicted class:", sortedClassCount[0][0])
    return sortedClassCount[0][0]
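# The np.tile construction in step 1 can equivalently be written with numpy
# broadcasting, which subtracts newInput from every row of dataset directly;
# a small sketch (the helper name euclidean_distances is illustrative):
def euclidean_distances(newInput, dataset):
    # (dataset - newInput) broadcasts the 1-D newInput across all rows
    return np.sqrt(((dataset - newInput) ** 2).sum(axis=1))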
iris = pd.read_csv("iris.txt")  # read the data file
iris.head()
iris_x = iris.iloc[:, [0, 1, 2, 3]]  # select the feature columns by position
iris_y = iris.iloc[:, [4]]
indices = np.random.permutation(len(iris_x))  # shuffle the sample indices
iris_x_train = iris_x.iloc[indices[0:130]]
iris_y_train = iris_y.iloc[indices[0:130]]
iris_x_test = iris_x.iloc[indices[130:150]]
iris_y_test = iris_y.iloc[indices[130:150]]
# convert the DataFrames to numpy arrays so the classifier can work with them
iris_x_train = np.array(iris_x_train)
iris_y_train = np.array(iris_y_train)
iris_x_test = np.array(iris_x_test)
iris_y_test = np.array(iris_y_test)
# flatten the labels to shape (130,)
iris_y_train.shape = (130,)

# if __name__ == '__main__':
#     test_index = 12
#     predict = KNNClassify(iris_x_test[test_index], iris_x_train, iris_y_train, 20, "distance")
#     print(predict)
#     print("Actual class of the new sample:", iris_y_test[test_index])
#     print("\n")
#
#     if predict == iris_y_test[test_index]:
#         print("Prediction correct!")
#     else:
#         print("Prediction wrong!")
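A minimal sketch of how the commented-out driver above could be extended to score every held-out sample, assuming the same variable names; each row of iris_y_test is a length-1 array, hence the [0]:

correct = 0
for test_index in range(len(iris_x_test)):
    predict = KNNClassify(iris_x_test[test_index], iris_x_train, iris_y_train, 20, "distance")
    if predict == iris_y_test[test_index][0]:
        correct += 1
print("Test accuracy: %.2f%%" % (100 * correct / len(iris_x_test)))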