import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
# NOTE: sklearn.cross_validation was removed in modern sklearn;
# model_selection is the supported module.
from sklearn.model_selection import train_test_split, cross_val_score

# ### STEP 1 ### Load the adult-income data.
input_file = r'R:\Ricky\程序\数据仓库与挖掘\实验五_分类算法\adult.data.txt'
X = []
y = []
num_lessthan50k = 0
num_morethan50k = 0
num_threshold = 30000  # cap on rows kept per class

with open(input_file, 'r') as f:
    for line in f:  # iterate lazily instead of f.readlines()
        if '?' in line:  # drop rows with missing values
            continue
        data = line[:-1].split(', ')
        # Problem 1: keep equally many '<=50K' and '>50K' rows
        # (the two branches are mutually exclusive, hence elif).
        if data[-1] == '<=50K' and num_lessthan50k < num_threshold:
            X.append(data)
            num_lessthan50k += 1
        elif data[-1] == '>50K' and num_morethan50k < num_threshold:
            X.append(data)
            num_morethan50k += 1
        if num_lessthan50k >= num_threshold and num_morethan50k >= num_threshold:
            break

X = np.array(X)
for i, item in enumerate(X[0]):
    print(i, item)

# ### STEP 2 ### Encode string attributes as integers.
label_encoder = []
X_encoded = np.empty(X.shape)
# Only X[0] is inspected: every row has the same column types, so one
# sample row is enough to decide which columns are numeric.
for i, item in enumerate(X[0]):
    if item.isdigit():  # numeric column: copy through unchanged
        X_encoded[:, i] = X[:, i]
    else:
        # Problem 2: map each distinct string to an integer code.
        le = preprocessing.LabelEncoder()
        label_encoder.append(le)
        X_encoded[:, i] = le.fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)  # features
y = X_encoded[:, -1].astype(int)   # target (income class)

# ### STEP 3 ### Problem 3: build and train the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)
classifier_gaussiannb = GaussianNB()
classifier_gaussiannb.fit(X_train, y_train)
y_test_pred = classifier_gaussiannb.predict(X_test)

# Problem 3: cross-validated weighted F1 score.
f1 = cross_val_score(classifier_gaussiannb, X, y,
                     scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")

# ### STEP 4 ### Encode one new individual with the same encoders, then classify.
input_data = ['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married',
              'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0',
              '40', 'United-States']
count = 0
input_data_encoded = [-1] * len(input_data)
for i, item in enumerate(input_data):
    if item.isdigit():
        input_data_encoded[i] = int(item)
    else:
        # Reuse the encoder fitted on the matching string column; [0]
        # extracts the scalar (int() on a 1-element array is deprecated).
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count += 1

input_data_encoded = np.array(input_data_encoded)

# Problem 4: predict the individual's class and print it.
# reshape(1, -1): predict expects a 2-D array of samples.
output_class = classifier_gaussiannb.predict(input_data_encoded.reshape(1, -1))
print(label_encoder[-1].inverse_transform(output_class)[0])
import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

# ### STEP 1 ### Load the adult-income data (same preprocessing as the
# GaussianNB experiment).
input_file = r'R:\Ricky\程序\数据仓库与挖掘\实验五_分类算法\adult.data.txt'
X = []
y = []
num_lessthan50k = 0
num_morethan50k = 0
num_threshold = 30000  # cap on rows kept per class

with open(input_file, 'r') as f:
    for line in f:  # iterate lazily instead of f.readlines()
        if '?' in line:  # drop rows with missing values
            continue
        data = line[:-1].split(', ')
        # Problem 1: keep equally many '<=50K' and '>50K' rows.
        if data[-1] == '<=50K' and num_lessthan50k < num_threshold:
            X.append(data)
            num_lessthan50k += 1
        elif data[-1] == '>50K' and num_morethan50k < num_threshold:
            X.append(data)
            num_morethan50k += 1
        if num_lessthan50k >= num_threshold and num_morethan50k >= num_threshold:
            break

X = np.array(X)

# ### STEP 2 ### Encode string attributes as integers (one sample row is
# enough to decide which columns are numeric).
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
    else:
        le = preprocessing.LabelEncoder()
        label_encoder.append(le)
        X_encoded[:, i] = le.fit_transform(X[:, i])

X = X_encoded[:, :-1].astype(int)  # features
y = X_encoded[:, -1].astype(int)   # target (income class)

# Build a KNN classifier.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# KNN is distance-based, so features must be standardised first.
stand = StandardScaler()
# 5-neighbour classifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)
# BUG FIX: the scaler was created but never applied, so KNN ran on raw
# unscaled features. Fit on the training split only, then apply the same
# transform to the test split.
X_train = stand.fit_transform(X_train)
X_test = stand.transform(X_test)
knn.fit(X_train, y_train)
y_test_pred = knn.predict(X_test)

# Cross-validated weighted F1 score on standardised data.
# NOTE(review): scaling all of X before CV leaks a little information
# between folds; a sklearn Pipeline would avoid that.
f1 = cross_val_score(knn, StandardScaler().fit_transform(X), y,
                     scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")
# 自己实现 KNN 算法 — hand-rolled KNN implementation
import numpy as np
import operator
import pandas as pd


def KNNClassify(newInput, dataset, labels, k, weight):
    """Classify one new sample with a hand-rolled k-nearest-neighbour vote.

    Parameters
    ----------
    newInput : 1-D array-like — the single sample to classify (x_test);
        this classifier handles exactly one new input per call.
    dataset : 2-D ndarray — training samples (x_train), one row per sample.
    labels : 1-D array-like — class label per training row, shaped like
        ['A', 'B'], not [['A'], ['B']].
    k : int — number of nearest neighbours that vote.
    weight : str — decision rule: "uniform" for majority vote,
        "distance" for inverse-distance weighted vote.

    Returns the winning label.

    Raises ValueError for an unknown `weight` (the original printed an
    error, broke out of the loop, and then crashed with IndexError on the
    empty vote dict — now rejected up front).
    """
    if weight not in ("uniform", "distance"):
        raise ValueError(
            '分类决策规则错误! "uniform"多数表决法 "distance"距离加权表决法')

    numSamples = dataset.shape[0]

    # Step 1: Euclidean distance from the new sample to every training row
    # (vectorised numpy element-wise ops: sqrt of the row-wise sum of
    # squared differences).
    diff = np.tile(newInput, (numSamples, 1)) - dataset
    squaredist = diff ** 2
    distance = (squaredist.sum(axis=1)) ** 0.5  # axis=1: sum along each row

    # Step 2: indices of training rows sorted by ascending distance.
    sortedDistance = distance.argsort()

    # Count the votes of the k nearest neighbours.
    classCount = {}
    for i in range(k):
        # label of the i-th nearest training sample
        votelabel = labels[sortedDistance[i]]
        if weight == "uniform":
            # majority vote: every neighbour counts 1
            classCount[votelabel] = classCount.get(votelabel, 0) + 1
        else:
            # inverse-distance weighting; an exact match (distance 0)
            # yields inf under numpy semantics and simply dominates
            classCount[votelabel] = (classCount.get(votelabel, 0)
                                     + 1 / distance[sortedDistance[i]])

    # Highest (weighted) count wins.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    if weight == "uniform":
        print("新输入到训练集的最近%d个点的计数为:" % k, "\n", classCount)
    else:
        print("新输入到训练集的最近%d个点的距离加权计数为:" % k, "\n", classCount)
    print("新输入的类别是:", sortedClassCount[0][0])
    return sortedClassCount[0][0]


if __name__ == '__main__':
    # Guarded so importing this module no longer requires iris.txt
    # (originally this demo ran unconditionally at module level).
    iris = pd.read_csv("iris.txt")  # load the iris data file
    iris.head()
    iris_x = iris.iloc[:, [0, 1, 2, 3]]  # feature columns by position
    iris_y = iris.iloc[:, [4]]           # label column
    indices = np.random.permutation(len(iris_x))  # shuffle the rows
    iris_x_train = iris_x.iloc[indices[0:130]]
    iris_y_train = iris_y.iloc[indices[0:130]]
    iris_x_test = iris_x.iloc[indices[130:150]]
    iris_y_test = iris_y.iloc[indices[130:150]]
    # Convert DataFrames to numpy arrays for the classifier.
    iris_x_train = np.array(iris_x_train)
    iris_y_train = np.array(iris_y_train)
    iris_x_test = np.array(iris_x_test)
    iris_y_test = np.array(iris_y_test)
    # labels must be flat: shape (130,), not (130, 1)
    iris_y_train.shape = (130,)

    test_index = 12
    predict = KNNClassify(iris_x_test[test_index], iris_x_train,
                          iris_y_train, 20, "distance")
    print(predict)
    print("新输入的实际类别是:", iris_y_test[test_index])
    print("\n")
    if predict == iris_y_test[test_index]:
        print("预测准确!")
    else:
        print("预测错误!")