import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
###STEP1###
# Load the adult data set, skipping any row that contains a missing field.
input_file = r'R:\Ricky\程序\数据仓库与挖掘\实验五_分类算法\adult.data.txt'
X = []
y = []
num_lessthan50k = 0
num_morethan50k = 0
num_threshold = 30000  # per-class cap on the number of samples kept
with open(input_file, 'r') as f:
    for line in f:
        if '?' in line:  # drop rows with missing values
            continue
        data = line[:-1].split(', ')
        label = data[-1]
        # Question 1: keep an equal number of samples per target class,
        # i.e. the same count of '<=50K' and '>50K' rows.
        if label == '<=50K' and num_lessthan50k < num_threshold:
            X.append(data)
            num_lessthan50k += 1
        elif label == '>50K' and num_morethan50k < num_threshold:
            X.append(data)
            num_morethan50k += 1
        if num_lessthan50k >= num_threshold and num_morethan50k >= num_threshold:
            break
X = np.array(X)
# Show one sample row so the column types are visible.
for col, value in enumerate(X[0]):
    print(col, value)
###STEP2###
# Question 2: the raw data mixes numeric and string attributes; encode every
# string column to integers using preprocessing.LabelEncoder().
label_encoder = []
X_encoded = np.empty(X.shape)
# Only X[0] is inspected: all rows share the same column types, so a single
# sample row is enough to decide which columns are strings.
for col, sample in enumerate(X[0]):
    if sample.isdigit():
        # Numeric column: copy it straight through.
        X_encoded[:, col] = X[:, col]
    else:
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        X_encoded[:, col] = encoder.fit_transform(X[:, col])
# Last column is the target; everything before it is a feature.
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)
###STEP3###
# Question 3: build a Gaussian Naive Bayes classifier, train it, and report
# the cross-validated F1 score.
# (sklearn.cross_validation was removed in modern scikit-learn; the same
# helpers now live in sklearn.model_selection.)
from sklearn.model_selection import train_test_split, cross_val_score

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)
classifier_gaussiannb = GaussianNB()
classifier_gaussiannb.fit(X_train, y_train)
y_test_pred = classifier_gaussiannb.predict(X_test)
# 5-fold cross-validated, class-weighted F1 score over the full data set.
f1 = cross_val_score(classifier_gaussiannb, X, y, scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")
###STEP4###
# Question 4: encode a single new sample exactly as the training data was
# encoded, then predict its class.
input_data = ['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States']
count = 0
input_data_encoded = [-1] * len(input_data)
for i, item in enumerate(input_data):
    if item.isdigit():
        # Numeric attribute: use the value directly.
        input_data_encoded[i] = int(item)
    else:
        # String attribute: reuse the LabelEncoder fitted for this column.
        # BUG FIX: index [0] extracts the scalar first — calling int() on a
        # 1-element ndarray is deprecated (and now an error) in recent NumPy.
        input_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count += 1
input_data_encoded = np.array(input_data_encoded)
# predict() expects a 2-D array, hence the reshape to a single row.
output_class = classifier_gaussiannb.predict(input_data_encoded.reshape(1, -1))
# The last encoder corresponds to the target column; map back to the label.
print(label_encoder[-1].inverse_transform(output_class)[0])
import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
###STEP1###
# Reload the adult data for the KNN experiment: discard incomplete rows and
# balance the two target classes.
input_file = r'R:\Ricky\程序\数据仓库与挖掘\实验五_分类算法\adult.data.txt'
X = []
y = []
num_lessthan50k = 0
num_morethan50k = 0
num_threshold = 30000  # per-class sample cap
with open(input_file, 'r') as f:
    for line in f.readlines():
        # Rows containing '?' have missing fields — skip them entirely.
        if '?' in line:
            continue
        data = line[:-1].split(', ')
        # Question 1: collect an equal number of '<=50K' and '>50K' rows.
        if data[-1] == '<=50K' and num_lessthan50k < num_threshold:
            X.append(data)
            num_lessthan50k = num_lessthan50k + 1
        if data[-1] == '>50K' and num_morethan50k < num_threshold:
            X.append(data)
            num_morethan50k = num_morethan50k + 1
        # Stop reading once both class quotas are full.
        if num_lessthan50k >= num_threshold and num_morethan50k >= num_threshold:
            break
X = np.array(X)
###STEP2###
# Label-encode every string column so the feature matrix is fully numeric.
label_encoder = []
X_encoded = np.empty(X.shape)
# Inspect only the first row — every row has the same column types, so one
# sample is enough to tell numeric columns from string columns.
for idx, value in enumerate(X[0]):
    if value.isdigit():
        X_encoded[:, idx] = X[:, idx]  # already numeric, copy as-is
    else:
        enc = preprocessing.LabelEncoder()
        label_encoder.append(enc)
        X_encoded[:, idx] = enc.fit_transform(X[:, idx])
# Split off the target: last column is y, the rest are features.
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)
# Build a KNN classifier.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score

# BUG FIX: the original code created a StandardScaler (and the comment said
# the data must be standardized before KNN) but never applied it, so KNN ran
# on raw, unscaled features.  A Pipeline applies the scaler before the
# 5-neighbour classifier and re-fits it inside each CV fold, avoiding leakage.
knn = Pipeline([
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5, n_jobs=1)),
])
# Hold out 25% of the data for a quick train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)
knn.fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
# 5-fold cross-validated, class-weighted F1 score.
f1 = cross_val_score(knn, X, y, scoring='f1_weighted', cv=5)
print("F1 score: " + str(round(100 * f1.mean(), 2)) + "%")
# --- Implement the KNN algorithm by hand (自己实现KNN算法) ---
import numpy as np
import operator
import pandas as pd
# newInput: the new sample to classify (x_test); one sample per call.
# dataset:  training samples (x_train), ndarray, one row per sample.
# labels:   training labels (y_train), shaped like ['A', 'B'], not [['A'], ['B']].
# k:        number of nearest neighbours.
# weight:   voting rule — "uniform" for majority vote, "distance" for
#           inverse-distance-weighted vote.
def KNNClassify(newInput, dataset, labels, k, weight):
    """Classify one sample with a hand-rolled k-nearest-neighbours vote.

    Returns the winning label.  Raises ValueError for an unknown `weight`.
    """
    # BUG FIX: the original printed an error message, broke out of the voting
    # loop, and then indexed an empty dict — crashing with an opaque
    # IndexError.  Validate up front and fail fast instead.
    if weight not in ("uniform", "distance"):
        raise ValueError('weight must be "uniform" (majority vote) or '
                         '"distance" (distance-weighted vote)')
    numSamples = dataset.shape[0]
    """step1: Euclidean distance from the new sample to every training row."""
    diff = np.tile(newInput, (numSamples, 1)) - dataset  # element-wise NumPy ops
    squaredist = diff ** 2
    distance = (squaredist.sum(axis=1)) ** 0.5  # axis=1: sum across each row
    """step2: sort distances ascending and vote among the k nearest points."""
    # argsort returns the indices that would sort `distance` ascending.
    sortedDistance = distance.argsort()
    # Vote tally: label -> count (or accumulated inverse-distance weight).
    classCount = {}
    for i in range(k):
        # Label of the i-th nearest training sample.
        votelabel = labels[sortedDistance[i]]
        if weight == "uniform":
            # Majority vote: each neighbour contributes one count.
            classCount[votelabel] = classCount.get(votelabel, 0) + 1
        else:
            # Distance-weighted vote: closer neighbours count more.
            # NOTE(review): a zero distance (exact duplicate sample) would
            # divide by zero here — unchanged from the original behaviour.
            classCount[votelabel] = classCount.get(votelabel, 0) + (1 / distance[sortedDistance[i]])
    # Highest vote wins; sort the tally descending by vote value.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    if weight == "uniform":
        print("新输入到训练集的最近%d个点的计数为:" % k, "\n", classCount)
    else:
        print("新输入到训练集的最近%d个点的距离加权计数为:" % k, "\n", classCount)
    print("新输入的类别是:", sortedClassCount[0][0])
    return sortedClassCount[0][0]
iris = pd.read_csv("iris.txt")  # load the iris data file
iris.head()
# Column positions 0-3 are the features; column 4 is the species label.
iris_x = iris.iloc[:, [0, 1, 2, 3]]
iris_y = iris.iloc[:, [4]]
# Shuffle the row order, then split into 130 training / 20 test samples.
indices = np.random.permutation(len(iris_x))
iris_x_train = iris_x.iloc[indices[0:130]]
iris_y_train = iris_y.iloc[indices[0:130]]
iris_x_test = iris_x.iloc[indices[130:150]]
iris_y_test = iris_y.iloc[indices[130:150]]
# Convert the DataFrames to NumPy arrays so KNNClassify can do array math.
iris_x_train = np.array(iris_x_train)
iris_y_train = np.array(iris_y_train)
iris_x_test = np.array(iris_x_test)
iris_y_test = np.array(iris_y_test)
# Flatten the labels to shape (130,), the format KNNClassify expects.
iris_y_train.shape = (130,)
# Example usage (kept disabled, as in the original):
# if __name__ == '__main__':
#     test_index = 12
#     predict = KNNClassify(iris_x_test[test_index], iris_x_train, iris_y_train, 20, "distance")
#     print(predict)
#     print("Actual class of the new input:", iris_y_test[test_index])
#     print("\n")
#
#     if predict == iris_y_test[test_index]:
#         print("Prediction correct!")
#     else:
#         print("Prediction wrong!")