kMeans.py
无监督学习:聚类

K-均值聚类算法

  1. from numpy import *
  2. def loadDataSet(fileName): #general function to parse tab -delimited floats
  3. dataMat = [] #assume last column is target value
  4. fr = open(fileName)
  5. for line in fr.readlines():
  6. curLine = line.strip().split('\t')
  7. fltLine = list(map(float,curLine)) #map all elements to float()
  8. dataMat.append(fltLine)
  9. return dataMat
  10. def distEclud(vecA, vecB):
  11. return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)
  12. def randCent(dataSet, k):
  13. n = shape(dataSet)[1]
  14. centroids = mat(zeros((k,n)))#create centroid mat
  15. for j in range(n):#create random cluster centers, within bounds of each dimension
  16. minJ = min(dataSet[:,j])
  17. rangeJ = float(max(dataSet[:,j]) - minJ)
  18. centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
  19. return centroids

image.png
image.png

  1. def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
  2. m = shape(dataSet)[0]
  3. clusterAssment = mat(zeros((m,2)))#create mat to assign data points
  4. #to a centroid, also holds SE of each point
  5. centroids = createCent(dataSet, k)
  6. clusterChanged = True
  7. while clusterChanged:
  8. clusterChanged = False
  9. for i in range(m):#for each data point assign it to the closest centroid
  10. minDist = inf; minIndex = -1
  11. for j in range(k):
  12. distJI = distMeas(centroids[j,:],dataSet[i,:])
  13. if distJI < minDist:
  14. minDist = distJI; minIndex = j
  15. if clusterAssment[i,0] != minIndex: clusterChanged = True
  16. clusterAssment[i,:] = minIndex,minDist**2
  17. print(centroids)
  18. for cent in range(k):#recalculate centroids
  19. ptsInClust = dataSet[nonzero(clusterAssment[:,0].A==cent)[0]]#get all the point in this cluster
  20. centroids[cent,:] = mean(ptsInClust, axis=0) #assign centroid to mean
  21. return centroids, clusterAssment

后处理提高聚类性能

二分K-均值算法

  1. def biKmeans(dataSet, k, distMeas=distEclud):
  2. m = shape(dataSet)[0]
  3. clusterAssment = mat(zeros((m,2)))
  4. centroid0 = mean(dataSet, axis=0).tolist()[0]
  5. centList =[centroid0] #create a list with one centroid
  6. for j in range(m):#calc initial Error
  7. clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
  8. while (len(centList) < k):
  9. lowestSSE = inf
  10. for i in range(len(centList)):
  11. ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]#get the data points currently in cluster i
  12. centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
  13. sseSplit = sum(splitClustAss[:,1])#compare the SSE to the currrent minimum
  14. sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
  15. print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
  16. if (sseSplit + sseNotSplit) < lowestSSE:
  17. bestCentToSplit = i
  18. bestNewCents = centroidMat
  19. bestClustAss = splitClustAss.copy()
  20. lowestSSE = sseSplit + sseNotSplit
  21. bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #change 1 to 3,4, or whatever
  22. bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
  23. print('the bestCentToSplit is: ',bestCentToSplit)
  24. print('the len of bestClustAss is: ', len(bestClustAss))
  25. centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]#replace a centroid with two best centroids
  26. centList.append(bestNewCents[1,:].tolist()[0])
  27. clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss#reassign new clusters, and SSE
  28. return mat(centList), clusterAssment

image.png
image.png

示例:对地图上的点进行聚类