svmMLiA.py

基于最大间隔分割数据

N维的数据需要一个N-1维的超平面来分隔,这个超平面称为分隔超平面
数据离超平面越远,则获得的结果越可信。
支持向量就是离分隔超平面最近的那些点。需要最大化支持向量到分隔面的距离。

寻找最大间隔

SVM主要应用于二类的分类问题,对于多类需要进行代码修改。

SMO高效优化算法

  1. def loadDataSet(fileName):
  2. dataMat = []; labelMat = []
  3. fr = open(fileName)
  4. for line in fr.readlines():
  5. lineArr = line.strip().split('\t')
  6. dataMat.append([float(lineArr[0]), float(lineArr[1])])
  7. labelMat.append(float(lineArr[2]))
  8. return dataMat,labelMat
  9. def selectJrand(i,m):
  10. j=i #we want to select any J not equal to i
  11. while (j==i):
  12. j = int(random.uniform(0,m))
  13. return j
  14. def clipAlpha(aj,H,L):
  15. if aj > H:
  16. aj = H
  17. if L > aj:
  18. aj = L
  19. return aj

image.png

def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """Simplified SMO: optimize the SVM dual by adjusting alpha pairs.

    Args:
        dataMatIn:   training samples (list of feature lists).
        classLabels: class labels (+1 / -1), one per sample.
        C:           soft-margin penalty (box constraint 0 <= alpha <= C).
        toler:       KKT-violation tolerance.
        maxIter:     number of consecutive full passes with no alpha
                     change required before stopping.

    Returns:
        b:      scalar bias of the separating hyperplane.
        alphas: m x 1 matrix of Lagrange multipliers.
    """
    dataMatrix = mat(dataMatIn); labelMat = mat(classLabels).transpose()
    b = 0; m,n = shape(dataMatrix)
    alphas = mat(zeros((m,1)))
    iter = 0  # counts consecutive passes without any alpha update
    while (iter < maxIter):
        alphaPairsChanged = 0
        for i in range(m):
            # f(x_i): current decision value for sample i (linear kernel)
            fXi = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[i,:].T)) + b
            Ei = fXi - float(labelMat[i])#if checks if an example violates KKT conditions
            if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0)):
                # pick a random second alpha j != i
                j = selectJrand(i,m)
                fXj = float(multiply(alphas,labelMat).T*(dataMatrix*dataMatrix[j,:].T)) + b
                Ej = fXj - float(labelMat[j])
                alphaIold = alphas[i].copy(); alphaJold = alphas[j].copy();
                # bounds L, H keep alpha_j inside the box while preserving
                # the equality constraint sum(alpha_k * y_k) = 0
                if (labelMat[i] != labelMat[j]):
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L==H: print("L==H"); continue
                # eta = 2*K12 - K11 - K22: must be negative for a usable step
                eta = 2.0 * dataMatrix[i,:]*dataMatrix[j,:].T - dataMatrix[i,:]*dataMatrix[i,:].T - dataMatrix[j,:]*dataMatrix[j,:].T
                if eta >= 0: print("eta>=0"); continue
                alphas[j] -= labelMat[j]*(Ei - Ej)/eta
                alphas[j] = clipAlpha(alphas[j],H,L)
                if (abs(alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); continue
                alphas[i] += labelMat[j]*labelMat[i]*(alphaJold - alphas[j])#update i by the same amount as j
                #the update is in the oppostie direction
                # recompute the bias; pick b from whichever alpha stays
                # strictly inside (0, C), otherwise average the candidates
                b1 = b - Ei- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[i,:]*dataMatrix[j,:].T
                b2 = b - Ej- labelMat[i]*(alphas[i]-alphaIold)*dataMatrix[i,:]*dataMatrix[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMatrix[j,:]*dataMatrix[j,:].T
                if (0 < alphas[i]) and (C > alphas[i]): b = b1
                elif (0 < alphas[j]) and (C > alphas[j]): b = b2
                else: b = (b1 + b2)/2.0
                alphaPairsChanged += 1
                print("iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged))
        if (alphaPairsChanged == 0): iter += 1
        else: iter = 0
        print("iteration number: %d" % iter)
    return b,alphas

image.png
image.png
image.png
image.png

利用完整Platt SMO算法加速优化

class optStruct:
    """Container for all state shared by the Platt SMO routines."""
    def __init__(self,dataMatIn, classLabels, C, toler, kTup): # Initialize the structure with the parameters
        self.X = dataMatIn            # m x n matrix of training samples
        self.labelMat = classLabels   # m x 1 matrix of labels (+1/-1)
        self.C = C                    # soft-margin penalty bound
        self.tol = toler              # KKT-violation tolerance
        self.m = shape(dataMatIn)[0]  # number of samples
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2))) #first column is valid flag
        # precomputed kernel matrix: K[j, i] = k(x_j, x_i)
        self.K = mat(zeros((self.m,self.m)))
        for i in range(self.m):
            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)
  14. def calcEk(oS, k):
  15. fXk = float(multiply(oS.alphas,oS.labelMat).T*oS.K[:,k] + oS.b)
  16. Ek = fXk - float(oS.labelMat[k])
  17. return Ek
  18. def selectJ(i, oS, Ei): #this is the second choice -heurstic, and calcs Ej
  19. maxK = -1; maxDeltaE = 0; Ej = 0
  20. oS.eCache[i] = [1,Ei] #set valid #choose the alpha that gives the maximum delta E
  21. validEcacheList = nonzero(oS.eCache[:,0].A)[0]
  22. if (len(validEcacheList)) > 1:
  23. for k in validEcacheList: #loop through valid Ecache values and find the one that maximizes delta E
  24. if k == i: continue #don't calc for i, waste of time
  25. Ek = calcEk(oS, k)
  26. deltaE = abs(Ei - Ek)
  27. if (deltaE > maxDeltaE):
  28. maxK = k; maxDeltaE = deltaE; Ej = Ek
  29. return maxK, Ej
  30. else: #in this case (first time around) we don't have any valid eCache values
  31. j = selectJrand(i, oS.m)
  32. Ej = calcEk(oS, j)
  33. return j, Ej
  34. def updateEk(oS, k):#after any alpha has changed update the new value in the cache
  35. Ek = calcEk(oS, k)
  36. oS.eCache[k] = [1,Ek]
def innerL(i, oS):
    """Try to optimize the alpha pair (i, j) for sample i (kernel SMO).

    Args:
        i:  index of the first alpha.
        oS: optStruct holding alphas, labels, kernel matrix, error cache.

    Returns:
        1 if an alpha pair was changed, 0 otherwise.
    """
    Ei = calcEk(oS, i)
    # proceed only if alpha_i violates the KKT conditions by more than tol
    if ((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0)):
        j,Ej = selectJ(i, oS, Ei) #this has been changed from selectJrand
        alphaIold = oS.alphas[i].copy(); alphaJold = oS.alphas[j].copy();
        # bounds L, H keep alpha_j in the box while preserving
        # the equality constraint sum(alpha_k * y_k) = 0
        if (oS.labelMat[i] != oS.labelMat[j]):
            L = max(0, oS.alphas[j] - oS.alphas[i])
            H = min(oS.C, oS.C + oS.alphas[j] - oS.alphas[i])
        else:
            L = max(0, oS.alphas[j] + oS.alphas[i] - oS.C)
            H = min(oS.C, oS.alphas[j] + oS.alphas[i])
        if L==H: print("L==H"); return 0
        eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j] #changed for kernel
        if eta >= 0: print("eta>=0"); return 0
        oS.alphas[j] -= oS.labelMat[j]*(Ei - Ej)/eta
        oS.alphas[j] = clipAlpha(oS.alphas[j],H,L)
        updateEk(oS, j) #added this for the Ecache
        # NOTE(review): alpha_j and its cache entry were already updated
        # above even if we bail out on the next line — this matches the
        # book's code, but looks inconsistent; confirm intended.
        if (abs(oS.alphas[j] - alphaJold) < 0.00001): print("j not moving enough"); return 0
        oS.alphas[i] += oS.labelMat[j]*oS.labelMat[i]*(alphaJold - oS.alphas[j])#update i by the same amount as j
        updateEk(oS, i) #added this for the Ecache #the update is in the oppostie direction
        # pick b from whichever alpha stays strictly inside (0, C);
        # otherwise average the two candidates
        b1 = oS.b - Ei- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[i,j]
        b2 = oS.b - Ej- oS.labelMat[i]*(oS.alphas[i]-alphaIold)*oS.K[i,j]- oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
        if (0 < oS.alphas[i]) and (oS.C > oS.alphas[i]): oS.b = b1
        elif (0 < oS.alphas[j]) and (oS.C > oS.alphas[j]): oS.b = b2
        else: oS.b = (b1 + b2)/2.0
        return 1
    else: return 0
  1. def smoP(dataMatIn, classLabels, C, toler, maxIter,kTup=('lin', 0)): #full Platt SMO
  2. oS = optStruct(mat(dataMatIn),mat(classLabels).transpose(),C,toler, kTup)
  3. iter = 0
  4. entireSet = True; alphaPairsChanged = 0
  5. while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)):
  6. alphaPairsChanged = 0
  7. if entireSet: #go over all
  8. for i in range(oS.m):
  9. alphaPairsChanged += innerL(i,oS)
  10. print("fullSet, iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged))
  11. iter += 1
  12. else:#go over non-bound (railed) alphas
  13. nonBoundIs = nonzero((oS.alphas.A > 0) * (oS.alphas.A < C))[0]
  14. for i in nonBoundIs:
  15. alphaPairsChanged += innerL(i,oS)
  16. print("non-bound, iter: %d i:%d, pairs changed %d" % (iter,i,alphaPairsChanged))
  17. iter += 1
  18. if entireSet: entireSet = False #toggle entire set loop
  19. elif (alphaPairsChanged == 0): entireSet = True
  20. print("iteration number: %d" % iter)
  21. return oS.b,oS.alphas

image.png
image.png
image.png

在复杂数据上应用核函数

核函数(kernel)
实现:将数据进行处理,从一个特征空间映射到另一个特征空间。
从很难处理的形式转为比较容易处理的形式。

径向基核函数,主流的核函数。

  1. def kernelTrans(X, A, kTup): #calc the kernel or transform data to a higher dimensional space
  2. m,n = shape(X)
  3. K = mat(zeros((m,1)))
  4. if kTup[0]=='lin': K = X * A.T #linear kernel
  5. elif kTup[0]=='rbf':
  6. for j in range(m):
  7. deltaRow = X[j,:] - A
  8. K[j] = deltaRow*deltaRow.T
  9. K = exp(K/(-1*kTup[1]**2)) #divide in NumPy is element-wise not matrix like Matlab
  10. else: raise NameError('Houston We Have a Problem -- \
  11. That Kernel is not recognized')
  12. return K
class optStruct:
    """Container for all state shared by the Platt SMO routines
    (kernelized version; duplicates the earlier definition)."""
    def __init__(self,dataMatIn, classLabels, C, toler, kTup): # Initialize the structure with the parameters
        self.X = dataMatIn            # m x n matrix of training samples
        self.labelMat = classLabels   # m x 1 matrix of labels (+1/-1)
        self.C = C                    # soft-margin penalty bound
        self.tol = toler              # KKT-violation tolerance
        self.m = shape(dataMatIn)[0]  # number of samples
        self.alphas = mat(zeros((self.m,1)))
        self.b = 0
        self.eCache = mat(zeros((self.m,2))) #first column is valid flag
        # precomputed kernel matrix: K[j, i] = k(x_j, x_i)
        self.K = mat(zeros((self.m,self.m)))
        for i in range(self.m):
            self.K[:,i] = kernelTrans(self.X, self.X[i,:], kTup)

在numpy矩阵中,除法运算意味着对所有矩阵元素进行运算,而matlab中表示对矩阵求逆。

在测试中使用核函数

  1. def testRbf(k1=1.3):
  2. dataArr,labelArr = loadDataSet('testSetRBF.txt')
  3. b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, ('rbf', k1)) #C=200 important
  4. datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
  5. svInd=nonzero(alphas.A>0)[0]
  6. sVs=datMat[svInd] #get matrix of only support vectors
  7. labelSV = labelMat[svInd];
  8. print("there are %d Support Vectors" % shape(sVs)[0])
  9. m,n = shape(datMat)
  10. errorCount = 0
  11. for i in range(m):
  12. kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
  13. predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
  14. if sign(predict)!=sign(labelArr[i]): errorCount += 1
  15. print("the training error rate is: %f" % (float(errorCount)/m))
  16. dataArr,labelArr = loadDataSet('testSetRBF2.txt')
  17. errorCount = 0
  18. datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
  19. m,n = shape(datMat)
  20. for i in range(m):
  21. kernelEval = kernelTrans(sVs,datMat[i,:],('rbf', k1))
  22. predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
  23. if sign(predict)!=sign(labelArr[i]): errorCount += 1
  24. print("the test error rate is: %f" % (float(errorCount)/m))

image.png
image.png

示例:手写问题回顾

  1. def loadImages(dirName):
  2. from os import listdir
  3. hwLabels = []
  4. trainingFileList = listdir(dirName) #load the training set
  5. m = len(trainingFileList)
  6. trainingMat = zeros((m,1024))
  7. for i in range(m):
  8. fileNameStr = trainingFileList[i]
  9. fileStr = fileNameStr.split('.')[0] #take off .txt
  10. classNumStr = int(fileStr.split('_')[0])
  11. if classNumStr == 9: hwLabels.append(-1)
  12. else: hwLabels.append(1)
  13. trainingMat[i,:] = img2vector('%s/%s' % (dirName, fileNameStr))
  14. return trainingMat, hwLabels
  15. def testDigits(kTup=('rbf', 10)):
  16. dataArr,labelArr = loadImages('trainingDigits')
  17. b,alphas = smoP(dataArr, labelArr, 200, 0.0001, 10000, kTup)
  18. datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
  19. svInd=nonzero(alphas.A>0)[0]
  20. sVs=datMat[svInd]
  21. labelSV = labelMat[svInd];
  22. print("there are %d Support Vectors" % shape(sVs)[0])
  23. m,n = shape(datMat)
  24. errorCount = 0
  25. for i in range(m):
  26. kernelEval = kernelTrans(sVs,datMat[i,:],kTup)
  27. predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
  28. if sign(predict)!=sign(labelArr[i]): errorCount += 1
  29. print("the training error rate is: %f" % (float(errorCount)/m))
  30. dataArr,labelArr = loadImages('testDigits')
  31. errorCount = 0
  32. datMat=mat(dataArr); labelMat = mat(labelArr).transpose()
  33. m,n = shape(datMat)
  34. for i in range(m):
  35. kernelEval = kernelTrans(sVs,datMat[i,:],kTup)
  36. predict=kernelEval.T * multiply(labelSV,alphas[svInd]) + b
  37. if sign(predict)!=sign(labelArr[i]): errorCount += 1
  38. print("the test error rate is: %f" % (float(errorCount)/m))

image.png