logRegres.py

  1. def loadDataSet():
  2. dataMat = []; labelMat = []
  3. fr = open('testSet.txt')
  4. for line in fr.readlines():
  5. lineArr = line.strip().split()
  6. dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
  7. labelMat.append(int(lineArr[2]))
  8. return dataMat, labelMat
  9. def sigmoid(inX):
  10. return 1.0 / (1 + np.exp(-inX))
  11. def gradAscent(dataMatIn, classLabels):
  12. dataMatrix = np.mat(dataMatIn) #convert to NumPy matrix
  13. labelMat = np.mat(classLabels).transpose() #convert to NumPy matrix
  14. m, n = np.shape(dataMatrix)
  15. alpha = 0.001
  16. maxCycles = 500
  17. weights = np.ones((n, 1))
  18. for k in range(maxCycles): #heavy on matrix operations
  19. h = sigmoid(dataMatrix*weights) #matrix mult
  20. error = (labelMat - h) #vector subtraction
  21. weights = weights + alpha * dataMatrix.transpose()* error #matrix mult
  22. return weights

image.png
(上图为通过梯度上升算法求得的参数 weights,即逻辑回归的回归系数)

  1. def plotBestFit(weights):
  2. import matplotlib.pyplot as plt
  3. dataMat, labelMat = loadDataSet()
  4. dataArr = np.array(dataMat)
  5. n = np.shape(dataArr)[0]
  6. xcord1 = []; ycord1 = []
  7. xcord2 = []; ycord2 = []
  8. for i in range(n):
  9. if int(labelMat[i]) == 1:
  10. xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
  11. else:
  12. xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
  13. fig = plt.figure()
  14. ax = fig.add_subplot(111)
  15. ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
  16. ax.scatter(xcord2, ycord2, s=30, c='green')
  17. x = np.arange(-3.0, 3.0, 0.1)
  18. y = (-weights[0]-weights[1]*x)/weights[2]
  19. ax.plot(x, y)
  20. plt.xlabel('X1'); plt.ylabel('X2')
  21. plt.show()

image.png

image.png

随机梯度上升算法:每次只用一个样本点来更新回归系数,而不必在每次更新时遍历整个数据集,从而降低每次迭代的计算量

  1. def stocGradAscent0(dataMatrix, classLabels):
  2. m, n = np.shape(dataMatrix)
  3. alpha = 0.01
  4. weights = np.ones(n) #initialize to all ones
  5. for i in range(m):
  6. h = sigmoid(sum(dataMatrix[i]*weights))
  7. error = classLabels[i] - h
  8. weights = weights + alpha * error * dataMatrix[i]
  9. return weights

image.png

image.png

改进随机梯度上升算法:

  1. def stocGradAscent1(dataMatrix, classLabels, numIter=150):
  2. m, n = np.shape(dataMatrix)
  3. weights = np.ones(n) #initialize to all ones
  4. for j in range(numIter):
  5. dataIndex = list(range(m))
  6. for i in range(m):
  7. alpha = 4/(1.0+j+i)+0.0001 #apha decreases with iteration, does not
  8. randIndex = int(np.random.uniform(0, len(dataIndex)))#go to 0 because of the constant
  9. h = sigmoid(sum(dataMatrix[randIndex]*weights))
  10. error = classLabels[randIndex] - h
  11. weights = weights + alpha * error * dataMatrix[randIndex]
  12. del(dataIndex[randIndex])
  13. return weights

image.png
image.png

示例:从疝气病预测病马死亡率

  1. def classifyVector(inX, weights):
  2. prob = sigmoid(sum(inX*weights))
  3. if prob > 0.5: return 1.0
  4. else: return 0.0
  5. def colicTest():
  6. frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
  7. trainingSet = []; trainingLabels = []
  8. for line in frTrain.readlines():
  9. currLine = line.strip().split('\t')
  10. lineArr = []
  11. for i in range(21):
  12. lineArr.append(float(currLine[i]))
  13. trainingSet.append(lineArr)
  14. trainingLabels.append(float(currLine[21]))
  15. trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 1000)
  16. errorCount = 0; numTestVec = 0.0
  17. for line in frTest.readlines():
  18. numTestVec += 1.0
  19. currLine = line.strip().split('\t')
  20. lineArr = []
  21. for i in range(21):
  22. lineArr.append(float(currLine[i]))
  23. if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
  24. errorCount += 1
  25. errorRate = (float(errorCount)/numTestVec)
  26. print("the error rate of this test is: %f" % errorRate)
  27. return errorRate
  28. def multiTest():
  29. numTests = 10; errorSum = 0.0
  30. for k in range(numTests):
  31. errorSum += colicTest()
  32. print("after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests)))

image.png