1. 支持向量机

1.1 可视化数据集1

  1. data = load_mat_file('./data/ex6data1.mat')
  2. X = data['X']
  3. y = data['y']
  4. plt.ion()
  5. plt.figure()
  6. plot_data(X, y)
  7. plt.pause(1)
  8. plt.close()
  9. def plot_data(x, y):
  10. # 取出那些行中,列号为0的元素等于1的行号。返回的是一个元组,其中存在一个元素,类型为ndarray,使用[0]取出这个这个矩阵(一维向量)。
  11. pos = np.where(y[:, 0] == 1)[0]
  12. neg = np.where(y[:, 0] == 0)[0]
  13. # pos代表那些正样本的行索引,列号为0代表横坐标。1代表那些正样本的纵坐标。
  14. plt.scatter(x[pos, 0], x[pos, 1], marker='+')
  15. plt.scatter(x[neg, 0], x[neg, 1], marker='o')

image.png

1.2 调用sklearn中的svm

  1. C = 1
  2. Classification = SVC(C=C, kernel='linear')
  3. # fit(X, y, sample_weight=None), y : array-like, shape (n_samples,)
  4. # y.raval返回y的一维向量形式,按照每行降维,返回的是一个视图,修改会影响原值。
  5. Classification.fit(X, y.ravel())
  6. plot_pad = 0.5
  7. plot_x_min, plot_x_max = X[:, 0].min() - plot_pad, X[:, 0].max() + plot_pad
  8. plot_y_min, plot_y_max = X[:, 1].min() - plot_pad, X[:, 1].max() + plot_pad
  9. plot_step = 0.01
  10. # 按照0.01的间隔生成min到max之间的一系列网格矩阵。
  11. plot_x, plot_y = np.meshgrid(np.arange(plot_x_min, plot_x_max, plot_step),
  12. np.arange(plot_y_min, plot_y_max, plot_step))
  13. #np.c_,左右连接,要求行数相等
  14. plot_z = Classification.predict(np.c_[plot_x.ravel(), plot_y.ravel()]).reshape(plot_x.shape)
  15. #绘制等高线图,alpha代表透明度,1为完全透明
  16. plt.contourf(plot_x, plot_y, plot_z, cmap="Wistia", alpha=0.2)
  17. plt.pause(1)
  18. plt.close()

image.png

有高斯核的SVM

高斯核实现

image.png
sigma参数决定着相似程度的下降速度。sigma越大,高斯曲线越平缓,相似度下降越慢;反之,相似度下降越快。

  1. def gaussian_kernel(x1, x2, sigma):
  2. #linalg.norm,求向量的2范数
  3. return np.exp(-((linalg.norm(x1 - x2)) ** 2) / (2 * (sigma ** 2)))

可视化数据集2

image.png
RBF指的就是高斯核函数。当C为100时,产生过拟合,其表现为:决策边界强制区分正负样本。

  1. Classification = SVC(C=100, kernel='rbf', gamma=6)
  2. # fit(X, y, sample_weight=None), y : array-like, shape (n_samples,)
  3. Classification.fit(X, y.ravel())
  4. plot_pad = 0.5
  5. plot_x_min, plot_x_max = X[:, 0].min() - plot_pad, X[:, 0].max() + plot_pad
  6. plot_y_min, plot_y_max = X[:, 1].min() - plot_pad, X[:, 1].max() + plot_pad
  7. plot_step = 0.01
  8. plot_x, plot_y = np.meshgrid(np.arange(plot_x_min, plot_x_max, plot_step),
  9. np.arange(plot_y_min, plot_y_max, plot_step))
  10. plot_z = Classification.predict(np.c_[plot_x.ravel(), plot_y.ravel()]).reshape(plot_x.shape)
  11. plt.contourf(plot_x, plot_y, plot_z, cmap="Wistia", alpha=0.2)
  12. plt.axis([-0.1, 1.1, 0.3, 1.05])
  13. plt.pause(1)
  14. plt.close()

image.png

可视化数据集3

image.png

  1. Classification = SVC(C=1, kernel='poly', degree=3, gamma=10)
  2. # fit(X, y, sample_weight=None), y : array-like, shape (n_samples,)
  3. Classification.fit(X, y.ravel())
  4. plot_pad = 0.5
  5. plot_x_min, plot_x_max = X[:, 0].min() - plot_pad, X[:, 0].max() + plot_pad
  6. plot_y_min, plot_y_max = X[:, 1].min() - plot_pad, X[:, 1].max() + plot_pad
  7. plot_step = 0.01
  8. plot_x, plot_y = np.meshgrid(np.arange(plot_x_min, plot_x_max, plot_step),
  9. np.arange(plot_y_min, plot_y_max, plot_step))
  10. plot_z = Classification.predict(np.c_[plot_x.ravel(), plot_y.ravel()]).reshape(plot_x.shape)
  11. plt.contourf(plot_x, plot_y, plot_z, cmap="Wistia", alpha=0.2)
  12. plt.axis([-0.8, 0.4, -0.8, 0.8])
  13. plt.pause(1)
  14. plt.close()

image.png
当C=1时,SVM可以发现异常点。泛化能力较好。

垃圾邮件分类

邮件预处理

  1. def process_email(email_contents):
  2. # Load Vocabulary
  3. vocab_list = get_vocab_list()[:, 1]
  4. word_indices = []
  5. # ========================== Preprocess Email ===========================
  6. # Lower case
  7. email_contents = str(email_contents)
  8. email_contents = email_contents.lower()
  9. # Strip all HTML
  10. # Looks for any expression that starts with < and ends with > and replace
  11. # and does not have any < or > in the tag it with a space
  12. # [^<>]表示除了<>的字符匹配1次或者多次
  13. email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
  14. # Handle Numbers
  15. # Look for one or more characters between 0-9
  16. email_contents = re.sub(r'[0-9]+', 'number', email_contents)
  17. # Handle URLS
  18. # Look for strings starting with http:// or https://
  19. email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
  20. # Handle Email Addresses
  21. # Look for strings with @ in the middle
  22. email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
  23. # Handle $ sign
  24. email_contents = re.sub(r'[$]+', 'dollar', email_contents)
  25. # ========================== Tokenize Email ===========================
  26. # Output the email to screen as well
  27. print('\n==== Processed Email ====\n\n')
  28. # Process file
  29. l_count = 0
  30. # Tokenize and also get rid of any punctuation
  31. partition_text = re.split(r'[ @$/#.-:&*+=\[\]?!(){},\'\">_<;%\n\f]', email_contents)
  32. stemmer = PorterStemmer()
  33. for one_word in partition_text:
  34. if one_word != '':
  35. # Remove any non alphanumeric characters
  36. one_word = re.sub(r'[^a-zA-Z0-9]', '', one_word)
  37. # Stem the word
  38. # (the porterStemmer sometimes has issues, so we use a try catch block)
  39. one_word = stemmer.stem(one_word)
  40. # % Skip the word if it is too short.
  41. if str == '':
  42. continue
  43. temp = np.argwhere(vocab_list == one_word)
  44. if temp.size == 1:
  45. word_indices.append(temp.min())
  46. # % Print to screen, ensuring that the output lines are not too long
  47. if (l_count + len(one_word) + 1) > 78:
  48. print('\n')
  49. l_count = 0
  50. print('%s' % one_word, end=' ')
  51. l_count = l_count + len(one_word) + 1
  52. print('\n')
  53. # Print footer
  54. print('\n\n=========================\n')
  55. return np.array(word_indices)

特征提取

def email_features(word_indices):
    """Build a binary feature column vector from vocabulary word indices.

    Returns an (n, 1) vector with 1 at every index present in
    `word_indices` and 0 everywhere else.
    """
    # Total number of words in the dictionary.
    n = 1899

    feature_vector = np.zeros((n, 1))
    # Mark each word that occurred in the e-mail.
    for idx in word_indices:
        feature_vector[idx] = 1
    return feature_vector

为垃圾邮件分类训练SVM

     # =========== Part 3: Train Linear SVM for Spam Classification ========
    # Train a linear classifier that decides whether an e-mail is
    # Spam or Not-Spam.

    # Load the Spam Email training set.
    data = load_mat_file('../data/spamTrain.mat')
    print('\nTraining Linear SVM (Spam Classification)\n')
    print('(this may take 1 to 2 minutes) ...\n')
    X = data['X']
    y = data['y'].ravel()

    # A linear kernel with light regularisation suffices for the
    # bag-of-words features.
    C = 0.1
    Classification = SVC(C=C, kernel='linear')
    Classification.fit(X, y)
    p = Classification.predict(X)
    train_accuracy = np.mean(p == y) * 100
    print('Training Accuracy: {:.2f}\n'.format(train_accuracy))
    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # =================== Part 4: Test Spam Classification ================
    data = load_mat_file('../data/spamTest.mat')
    Xtest = data['Xtest']
    ytest = data['ytest'].ravel()
    p = Classification.predict(Xtest)
    test_accuracy = np.mean(p == ytest) * 100
    print('Test Accuracy: {:.2f}\n'.format(test_accuracy))
    print('Program paused. Press enter to continue.\n')

测试

    # =================== Part 4: Test Spam Classification ================
    data = load_mat_file('../data/spamTest.mat')
    Xtest = data['Xtest']
    ytest = data['ytest'].ravel()
    p = Classification.predict(Xtest)
    print('Test Accuracy: {:.2f}\n'.format((np.mean((p == ytest)) * 100)))
    print('Program paused. Press enter to continue.\n')
    # pause_func()

    # ================= Part 5: Top Predictors of Spam ====================
    # Sort the linear-model weights in descending order; the largest
    # weights mark the words most indicative of spam.
    coef = Classification.coef_.ravel()
    index_array = np.argsort(coef)[::-1]
    vocab_list = get_vocab_list()[:, 1]
    for i in range(15):
        # BUG FIX: the original formatted Classification.coef_[:, idx]
        # (a 1-element ndarray) with '%f'; numpy >= 1.25 raises TypeError
        # on implicit ndarray-to-scalar conversion.  Index the flattened
        # weight vector to obtain a true scalar.
        print(' %-15s (%f) \n' % (vocab_list[index_array[i]], coef[index_array[i]]))

    print('Program paused. Press enter to continue.\n')