1 引言

目前选取3个特征:

  • 原本 text部分的所有字符
  • 句子长度
  • 每个句子的前10个高频字符(去除标点符号的)

    2 步骤

    2.1 导入工具包

```python
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
```

  1. <a name="ecLvf"></a>
  2. ## 2.1 统计top10高词频作为一个特征
  3. ```python
  4. # 统计句子中的前10个高频字符,添加到train_data组成新的特征
  5. def select_top10_highfrequency_word(data):
  6. text_Word_frequency = []
  7. from collections import Counter
  8. datalen = len(data)
  9. for i in range(0,datalen):
  10. one_lines = ''.join(list(data['text'][i][1:-1]))
  11. all_word_count = Counter(one_lines.split(" "))
  12. all_word_count = sorted(all_word_count.items(), key=lambda d:d[1], reverse = True)
  13. # 删除693和328两个字符,根据数据分析阶段的词频统计,这两个字符可能是标点符号
  14. dict_word_count = dict(all_word_count)
  15. if dict_word_count.get('693') !=None:
  16. del dict_word_count['693']
  17. if dict_word_count.get('328') !=None:
  18. del dict_word_count['328']
  19. # 取前10个高频词
  20. word_count ={}
  21. if len(dict_word_count) > 9:
  22. for i,(k,v) in enumerate(dict_word_count.items()):
  23. word_count[k] = v
  24. if i ==9:
  25. break
  26. else:
  27. word_count =dict_word_count
  28. string_top10_high_frequency_word = list(word_count.keys())
  29. if '' in string_top10_high_frequency_word:
  30. string_top10_high_frequency_word.remove('')
  31. list_to_str = " ".join(string_top10_high_frequency_word)
  32. text_Word_frequency.append(list_to_str)
  33. return text_Word_frequency

2.2 统计句子长度作为第二个特征

  1. # 统计每行的句子长度并添加到train_data中组成新的一个特征
  2. def count_text_len(data):
  3. text_len =[]
  4. datalen = len(data)
  5. for i in range(0,datalen):
  6. one_lines = ''.join(list(data['text'][i][1:]))
  7. len_text = one_lines.split(" ")
  8. text_len.append(len(len_text))
  9. return text_len

2.3 原始数据text作为第三个特征

  1. # 去除text中的|字符和首尾空格
  2. def modify_text(data):
  3. new_text =[]
  4. datalen = len(data)
  5. for i in range(0,datalen):
  6. one_lines = ''.join(list(data['text'][i][1:-1])).strip()
  7. new_text.append(one_lines)
  8. return new_text

2.4 label缺失值处理

  1. # label缺失值处理,为空的label设置为字符串17,代表无异常
  2. def label_code(data):
  3. new_code_label =[]
  4. datalen = len(data)
  5. for i in range(0,datalen):
  6. one_lines = ''.join(list(data['label'][i][1:])).strip()
  7. if one_lines =='':#空label
  8. new_code_label.append('17')
  9. else:
  10. new_code_label.append(one_lines)
  11. return new_code_label
  12. # print()

2.5 TF-IDF算法提取text和highfrequency两列数据的特征

  1. def feature_extraction_tfidf_vectorizer(data,columns_list):
  2. TfidfVec = TfidfVectorizer(ngram_range=(1,1),max_features=100)
  3. for i,col in enumerate(columns_list):
  4. TfidfVec.fit(data[col])
  5. data_temp = TfidfVec.transform(data[col])
  6. if i==0 :
  7. data_cat = data_temp
  8. else:
  9. data_cat = sparse.hstack((data_cat,data_temp))
  10. df_tfidf = pd.DataFrame(data_cat.toarray())
  11. return df_tfidf

3 Main函数

  1. if __name__=="__main__":
  2. test_data = pd.read_csv('./data/track1_round1_testA_20210222.csv',header=None, names=['id', 'text'])#sep='\t',
  3. train_data = pd.read_csv('./data/track1_round1_train_20210222.csv',header=None, names=['id', 'text','label'])
  4. # 训练集的sample集和label集,测试集的sample集
  5. all_train_data_sample = pd.DataFrame(columns=['textlen'])
  6. all_train_data_label = pd.DataFrame(columns=['label'])
  7. all_test_data_sample = pd.DataFrame(columns=['textlen'])
  8. # 生成label集合,一个标签占一个表格,总共有18列,最后一列表示无异常
  9. train_data_label = pd.DataFrame(columns=['label'])
  10. train_data_label['label'] = label_code(train_data)
  11. mlb = MultiLabelBinarizer(classes=['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17'])#17表示无异常
  12. Ylist = []
  13. all_train_data_label = pd.DataFrame(columns=['f0','f1', 'f2', 'f3','f4','f5','f6','f7', 'f8', 'f9','f10','f11','f12','f13', 'f14', 'f15','f16','f17'],index=[])
  14. indexsize = 0
  15. for i in range(len(train_data_label)):
  16. templist = train_data_label['label'][i].split()
  17. # 转二值化编码
  18. label_code_list = list(mlb.fit_transform(templist)[0])
  19. print(indexsize)
  20. # 0,1表示,一个数字占一个表格
  21. # str_label_list = list(map(lambda x:str(x).encode('UTF-8'), label_code_list))
  22. byte_label = list(map(int, label_code_list))
  23. all_train_data_label.loc[indexsize] =byte_label
  24. indexsize = indexsize + 1
  25. train_data_label.to_csv('./data/train_data_label.csv',index=False)
  26. print()
  27. # 生成三个特征的sample训练集
  28. train_data_sample = pd.DataFrame(columns=['text', 'textlen', 'highfrequency'])
  29. # 训练集中统计句子中的前10个高频字符,添加到train_data组成新的特征
  30. train_data_sample['highfrequency'] = select_top10_highfrequency_word(train_data)
  31. # 训练集中统计每行的句子长度并添加到train_data中组成新的一个特征
  32. train_data_sample['textlen'] = count_text_len(train_data)
  33. # 去除text中的|字符和首尾空格
  34. train_data_sample['text'] = modify_text(train_data)
  35. # train_data_sample.to_csv('./data/train_data_sample.csv',index=False)
  36. df_tfidf = feature_extraction_tfidf_vectorizer(train_data_sample,['text','highfrequency'])
  37. df_tfidf.columns = ['tfidf_'+str(i) for i in df_tfidf.columns]
  38. all_train_data_sample['textlen'] = train_data_sample['textlen']
  39. all_train_data_sample = pd.concat([all_train_data_sample,df_tfidf],axis = 1)
  40. all_train_data_sample.to_csv('./data/all_train_data_sample.csv')
  41. # 生成三个特征的测试集
  42. test_data_sample = pd.DataFrame(columns=['text', 'textlen', 'highfrequency'])
  43. # 测试集中统计句子中的前10个高频字符,添加到test_data组成新的特征
  44. test_data_sample['highfrequency'] = select_top10_highfrequency_word(test_data)
  45. # 测试集中统计每行的句子长度并添加到test_data中组成新的一个特征
  46. test_data_sample['textlen'] = count_text_len(test_data)
  47. # 去除text中的|字符和首尾空格
  48. test_data_sample['text'] = modify_text(test_data)
  49. df_tfidf = feature_extraction_tfidf_vectorizer(test_data_sample,['text','highfrequency'])
  50. df_tfidf.columns = ['tfidf_'+str(i) for i in df_tfidf.columns]
  51. all_test_data_sample['textlen'] = test_data_sample['textlen']
  52. all_test_data_sample = pd.concat([all_test_data_sample,df_tfidf],axis = 1)
  53. all_test_data_sample.to_csv('./data/all_test_data_sample.csv')
  54. print()

4 数据格式展示

4.1 训练样本集

  1. train_data_sample = pd.read_csv('./data/all_train_data_sample.csv')
  2. print(train_data_sample.info())

RangeIndex: 10000 entries, 0 to 9999

Data columns (total 82 columns):

Column Non-Null Count Dtype


0 Unnamed: 0 10000 non-null int64

1 textlen 10000 non-null int64

2 tfidf_0 10000 non-null float64

3 tfidf_1 10000 non-null float64

4 tfidf_2 10000 non-null float64

5 tfidf_3 10000 non-null float64

6 tfidf_4 10000 non-null float64

7 tfidf_5 10000 non-null float64

8 tfidf_6 10000 non-null float64

9 tfidf_7 10000 non-null float64

10 tfidf_8 10000 non-null float64

11 tfidf_9 10000 non-null float64

12 tfidf_10 10000 non-null float64

13 tfidf_11 10000 non-null float64

14 tfidf_12 10000 non-null float64

15 tfidf_13 10000 non-null float64

16 tfidf_14 10000 non-null float64

17 tfidf_15 10000 non-null float64

18 tfidf_16 10000 non-null float64

19 tfidf_17 10000 non-null float64

20 tfidf_18 10000 non-null float64

21 tfidf_19 10000 non-null float64

22 tfidf_20 10000 non-null float64

23 tfidf_21 10000 non-null float64

24 tfidf_22 10000 non-null float64

25 tfidf_23 10000 non-null float64

26 tfidf_24 10000 non-null float64

27 tfidf_25 10000 non-null float64

28 tfidf_26 10000 non-null float64

29 tfidf_27 10000 non-null float64

30 tfidf_28 10000 non-null float64

31 tfidf_29 10000 non-null float64

32 tfidf_30 10000 non-null float64

33 tfidf_31 10000 non-null float64

34 tfidf_32 10000 non-null float64

35 tfidf_33 10000 non-null float64

36 tfidf_34 10000 non-null float64

37 tfidf_35 10000 non-null float64

38 tfidf_36 10000 non-null float64

39 tfidf_37 10000 non-null float64

40 tfidf_38 10000 non-null float64

41 tfidf_39 10000 non-null float64

42 tfidf_40 10000 non-null float64

43 tfidf_41 10000 non-null float64

44 tfidf_42 10000 non-null float64

45 tfidf_43 10000 non-null float64

46 tfidf_44 10000 non-null float64

47 tfidf_45 10000 non-null float64

48 tfidf_46 10000 non-null float64

49 tfidf_47 10000 non-null float64

50 tfidf_48 10000 non-null float64

51 tfidf_49 10000 non-null float64

52 tfidf_50 10000 non-null float64

53 tfidf_51 10000 non-null float64

54 tfidf_52 10000 non-null float64

55 tfidf_53 10000 non-null float64

56 tfidf_54 10000 non-null float64

57 tfidf_55 10000 non-null float64

58 tfidf_56 10000 non-null float64

59 tfidf_57 10000 non-null float64

60 tfidf_58 10000 non-null float64

61 tfidf_59 10000 non-null float64

62 tfidf_60 10000 non-null float64

63 tfidf_61 10000 non-null float64

64 tfidf_62 10000 non-null float64

65 tfidf_63 10000 non-null float64

66 tfidf_64 10000 non-null float64

67 tfidf_65 10000 non-null float64

68 tfidf_66 10000 non-null float64

69 tfidf_67 10000 non-null float64

70 tfidf_68 10000 non-null float64

71 tfidf_69 10000 non-null float64

72 tfidf_70 10000 non-null float64

73 tfidf_71 10000 non-null float64

74 tfidf_72 10000 non-null float64

75 tfidf_73 10000 non-null float64

76 tfidf_74 10000 non-null float64

77 tfidf_75 10000 non-null float64

78 tfidf_76 10000 non-null float64

79 tfidf_77 10000 non-null float64

80 tfidf_78 10000 non-null float64

81 tfidf_79 10000 non-null float64

dtypes: float64(80), int64(2)

memory usage: 6.3 MB

image.png

4.2 训练label集

  1. train_data_label= pd.read_csv('./data/all_train_data_label.csv')
  2. print(train_data_label.info())

RangeIndex: 10000 entries, 0 to 9999

Data columns (total 18 columns):

Column Non-Null Count Dtype


0 f0 10000 non-null int64

1 f1 10000 non-null int64

2 f2 10000 non-null int64

3 f3 10000 non-null int64

4 f4 10000 non-null int64

5 f5 10000 non-null int64

6 f6 10000 non-null int64

7 f7 10000 non-null int64

8 f8 10000 non-null int64

9 f9 10000 non-null int64

10 f10 10000 non-null int64

11 f11 10000 non-null int64

12 f12 10000 non-null int64

13 f13 10000 non-null int64

14 f14 10000 non-null int64

15 f15 10000 non-null int64

16 f16 10000 non-null int64

17 f17 10000 non-null int64

dtypes: int64(18)

memory usage: 1.4 MB

image.png

4.3 测试样本集

  1. test_data_sample = pd.read_csv('./data/all_test_data_sample.csv')
  2. print(test_data_sample.info())

RangeIndex: 3000 entries, 0 to 2999

Data columns (total 82 columns):

Column Non-Null Count Dtype


0 Unnamed: 0 3000 non-null int64

1 textlen 3000 non-null int64

2 tfidf_0 3000 non-null float64

3 tfidf_1 3000 non-null float64

4 tfidf_2 3000 non-null float64

5 tfidf_3 3000 non-null float64

6 tfidf_4 3000 non-null float64

7 tfidf_5 3000 non-null float64

8 tfidf_6 3000 non-null float64

9 tfidf_7 3000 non-null float64

10 tfidf_8 3000 non-null float64

11 tfidf_9 3000 non-null float64

12 tfidf_10 3000 non-null float64

13 tfidf_11 3000 non-null float64

14 tfidf_12 3000 non-null float64

15 tfidf_13 3000 non-null float64

16 tfidf_14 3000 non-null float64

17 tfidf_15 3000 non-null float64

18 tfidf_16 3000 non-null float64

19 tfidf_17 3000 non-null float64

20 tfidf_18 3000 non-null float64

21 tfidf_19 3000 non-null float64

22 tfidf_20 3000 non-null float64

23 tfidf_21 3000 non-null float64

24 tfidf_22 3000 non-null float64

25 tfidf_23 3000 non-null float64

26 tfidf_24 3000 non-null float64

27 tfidf_25 3000 non-null float64

28 tfidf_26 3000 non-null float64

29 tfidf_27 3000 non-null float64

30 tfidf_28 3000 non-null float64

31 tfidf_29 3000 non-null float64

32 tfidf_30 3000 non-null float64

33 tfidf_31 3000 non-null float64

34 tfidf_32 3000 non-null float64

35 tfidf_33 3000 non-null float64

36 tfidf_34 3000 non-null float64

37 tfidf_35 3000 non-null float64

38 tfidf_36 3000 non-null float64

39 tfidf_37 3000 non-null float64

40 tfidf_38 3000 non-null float64

41 tfidf_39 3000 non-null float64

42 tfidf_40 3000 non-null float64

43 tfidf_41 3000 non-null float64

44 tfidf_42 3000 non-null float64

45 tfidf_43 3000 non-null float64

46 tfidf_44 3000 non-null float64

47 tfidf_45 3000 non-null float64

48 tfidf_46 3000 non-null float64

49 tfidf_47 3000 non-null float64

50 tfidf_48 3000 non-null float64

51 tfidf_49 3000 non-null float64

52 tfidf_50 3000 non-null float64

53 tfidf_51 3000 non-null float64

54 tfidf_52 3000 non-null float64

55 tfidf_53 3000 non-null float64

56 tfidf_54 3000 non-null float64

57 tfidf_55 3000 non-null float64

58 tfidf_56 3000 non-null float64

59 tfidf_57 3000 non-null float64

60 tfidf_58 3000 non-null float64

61 tfidf_59 3000 non-null float64

62 tfidf_60 3000 non-null float64

63 tfidf_61 3000 non-null float64

64 tfidf_62 3000 non-null float64

65 tfidf_63 3000 non-null float64

66 tfidf_64 3000 non-null float64

67 tfidf_65 3000 non-null float64

68 tfidf_66 3000 non-null float64

69 tfidf_67 3000 non-null float64

70 tfidf_68 3000 non-null float64

71 tfidf_69 3000 non-null float64

72 tfidf_70 3000 non-null float64

73 tfidf_71 3000 non-null float64

74 tfidf_72 3000 non-null float64

75 tfidf_73 3000 non-null float64

76 tfidf_74 3000 non-null float64

77 tfidf_75 3000 non-null float64

78 tfidf_76 3000 non-null float64

79 tfidf_77 3000 non-null float64

80 tfidf_78 3000 non-null float64

81 tfidf_79 3000 non-null float64

dtypes: float64(80), int64(2)

memory usage: 1.9 MB

image.png