Attempting to Implement the Model from Extractive Summarization as Text Matching

Data Processing and Loading

Basic approach

  1. For now, reuse the preprocessing and loading code from the Lead-3 baseline.
  2. To build candidate summaries, compute the average ROUGE score between every sentence in the original document and the full document, then keep the 5 highest-scoring sentences. The original paper prunes sentences with BertExt; I haven't worked that out yet, so this is a stand-in.
  3. Enumerate the C(5, 3) = 10 combinations of those 5 sentences as candidate summaries, using `from itertools import combinations`, which is quite convenient (see the sketch after this list).
  4. Encode all the candidates and all the data with the tokenizer and convert them to tensors.
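
To make step 3 concrete, here is a minimal sketch with placeholder sentences (the real loader below does the same over the top-5 ROUGE sentences):

```python
from itertools import combinations

top5 = ["sent A", "sent B", "sent C", "sent D", "sent E"]  # placeholders
candidates = [" ".join(c) for c in combinations(top5, 3)]
print(len(candidates))  # C(5, 3) = 10 candidate summaries
```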

Invocation and result:

```python
from Dataloader import Loader

document_path = "train.txt.src"
label_path = "train.txt.tgt"
base_path = "D:\\New_desktop\\summarization_papers\\"

def Train():
    loader = Loader("bert")
    train_data, train_label, train_candi = loader.read_data(
        base_path + document_path, base_path + label_path, 50)
    print("\n")
    print(train_data.size())
    print(train_label.size())
    print(train_candi.size())

Train()
```

![image.png](https://cdn.nlark.com/yuque/0/2020/png/296244/1594915936844-f0640533-a931-4b80-9205-fbe691233e28.png)
```python
import torch
from itertools import combinations
from rouge import Rouge
from transformers import BertTokenizer
from util import ProgressBar

USE_CUDA = torch.cuda.is_available()
base_path = "D:\\New_desktop\\summarization_papers\\"
tokenizer = BertTokenizer.from_pretrained(base_path)
print("----init tokenizer finished----")

def normalizeString(s):
    # Strip surrounding whitespace and embedded newlines.
    s = s.strip()
    s = s.replace("\n", "")
    return s

def possess_sentence(s):
    # Documents and labels use "##SENT##" as the sentence separator;
    # split on it and normalize each sentence.
    return [normalizeString(i) for i in s.split("##SENT##")]

class Loader:
    def __init__(self, name):
        self.name = name
        self.train_data = {'text': [], 'label': [], 'candi': []}
        self.rouge = Rouge()

    def get_document(self, document):
        return possess_sentence(document)

    def get_labels(self, label):
        return possess_sentence(label)

    def get_score(self, sen1, sen2):
        # Average recall of ROUGE-1/2/L between a sentence and the document.
        rouge_score = self.rouge.get_scores(sen1, sen2)
        score = rouge_score[0]["rouge-1"]['r']
        score += rouge_score[0]["rouge-2"]['r']
        score += rouge_score[0]["rouge-l"]['r']
        return score / 3

    def pad_and_wrap(self, x, max_len):
        # Truncate/pad the id list to max_len, then wrap with BERT's special
        # ids (101 = [CLS], 102 = [SEP]); note that the padding zeros sit
        # before [SEP] here. Final length: max_len + 2.
        if len(x) >= max_len:
            x = x[0:max_len - 1]
        while len(x) < max_len:
            x.append(0)
        x.append(102)
        return [101] + x

    def read_data(self, path1, path2, pairs_num, max_len=40):
        print("----start Read train data----")
        fo = open(path1, "r", encoding='gb18030', errors='ignore')
        fl = open(path2, "r", encoding='gb18030', errors='ignore')
        data_list, label_list, candi_list = [], [], []
        pbar = ProgressBar(n_total=pairs_num, desc='Loading')
        for i in range(pairs_num):
            pbar(i, {'current': i})
            do = self.get_document(fo.readline())
            la = self.get_labels(fl.readline())
            document = " ".join(do)
            la = " ".join(la)
            # Score every sentence against the whole document, keep the top 5.
            scores = {}
            for sent in do:
                if sent:  # skip empty strings; Rouge rejects empty hypotheses
                    scores[sent] = self.get_score(sent, document)
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            top_sentences = [pair[0] for pair in ranked[:5]]
            # C(5, 3) = 10 candidate summaries per document. Documents with
            # fewer than 5 sentences yield fewer candidates and would break
            # the tensor conversion below.
            candidate_data = [tokenizer.encode(" ".join(c), add_special_tokens=False)
                              for c in combinations(top_sentences, 3)]
            self.train_data['text'].append(tokenizer.encode(document, add_special_tokens=False))
            self.train_data['label'].append(tokenizer.encode(la, add_special_tokens=False))
            self.train_data['candi'].append(candidate_data)
        for x in self.train_data['text']:
            data_list.append(self.pad_and_wrap(x, max_len))
        for x in self.train_data['label']:
            label_list.append(self.pad_and_wrap(x, max_len))
        for cand in self.train_data['candi']:
            candi_list.append([self.pad_and_wrap(x, max_len) for x in cand])
        # torch.tensor infers an integer dtype from the id lists
        # (torch.Tensor would silently cast them to float).
        train_data = torch.tensor(data_list)
        train_label = torch.tensor(label_list)
        train_candi = torch.tensor(candi_list)
        return train_data, train_label, train_candi
```
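
For reference: with pairs_num = 50 and the default max_len = 40, every encoded sequence comes out to max_len + 2 = 42 ids ([CLS] + 40 token/padding ids + [SEP]), so the printed sizes should be torch.Size([50, 42]) for the documents and labels and torch.Size([50, 10, 42]) for the candidates (10 = C(5, 3) per document).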

Constructing the Loss Function

Background reading

I have never used the loss function from the original paper's open-source code; when I read the paper it was just a few formulas that looked simple enough...
Time to gather some background.
This loss function looks pretty interesting?
It seems like something I could have used back when I was doing sentence-similarity Text Matching?

MarginRankingLoss: you may be unfamiliar with this loss function. In machine learning, the most intuitive and fastest way to get to know a concept is often to start from its name.

MarginRankingLoss is no exception. Break it apart: Margin, Ranking, Loss.

Margin: front-end developers know margin all too well; it is the spacing between two elements. In machine learning it means something similar: it can be understood as an adjustable offset added onto the loss, i.e. a manually tunable bias. In any case, the margin is not the key part.

Ranking: this is the heart and core of the loss function, namely ordering! If only two elements are being ranked, then for a given element there are just two outcomes: it comes either before or after the other element. That is really all there is to the core of this loss.

Let's look at its loss function expression:

$$\text{loss}(x_1, x_2, y) = \max(0,\ -y \cdot (x_1 - x_2) + \text{margin})$$

We can set the margin aside for now; the meaning of the formula is then self-evident.

y can take only two values: 1 or -1.

  1. When y = 1, we expect x1 to rank higher than x2, i.e. x1 - x2 > 0.
  2. When y = -1, we expect x2 to rank higher than x1, i.e. x1 - x2 < 0.
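
PyTorch ships this as `nn.MarginRankingLoss`; here is a minimal sketch of its behavior (the scores and targets below are made up):

```python
import torch
import torch.nn as nn

# Made-up scores for two items across a batch of 3 comparisons.
x1 = torch.tensor([0.8, 0.2, 0.5])
x2 = torch.tensor([0.4, 0.6, 0.1])
y = torch.tensor([1.0, 1.0, -1.0])  # 1: x1 should rank higher; -1: x2 should

loss_fn = nn.MarginRankingLoss(margin=0.1)
loss = loss_fn(x1, x2, y)  # mean of max(0, -y * (x1 - x2) + margin)
print(loss)  # tensor(0.3333)
```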

When is it used?

  1. GANs
  2. Ranking tasks (a sketch of how it might fit MatchSum's candidate ranking follows this list)
  3. Open-source implementations and examples are very scarce
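
Tying this back to the candidate setup above: as I read the paper, MatchSum sorts the candidates by ROUGE and applies a pairwise margin ranking loss whose margin grows with the rank gap, plus a term that keeps the gold summary on top. Below is a rough sketch of that idea; the function name, the made-up scores, and the gamma values are my assumptions, not the paper's code:

```python
import torch
import torch.nn.functional as F

def candidate_ranking_loss(cand_scores, gold_score, gamma1=0.0, gamma2=0.01):
    """cand_scores: (n,) similarity scores f(D, C_i), pre-sorted so that
    index 0 is the best candidate by ROUGE; gold_score: scalar f(D, C*)."""
    n = cand_scores.size(0)
    loss = torch.tensor(0.0)
    # The gold summary should outscore every candidate by at least gamma1.
    for i in range(n):
        loss = loss + F.relu(cand_scores[i] - gold_score + gamma1)
    # A higher-ROUGE candidate should outscore a lower one, with a margin
    # proportional to the rank gap: (j - i) * gamma2.
    for i in range(n):
        for j in range(i + 1, n):
            loss = loss + F.relu(cand_scores[j] - cand_scores[i] + (j - i) * gamma2)
    return loss

# Toy usage: 10 candidates (C(5, 3)) and one gold score.
print(candidate_ranking_loss(torch.rand(10), torch.tensor(0.9)))
```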