预处理

  1. 将Bottom-up attention提供的标准特征转换为h5文件,建立索引,可以根据imageid读取到h5中对应的特征
  1. target: tsv -> h5py
  2. format of tsv:
  3. {imageid:
  4. image_w:
  5. image_h:
  6. features:
  7. boxes:
  8. num_boxes:
  9. }
  10. 使用csv.DictReader()读取tsv文件,再迭代地创建h5文件
  11. 创建好的h5的文件结构应该是:
  12. #N表示数据集的数量
  13. "image_features": shape -> [N,num_boxes,features_dim]
  14. "bbox":shape -> [N,num_boxes,4]
  15. "spatial_features": shape -> [N,num_boxes,6]
  16. #spatial_features表示的是每个box经过标准化后在整张图片中的相对位置信息
  17. #根据h5文件的架构,我们需要创建image_id到h5数据集的索引image_id2idx,在此之前我们需要收集
  18. 分别属于训练集、验证集和测试集的'data'_imageid。
  1. 处理问题,创建词典
def create_dictionary():
  tokenize(word)
  #将词标识化word2idx{word:idx},idx2word[word1,word2,...],可以用word2idx找到词对应的标识
  #也可以用idx2word找到标识对应的词
  embedding_word(idx)
  #将词典中的词向量化
  1. 处理annotations
#我们希望预处理标记文件最后获得的数据结构如ann这样
ann = {"question_id":,"image_id":,"label":['tokens from ans2label'],"score":['score for every label']}
#下面是分步处理过程
def filter_answer():
#过滤出现次数少于一定次数的答案,注意是用的MultiChoiceAnswer统计次数
  punct_replace()#处理标点符号,去除特指介词(a,an,the)等,还原部分被误删的常用表达(youll -> you'll)
def create_answer2label():
  #创建和词典一样的数据,ans2label->{ans:label},label2ans->[ans1,ans2,..],两者是一一对应的
def compute_score():
  #计算每一条标注中十个回答各自得分数,根据出现的次数确定得分

加载

我们希望获得的数据包括:
1. 图像特征:features
2. 空间特征: spatial_features,处理空间关系时使用
3. 问题: 经过tokenize的问题q_token
4. label: ans_label and score

于是根据我们的需求就得到如下的处理方法:

class VQADataset(Dataset):
    """VQA dataset over precomputed bottom-up-attention features.

    Each item is a 4-tuple ``(features, spatials, question, target)``:
      - features: per-box image features for the entry's image,
        shape [num_boxes, v_dim]
      - spatials: normalized box-position features, shape [num_boxes, s_dim]
      - question: fixed-length (14) token-id tensor, front-padded
      - target: soft-score vector over all candidate answers,
        shape [num_ans_candidates]
    """

    def __init__(self, name, dictionary, dataroot='data'):
        """Load features, answer vocabulary and question entries.

        Args:
            name: dataset split, 'train' or 'val'.
            dictionary: word dictionary providing tokenize() and padding_idx.
            dataroot: root directory holding the preprocessed files.
        """
        super(VQADataset, self).__init__()
        assert name in ['train', 'val']

        # Answer vocabulary from preprocessing: ans2label -> {ans: label},
        # label2ans -> [ans1, ans2, ...]; the two mappings are one-to-one.
        ans2label_path = os.path.join(dataroot, 'cache', 'trainval_ans2label.pkl')
        label2ans_path = os.path.join(dataroot, 'cache', 'trainval_label2ans.pkl')
        with open(ans2label_path, 'rb') as f:
            self.ans2label = cPickle.load(f)
        with open(label2ans_path, 'rb') as f:
            self.label2ans = cPickle.load(f)
        self.num_ans_candidates = len(self.ans2label)  # size of answer vocabulary

        self.dictionary = dictionary  # dictionary built in preprocessing

        # image_id -> row index into the h5 feature arrays.
        # Fix: open in binary mode and close the handle — the original
        # leaked the file object and omitted 'rb', which fails on Python 3.
        imgid2idx_path = os.path.join(dataroot, '%s36_imgid2idx.pkl' % name)
        with open(imgid2idx_path, 'rb') as f:
            self.img_id2idx = cPickle.load(f)
        print('loading features from h5 file')
        h5_path = os.path.join(dataroot, '%s36.hdf5' % name)
        with h5py.File(h5_path, 'r') as hf:
            # Materialize as numpy arrays so the file can be closed here.
            self.features = np.array(hf.get('image_features'))
            self.spatials = np.array(hf.get('spatial_features'))

        self.entries = _load_dataset(dataroot, name, self.img_id2idx)
        """Entry structure returned by _load_dataset (one per question):
        {
        "question_id":
        "answer":{"question_id":,"image_id":,"label":['tokens from ans2label'],\
        "score":['score for every label']}
        "image_id":
        "image":idx(of_image_id)
        "question":
        }
        """
        self.tokenize()   # adds entry['q_token'] (serialized question)
        self.tensorize()  # converts arrays/lists to torch tensors
        self.v_dim = self.features.size(2)  # per-box feature dimension
        self.s_dim = self.spatials.size(2)  # spatial feature dimension (6)

    def tokenize(self, max_length=14):
        """Tokenizes the questions.
        This will add q_token in each entry of the dataset.
        -1 represent nil, and should be treated as padding_idx in embedding
        """
        for entry in self.entries:
            tokens = self.dictionary.tokenize(entry['question'], False)
            tokens = tokens[:max_length]
            if len(tokens) < max_length:
                # Note here we pad in front of the sentence
                padding = [self.dictionary.padding_idx] * (max_length - len(tokens))
                tokens = padding + tokens
            utils.assert_eq(len(tokens), max_length)
            entry['q_token'] = tokens

    def tensorize(self):
        """Convert features, questions and answers to torch tensors in place.

        Entries whose answer has no labels get None for labels/scores, which
        __getitem__ interprets as an all-zero target.
        """
        self.features = torch.from_numpy(self.features)
        self.spatials = torch.from_numpy(self.spatials)

        for entry in self.entries:
            question = torch.from_numpy(np.array(entry['q_token']))
            entry['q_token'] = question

            answer = entry['answer']
            labels = np.array(answer['labels'])
            scores = np.array(answer['scores'], dtype=np.float32)
            if len(labels):
                labels = torch.from_numpy(labels)
                scores = torch.from_numpy(scores)
                entry['answer']['labels'] = labels
                entry['answer']['scores'] = scores
            else:
                entry['answer']['labels'] = None
                entry['answer']['scores'] = None

    def __getitem__(self, index):
        """Return (features, spatials, question, target) for one entry.

        target is a dense soft-score vector: zeros everywhere except at the
        annotated answer labels, which receive their occurrence-based scores.
        """
        entry = self.entries[index]
        features = self.features[entry['image']]
        spatials = self.spatials[entry['image']]

        question = entry['q_token']
        answer = entry['answer']
        labels = answer['labels']
        scores = answer['scores']
        target = torch.zeros(self.num_ans_candidates)
        if labels is not None:
            target.scatter_(0, labels, scores)

        return features, spatials, question, target

    def __len__(self):
        """Number of question entries in this split."""
        return len(self.entries)