预处理
- 将Bottom-up attention提供的标准特征转换为h5文件,建立索引,可以根据imageid读取到h5中对应的特征
target: tsv -> h5py
format of tsv:
{imageid:
image_w:
image_h:
features:
boxes:
num_boxes:
}
使用csv.DictReader()读取tsv文件,再迭代地创建h5文件
创建好的h5的文件结构应该是:
#N表示数据集的数量
"image_features": shape -> [N,num_boxes,features_dim]
"bbox": shape -> [N,num_boxes,4]
"spatial_features": shape -> [N,num_boxes,6]
#spatial_features表示的是每个box经过标准化后在整张图片中的相对位置信息
#根据h5文件的架构,我们需要创建image_id到h5数据集的索引image_id2idx,在此之前我们需要收集
分别属于训练集、验证集和测试集的'data'_imageid。
- 处理问题,创建词典
def create_dictionary():
tokenize(word)
#将词标识化word2idx{word:idx},idx2word[word1,word2,...],可以用word2idx找到词对应的标识
#也可以用idx2word找到标识对应的词
embedding_word(idx)
#将词典中的词向量化
- 处理annotations
#我们希望预处理标记文件最后获得的数据结构如ann这样
ann = {"question_id":,"image_id":,"label":['tokens from ans2label'],"score":['score for every label']}
#下面是分步处理过程
def filter_answer():
#过滤出现次数少于一定次数的答案,注意是用的MultiChoiceAnswer统计次数
punct_replace()#处理标点符号,去除冠词(a,an,the)等,还原部分被误删的常用表达(youll -> you'll)
def create_answer2label():
#创建和词典一样的数据,ans2label->{ans:label},label2ans->[ans1,ans2,..],两者是一一对应的
def compute_score():
#计算每一条标注中十个回答各自得分数,根据出现的次数确定得分
加载
我们希望获得的数据包括:
1. 图像特征:features
2. 空间特征: spatial_features,处理空间关系时使用
3. 问题: 经过tokenize的问题q_token
4. label: ans_label and score
于是根据我们的需求就得到如下的处理方法:
class VQADataset(Dataset):
    """VQA dataset backed by precomputed bottom-up-attention image features.

    Loads, for the given split:
      - the answer vocabulary (ans2label / label2ans) built during preprocessing,
      - the image_id -> h5-row index mapping,
      - the h5 arrays 'image_features' [N, num_boxes, v_dim] and
        'spatial_features' [N, num_boxes, s_dim],
      - the question/answer entries via the project helper ``_load_dataset``.

    Each item is (features, spatials, question, target) where ``target`` is a
    dense score vector over all candidate answers.
    """

    def __init__(self, name, dictionary, dataroot='data'):
        """
        Args:
            name: dataset split, one of 'train' or 'val'.
            dictionary: question tokenizer/vocabulary; must provide
                ``tokenize(text, add_word)`` and ``padding_idx``.
            dataroot: root directory holding the cache/ and h5 files.
        """
        super(VQADataset, self).__init__()
        assert name in ['train', 'val']

        # Answer vocabulary shared by train and val splits.
        ans2label_path = os.path.join(dataroot, 'cache', 'trainval_ans2label.pkl')
        label2ans_path = os.path.join(dataroot, 'cache', 'trainval_label2ans.pkl')
        # FIX: use context managers so the pickle files are closed
        # (the original `cPickle.load(open(...))` leaked the handles).
        with open(ans2label_path, 'rb') as f:
            self.ans2label = cPickle.load(f)
        with open(label2ans_path, 'rb') as f:
            self.label2ans = cPickle.load(f)
        # Number of candidate answers = size of the target score vector.
        self.num_ans_candidates = len(self.ans2label)

        self.dictionary = dictionary  # dictionary created during preprocessing

        # image_id -> row index into the h5 feature arrays.
        # FIX: open in binary mode ('rb' was missing, which breaks pickle
        # loading under Python 3) and close the handle.
        imgid2idx_path = os.path.join(dataroot, '%s36_imgid2idx.pkl' % name)
        with open(imgid2idx_path, 'rb') as f:
            self.img_id2idx = cPickle.load(f)

        print('loading features from h5 file')
        h5_path = os.path.join(dataroot, '%s36.hdf5' % name)
        with h5py.File(h5_path, 'r') as hf:
            # Materialize as numpy arrays so the file can be closed here.
            self.features = np.array(hf.get('image_features'))
            self.spatials = np.array(hf.get('spatial_features'))

        # List of entries; each entry is a dict of the form:
        # {
        #     "question_id": ...,
        #     "answer": {"question_id": ..., "image_id": ...,
        #                "labels": [tokens from ans2label],
        #                "scores": [score for every label]},
        #     "image_id": ...,
        #     "image": h5 row index for image_id,
        #     "question": raw question text,
        # }
        self.entries = _load_dataset(dataroot, name, self.img_id2idx)

        self.tokenize()   # adds entry['q_token'] (fixed-length token ids)
        self.tensorize()  # converts arrays/lists to torch tensors
        self.v_dim = self.features.size(2)  # visual feature dimension
        self.s_dim = self.spatials.size(2)  # spatial feature dimension

    def tokenize(self, max_length=14):
        """Tokenize the questions in-place.

        Adds ``q_token`` (length ``max_length``) to every entry. Questions are
        truncated to ``max_length`` and padded at the FRONT with
        ``dictionary.padding_idx`` (treated as padding_idx in the embedding).
        """
        for entry in self.entries:
            tokens = self.dictionary.tokenize(entry['question'], False)
            tokens = tokens[:max_length]
            if len(tokens) < max_length:
                # Note here we pad in front of the sentence.
                padding = [self.dictionary.padding_idx] * (max_length - len(tokens))
                tokens = padding + tokens
            utils.assert_eq(len(tokens), max_length)
            entry['q_token'] = tokens

    def tensorize(self):
        """Convert features, spatials, and per-entry data to torch tensors.

        Entries with no labels get ``labels``/``scores`` set to None so that
        ``__getitem__`` can emit an all-zero target for them.
        """
        self.features = torch.from_numpy(self.features)
        self.spatials = torch.from_numpy(self.spatials)

        for entry in self.entries:
            entry['q_token'] = torch.from_numpy(np.array(entry['q_token']))

            answer = entry['answer']
            labels = np.array(answer['labels'])
            scores = np.array(answer['scores'], dtype=np.float32)
            if len(labels):
                answer['labels'] = torch.from_numpy(labels)
                answer['scores'] = torch.from_numpy(scores)
            else:
                answer['labels'] = None
                answer['scores'] = None

    def __getitem__(self, index):
        """Return (features, spatials, question, target) for one entry.

        ``target`` is a dense float vector of length ``num_ans_candidates``
        with each label's score scattered into its slot (all zeros when the
        entry has no labels).
        """
        entry = self.entries[index]
        features = self.features[entry['image']]
        spatials = self.spatials[entry['image']]
        question = entry['q_token']

        answer = entry['answer']
        labels = answer['labels']
        scores = answer['scores']
        target = torch.zeros(self.num_ans_candidates)
        if labels is not None:
            target.scatter_(0, labels, scores)
        return features, spatials, question, target

    def __len__(self):
        """Number of question entries in this split."""
        return len(self.entries)