1. Dataset processing
The labels are processed into (x center offset, y center offset, width, height), all normalized.
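As a quick illustration (the pixel values here are assumed, not taken from the dataset), this is the same conversion that YoloDataset.__getitem__ performs below on a corner-format box for a 448x448 input:

import numpy as np

input_shape = (448, 448)                    # (h, w), assumed
box = np.array([[100., 150., 300., 350.]])  # x1, y1, x2, y2 in pixels, assumed
box[:, [0, 2]] = box[:, [0, 2]] / input_shape[1]   # normalize x by image width
box[:, [1, 3]] = box[:, [1, 3]] / input_shape[0]   # normalize y by image height
box[:, 2:4] = box[:, 2:4] - box[:, 0:2]            # width, height
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2        # center x, center y
print(box)  # approx. [[0.446, 0.558, 0.446, 0.446]]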
from torch.utils.data import DataLoader

# opt holds the training configuration (annotation paths, input_shape, batch_size, ...);
# YoloDataset is defined in the next block.
with open(opt.train_annotation_path) as f:
    train_lines = f.readlines()
with open(opt.val_annotation_path) as f:
    val_lines = f.readlines()
num_train = len(train_lines)
num_val = len(val_lines)

train_dataset = YoloDataset(train_lines, opt.input_shape, train=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=opt.batch_size,
                              num_workers=4, pin_memory=True, drop_last=True)
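A minimal sketch (not part of the original script) of consuming this dataloader, assuming opt.input_shape is (448, 448). The default collate function stacks the per-sample outputs, so images arrive as (batch_size, 3, 448, 448) float tensors in [0, 1] and targets as (batch_size, 7, 7, 30) tensors (float64, since the dataset builds them with NumPy):

for images, targets in train_dataloader:
    # images:  (batch_size, 3, 448, 448) float tensor scaled to [0, 1]
    # targets: (batch_size, 7, 7, 30), cast to float32 before feeding the loss
    targets = targets.float()
    # forward pass and loss computation would go here
    break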
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset


class YoloDataset(Dataset):
    def __init__(self, annotation_lines, input_shape, train):
        super(YoloDataset, self).__init__()
        self.annotation_lines = annotation_lines
        self.input_shape = input_shape
        self.length = len(self.annotation_lines)
        self.train = train
        self.build_target = build_target()

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        index = index % self.length
        #---------------------------------------------------#
        # Data augmentation
        #---------------------------------------------------#
        image, boxes = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random=self.train)
        image = torch.from_numpy(image.astype(np.float32)).permute(2, 0, 1) / 255.
        boxes = np.array(boxes, dtype=np.float32)
        if len(boxes) != 0:
            boxes[:, [0, 2]] = boxes[:, [0, 2]] / self.input_shape[1]
            boxes[:, [1, 3]] = boxes[:, [1, 3]] / self.input_shape[0]
            boxes[:, 2:4] = boxes[:, 2:4] - boxes[:, 0:2]
            boxes[:, 0:2] = boxes[:, 0:2] + boxes[:, 2:4] / 2
        else:
            # keep a consistent (0, 5) shape so the slicing below still works
            boxes = boxes.reshape(0, 5)
        image, targets = self.build_target(image, boxes[:, :4], boxes[:, 4])
        return image, targets

    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        line = annotation_line.split()
        image = Image.open(line[0])
        image = image.convert('RGB')
        iw, ih = image.size
        h, w = input_shape
        #------------------------------#
        # Read the box annotations
        #------------------------------#
        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

        if not random:
            scale = min(w/iw, h/ih)
            nw = int(iw*scale)
            nh = int(ih*scale)
            dx = (w-nw)//2
            dy = (h-nh)//2
            #---------------------------------#
            # Add gray bars (letterbox)
            #---------------------------------#
            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w, h), (128, 128, 128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)
            #---------------------------------#
            # Adjust the boxes
            #---------------------------------#
            if len(box) > 0:
                np.random.shuffle(box)
                box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
                box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
                box[:, 0:2][box[:, 0:2] < 0] = 0
                box[:, 2][box[:, 2] > w] = w
                box[:, 3][box[:, 3] > h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard invalid boxes
            if len(box) == 0:
                print(line[0])
            return image_data, box

        #------------------------------------------#
        # Randomly rescale the image and distort its aspect ratio
        #------------------------------------------#
        new_ar = iw/ih * self.rand(1-jitter, 1+jitter) / self.rand(1-jitter, 1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)
        #------------------------------------------#
        # Add gray bars (letterbox)
        #------------------------------------------#
        dx = int(self.rand(0, w-nw))
        dy = int(self.rand(0, h-nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image
        #------------------------------------------#
        # Randomly flip the image
        #------------------------------------------#
        flip = self.rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)
        image_data = np.array(image, np.uint8)
        #---------------------------------#
        # Color (HSV) augmentation
        #---------------------------------#
        r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype = image_data.dtype
        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
        #---------------------------------#
        # Adjust the boxes
        #---------------------------------#
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
            box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
            if flip:
                box[:, [0, 2]] = w - box[:, [2, 0]]
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]
        return image_data, box

    def rand(self, a=0, b=1):
        return np.random.rand()*(b-a) + a


def build_target():
    return TargetTransoform()


class TargetTransoform(object):
    def __init__(self, target_shape=(7, 7, 30), class_nums=20, cell_nums=7):
        self.target_shape = target_shape
        self.class_nums = class_nums
        self.cell_nums = cell_nums

    def __call__(self, image, boxes, labels):
        """
        labels: e.g. [1, 2, 3, 4]
        boxes:  e.g. [0.2, 0.3, 0.4, 0.8]
        return: target of shape [self.S, self.S, self.B*5 + self.C]
        """
        labels = np.array(labels, "int8")
        np_target = np.zeros(self.target_shape)
        np_class = np.zeros((len(boxes), self.class_nums))
        for i in range(len(labels)):
            np_class[i][labels[i]] = 1
        step = 1.0 / self.cell_nums
        for i in range(len(boxes)):
            box = boxes[i]
            label = np_class[i]
            cx, cy, w, h = box
            # which grid cell the box center falls into
            bx = int(cx // (step + 1e-5))
            by = int(cy // (step + 1e-5))
            # offset of the center relative to the cell's top-left corner
            cx = (cx % step) / step
            cy = (cy % step) / step
            box = [cx, cy, w, h]
            np_target[by][bx][:4] = box
            np_target[by][bx][4] = 1
            np_target[by][bx][5:9] = box
            np_target[by][bx][9] = 1
            np_target[by][bx][10:] = label
        return image, np_target


if __name__ == "__main__":
    train_path = "/home/users/user1/Documents/AI_Files/gitfies/pytorch-YOLO-v1/2007_train.txt"
    with open(train_path) as f:
        train_lines = f.readlines()
    train_dataset = YoloDataset(train_lines, [448, 448], train=True)
    image, target = train_dataset[3054]
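To make the grid assignment in TargetTransoform concrete, here is a small worked example with assumed values: a normalized box center at (0.30, 0.65) lands in column 2, row 4 of the 7x7 grid, and the in-cell offsets stored in the target are roughly (0.10, 0.55).

step = 1.0 / 7
cx, cy = 0.30, 0.65                 # assumed normalized box center
bx = int(cx // (step + 1e-5))       # grid column -> 2
by = int(cy // (step + 1e-5))       # grid row    -> 4
off_x = (cx % step) / step          # in-cell x offset -> ~0.10
off_y = (cy % step) / step          # in-cell y offset -> ~0.55
print(bx, by, round(off_x, 2), round(off_y, 2))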
2. Loss function
Training drives the network to output (x center offset, y center offset, square root of width, square root of height), all normalized.
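The square root is the standard YOLO v1 trick for making the loss more sensitive to errors on small boxes. The snippet below (numbers assumed) shows that the same absolute width error of 0.05 gives equal squared errors for a small and a large box in plain coordinates, but a much larger penalty for the small box once both are mapped into square-root space:

import numpy as np

for w_true in (0.10, 0.80):                     # a small and a large box width
    w_pred = w_true + 0.05                      # same absolute error on both
    plain = (w_pred - w_true) ** 2              # 0.0025 in both cases
    sqrt_space = (np.sqrt(w_pred) - np.sqrt(w_true)) ** 2
    print(w_true, plain, round(sqrt_space, 5))  # ~0.00505 vs ~0.00076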
import torch
import torch.nn as nn
import torch.nn.functional as F


class yoloLoss(nn.Module):
    def __init__(self, num_class=20):
        super(yoloLoss, self).__init__()
        self.lambda_coord = 5
        self.lambda_noobj = 0.5
        self.S = 7
        self.B = 2
        self.C = num_class
        self.step = 1.0 / 7

    def compute_iou(self, box1, box2, index):
        box1 = torch.clone(box1)
        box2 = torch.clone(box2)
        # convert both boxes from in-cell offsets to normalized image coordinates [x_c, y_c, w, h]
        box1 = self.conver_box(box1, index)
        box2 = self.conver_box(box2, index)
        # the predicted w/h are square roots, so square them back
        box1[:, 2] = torch.pow(box1[:, 2], 2)
        box1[:, 3] = torch.pow(box1[:, 3], 2)
        x1, y1, w1, h1 = box1[:, 0] - box1[:, 2] / 2, box1[:, 1] - box1[:, 3] / 2, box1[:, 2], box1[:, 3]
        x2, y2, w2, h2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 1] - box2[:, 3] / 2, box2[:, 2], box2[:, 3]
        inter_w = (w1 + w2) - (torch.max(x1 + w1, x2 + w2) - torch.min(x1, x2))
        inter_h = (h1 + h2) - (torch.max(y1 + h1, y2 + h2) - torch.min(y1, y2))
        inter_h = torch.clamp(inter_h, 0)
        inter_w = torch.clamp(inter_w, 0)
        inter = inter_w * inter_h
        union = w1 * h1 + w2 * h2 - inter
        return inter / union

    def conver_box(self, box, index):
        i, j = index
        box[:, 0], box[:, 1] = [(box[:, 0] + i) * self.step, (box[:, 1] + j) * self.step]
        box = torch.clamp(box, 0)
        return box

    def forward(self, pred, target):
        batch_size = pred.size(0)
        target_boxes = target[:, :, :, :10].contiguous().reshape((-1, 7, 7, 2, 5))
        pred_boxes = pred[:, :, :, :10].contiguous().reshape((-1, 7, 7, 2, 5))
        target_cls = target[:, :, :, 10:]
        pred_cls = pred[:, :, :, 10:]
        # find the grid cells that contain an object
        obj_mask = (target_boxes[..., 4] > 0).byte()
        sig_mask = obj_mask[..., 1].bool()
        index = torch.where(sig_mask == True)
        # img_i is the image index in the batch, y the grid row, x the grid column
        for img_i, y, x in zip(*index):
            img_i, y, x = img_i.item(), y.item(), x.item()
            pbox = pred_boxes[img_i, y, x]
            target_box = target_boxes[img_i, y, x]
            ious = self.compute_iou(pbox[:, :4], target_box[:, :4], [x, y])
            iou, max_i = ious.max(0)
            # the box with the larger IOU becomes the responsible predictor:
            # its confidence target is set to the IOU value
            target_boxes[img_i, y, x, max_i, 4] = iou.item()
            # the other box is treated as containing no object
            target_boxes[img_i, y, x, 1 - max_i, 4] = 0
            obj_mask[img_i, y, x, 1 - max_i] = 0
        obj_mask = obj_mask.bool()
        noobj_mask = ~obj_mask
        noobj_loss = F.mse_loss(pred_boxes[noobj_mask][:, 4],
                                target_boxes[noobj_mask][:, 4],
                                reduction="sum")
        obj_loss = F.mse_loss(pred_boxes[obj_mask][:, 4],
                              target_boxes[obj_mask][:, 4],
                              reduction="sum")
        xy_loss = F.mse_loss(pred_boxes[obj_mask][:, :2],
                             target_boxes[obj_mask][:, :2],
                             reduction="sum")
        wh_loss = F.mse_loss(pred_boxes[obj_mask][:, 2:4],
                             torch.sqrt(target_boxes[obj_mask][:, 2:4]),
                             reduction="sum")
        class_loss = F.mse_loss(pred_cls[sig_mask],
                                target_cls[sig_mask],
                                reduction="sum")
        loss = dict(conf_loss=(obj_loss + self.lambda_noobj * noobj_loss) / batch_size,
                    reg_loss=(self.lambda_coord * xy_loss +
                              self.lambda_coord * wh_loss) / batch_size,
                    cls_loss=class_loss / batch_size)
        return loss
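A minimal smoke test (shapes and values assumed, not from the original) showing how the loss is called: both the prediction and the target are (batch, 7, 7, 30) tensors, where the first 10 channels hold two (x, y, w, h, conf) boxes (the predicted w/h are interpreted as square roots) and the last 20 hold the class scores; the three returned components are typically summed before backpropagation.

import torch

criterion = yoloLoss(num_class=20)
pred = torch.rand(2, 7, 7, 30)
target = torch.zeros(2, 7, 7, 30)
# put one object in grid cell (row 3, col 3) of the first image:
# both box slots carry (x_offset, y_offset, w, h, conf=1), class 0 is active
target[0, 3, 3, :10] = torch.tensor([0.5, 0.5, 0.2, 0.3, 1.0] * 2)
target[0, 3, 3, 10] = 1.0
losses = criterion(pred, target)
print({k: round(v.item(), 4) for k, v in losses.items()})
total = sum(losses.values())  # scalar used for backpropagation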
