1. Dataset processing
Convert each label into (x center offset, y center offset, width, height), all normalized to [0, 1]. The same arithmetic appears in __getitem__ below.
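As a worked example (a hypothetical annotated box in pixel corner coordinates, assuming a 448x448 input as used at the end of this section), the conversion to normalized center form looks like this:

import numpy as np

# one hypothetical box: x1, y1, x2, y2, class_id (pixel coordinates in a 448x448 image)
box = np.array([[112., 112., 336., 224., 7.]])

# normalize corner coordinates by the input width/height
box[:, [0, 2]] /= 448
box[:, [1, 3]] /= 448
# (x1, y1, x2, y2) -> (cx, cy, w, h)
box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2

print(box)   # [[0.5   0.375 0.5   0.25  7.   ]]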
with open(opt.train_annotation_path) as f:
    train_lines = f.readlines()
with open(opt.val_annotation_path) as f:
    val_lines = f.readlines()
num_train = len(train_lines)
num_val = len(val_lines)

train_dataset = YoloDataset(train_lines, opt.input_shape, train=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=opt.batch_size,
                              num_workers=4, pin_memory=True, drop_last=True)
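As a quick sanity check, one batch from this loader should yield an image tensor of shape (batch_size, 3, H, W) and a target tensor of shape (batch_size, 7, 7, 30); the default collate function is enough here because every sample has the same fixed shapes. A minimal sketch (opt.input_shape and opt.batch_size are placeholders from the training config above):

for images, targets in train_dataloader:
    # images: float tensor in [0, 1], shape (batch_size, 3, input_shape[0], input_shape[1])
    # targets: per-cell labels, shape (batch_size, 7, 7, 30) -> 2 boxes * 5 values + 20 classes
    print(images.shape, targets.shape)
    break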
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset
class YoloDataset(Dataset):
    def __init__(self, annotation_lines, input_shape, train):
        super(YoloDataset, self).__init__()
        self.annotation_lines = annotation_lines
        self.input_shape = input_shape
        self.length = len(self.annotation_lines)
        self.train = train
        self.build_target = build_target()

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        index = index % self.length
        #---------------------------------------------------#
        #   Data augmentation
        #---------------------------------------------------#
        image, boxes = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random=self.train)
        image = torch.from_numpy(image.astype(np.float32)).permute(2, 0, 1) / 255.
        boxes = np.array(boxes, dtype=np.float32).reshape(-1, 5)  # keeps shape (0, 5) when there is no box
        if len(boxes) != 0:
            # normalize corner coordinates to [0, 1]
            boxes[:, [0, 2]] = boxes[:, [0, 2]] / self.input_shape[1]
            boxes[:, [1, 3]] = boxes[:, [1, 3]] / self.input_shape[0]
            # (x1, y1, x2, y2) -> (cx, cy, w, h)
            boxes[:, 2:4] = boxes[:, 2:4] - boxes[:, 0:2]
            boxes[:, 0:2] = boxes[:, 0:2] + boxes[:, 2:4] / 2
        image, targets = self.build_target(image, boxes[:, :4], boxes[:, 4])
        return image, targets
    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        line = annotation_line.split()
        image = Image.open(line[0])
        image = image.convert('RGB')
        iw, ih = image.size
        h, w = input_shape
        #------------------------------#
        #   Read the bounding boxes
        #------------------------------#
        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

        if not random:
            scale = min(w/iw, h/ih)
            nw = int(iw*scale)
            nh = int(ih*scale)
            dx = (w-nw)//2
            dy = (h-nh)//2
            #---------------------------------#
            #   Letterbox: pad with gray bars
            #---------------------------------#
            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w, h), (128, 128, 128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)
            #---------------------------------#
            #   Adjust the boxes accordingly
            #---------------------------------#
            if len(box) > 0:
                np.random.shuffle(box)
                box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
                box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
                box[:, 0:2][box[:, 0:2] < 0] = 0
                box[:, 2][box[:, 2] > w] = w
                box[:, 3][box[:, 3] > h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard degenerate boxes
            if len(box) == 0:
                print(line[0])
            return image_data, box

        #------------------------------------------#
        #   Randomly rescale the image and distort its aspect ratio
        #------------------------------------------#
        new_ar = iw/ih * self.rand(1-jitter, 1+jitter) / self.rand(1-jitter, 1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)
        #------------------------------------------#
        #   Letterbox: pad with gray bars
        #------------------------------------------#
        dx = int(self.rand(0, w-nw))
        dy = int(self.rand(0, h-nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image
        #------------------------------------------#
        #   Random horizontal flip
        #------------------------------------------#
        flip = self.rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)
        image_data = np.array(image, np.uint8)
        #---------------------------------#
        #   HSV color-space augmentation
        #---------------------------------#
        r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype = image_data.dtype
        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
        #---------------------------------#
        #   Adjust the boxes accordingly
        #---------------------------------#
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
            box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
            if flip:
                box[:, [0, 2]] = w - box[:, [2, 0]]
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard degenerate boxes
        return image_data, box

    def rand(self, a=0, b=1):
        return np.random.rand()*(b-a) + a
def build_target():
    return TargetTransform()


class TargetTransform(object):
    def __init__(self, target_shape=(7, 7, 30), class_nums=20, cell_nums=7):
        self.target_shape = target_shape
        self.class_nums = class_nums
        self.cell_nums = cell_nums

    def __call__(self, image, boxes, labels):
        """
        labels = [1, 2, 3, 4]
        boxes  = [0.2, 0.3, 0.4, 0.8]  # normalized cx, cy, w, h
        returns a target of shape [self.S, self.S, self.B*5 + self.C]
        """
        labels = np.array(labels, "int8")
        np_target = np.zeros(self.target_shape)
        np_class = np.zeros((len(boxes), self.class_nums))
        # one-hot encode the class labels
        for i in range(len(labels)):
            np_class[i][labels[i]] = 1
        step = 1.0 / self.cell_nums
        for i in range(len(boxes)):
            box = boxes[i]
            label = np_class[i]
            cx, cy, w, h = box
            # which grid cell the box center falls into
            bx = int(cx // (step + 1e-5))
            by = int(cy // (step + 1e-5))
            # offset of the center from the cell's top-left corner, normalized by the cell size
            cx = (cx % step) / step
            cy = (cy % step) / step
            box = [cx, cy, w, h]
            # both predictor slots share the same ground-truth box and confidence 1
            np_target[by][bx][:4] = box
            np_target[by][bx][4] = 1
            np_target[by][bx][5:9] = box
            np_target[by][bx][9] = 1
            np_target[by][bx][10:] = label
        return image, np_target
if __name__ == "__main__":
    train_path = "/home/users/user1/Documents/AI_Files/gitfies/pytorch-YOLO-v1/2007_train.txt"
    with open(train_path) as f:
        train_lines = f.readlines()
    train_dataset = YoloDataset(train_lines, [448, 448], train=True)
    image, target = train_dataset[3054]
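To make the target encoding concrete, here is a small check of what the transform stores for one hypothetical normalized box (cx=0.5, cy=0.375, w=0.5, h=0.25, class 7). With S=7 the cell size is 1/7 ≈ 0.143, so the center falls into cell bx=3, by=2, and the stored offsets are (0.5 % step)/step = 0.5 and (0.375 % step)/step = 0.625:

transform = build_target()                      # returns the target transform instance
boxes = np.array([[0.5, 0.375, 0.5, 0.25]])     # hypothetical normalized cx, cy, w, h
labels = np.array([7])
_, grid = transform(None, boxes, labels)

print(grid.shape)                  # (7, 7, 30)
print(grid[2, 3, :5])              # [0.5   0.625 0.5   0.25  1.   ] -> offsets, w, h, confidence
print(np.argmax(grid[2, 3, 10:]))  # 7 -> one-hot class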
2. Loss function
Training drives the network to output (x center offset, y center offset, square root of the width, square root of the height), all normalized.
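For reference, the loss below follows the YOLOv1 paper (S = 7 cells per side, B = 2 boxes per cell, lambda_coord = 5, lambda_noobj = 0.5), with the small modification that the confidence target of the responsible box is set to its IoU with the ground truth rather than to 1. Note the width/height term: the network already predicts sqrt(w) and sqrt(h), so in the code the square root is applied to the target instead of the prediction.

$$
\begin{aligned}
\mathcal{L} ={}& \lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(x_i-\hat{x}_i)^2+(y_i-\hat{y}_i)^2\right] \\
&+\lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(\sqrt{w_i}-\sqrt{\hat{w}_i})^2+(\sqrt{h_i}-\sqrt{\hat{h}_i})^2\right] \\
&+\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}(C_i-\hat{C}_i)^2
 +\lambda_{noobj}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{noobj}(C_i-\hat{C}_i)^2 \\
&+\sum_{i=0}^{S^2}\mathbb{1}_{i}^{obj}\sum_{c\in classes}\left(p_i(c)-\hat{p}_i(c)\right)^2
\end{aligned}
$$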
import numpy as np
import torch
import torch.nn.functional as F
from torch.nn import *
class yoloLoss(Module):
    def __init__(self, num_class=20):
        super(yoloLoss, self).__init__()
        self.lambda_coord = 5
        self.lambda_noobj = 0.5
        self.S = 7
        self.B = 2
        self.C = num_class
        self.step = 1.0 / 7

    def compute_iou(self, box1, box2, index):
        box1 = torch.clone(box1)
        box2 = torch.clone(box2)
        # convert cell-relative (cx, cy, w, h) to image-relative coordinates
        box1 = self.convert_box(box1, index)
        box2 = self.convert_box(box2, index)
        # the prediction holds sqrt(w) and sqrt(h), so square them back
        box1[:, 2] = torch.pow(box1[:, 2], 2)
        box1[:, 3] = torch.pow(box1[:, 3], 2)
        x1, y1, w1, h1 = box1[:, 0] - box1[:, 2] / 2, box1[:, 1] - box1[:, 3] / 2, box1[:, 2], box1[:, 3]
        x2, y2, w2, h2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 1] - box2[:, 3] / 2, box2[:, 2], box2[:, 3]
        inter_w = (w1 + w2) - (torch.max(x1 + w1, x2 + w2) - torch.min(x1, x2))
        inter_h = (h1 + h2) - (torch.max(y1 + h1, y2 + h2) - torch.min(y1, y2))
        inter_h = torch.clamp(inter_h, 0)
        inter_w = torch.clamp(inter_w, 0)
        inter = inter_w * inter_h
        union = w1 * h1 + w2 * h2 - inter
        return inter / union

    def convert_box(self, box, index):
        i, j = index
        box[:, 0], box[:, 1] = [(box[:, 0] + i) * self.step, (box[:, 1] + j) * self.step]
        box = torch.clamp(box, 0)
        return box
    def forward(self, pred, target):
        batch_size = pred.size(0)
        # split the first 10 channels into 2 boxes of 5 values (cx, cy, w, h, conf)
        target_boxes = target[:, :, :, :10].contiguous().reshape((-1, 7, 7, 2, 5))
        pred_boxes = pred[:, :, :, :10].contiguous().reshape((-1, 7, 7, 2, 5))
        target_cls = target[:, :, :, 10:]
        pred_cls = pred[:, :, :, 10:]
        # cells that contain an object
        obj_mask = (target_boxes[..., 4] > 0).byte()
        sig_mask = obj_mask[..., 1].bool()
        index = torch.where(sig_mask == True)
        # img_i is the image index in the batch; y and x are the grid-cell coordinates
        for img_i, y, x in zip(*index):
            img_i, y, x = img_i.item(), y.item(), x.item()
            pbox = pred_boxes[img_i, y, x]
            target_box = target_boxes[img_i, y, x]
            ious = self.compute_iou(pbox[:, :4], target_box[:, :4], [x, y])
            iou, max_i = ious.max(0)
            # the predictor with the larger IoU is responsible: its confidence target becomes that IoU
            target_boxes[img_i, y, x, max_i, 4] = iou.item()
            # the other predictor's confidence target is set to 0
            target_boxes[img_i, y, x, 1 - max_i, 4] = 0
            obj_mask[img_i, y, x, 1 - max_i] = 0
        obj_mask = obj_mask.bool()
        noobj_mask = ~obj_mask
        noobj_loss = F.mse_loss(pred_boxes[noobj_mask][:, 4],
                                target_boxes[noobj_mask][:, 4],
                                reduction="sum")
        obj_loss = F.mse_loss(pred_boxes[obj_mask][:, 4],
                              target_boxes[obj_mask][:, 4],
                              reduction="sum")
        xy_loss = F.mse_loss(pred_boxes[obj_mask][:, :2],
                             target_boxes[obj_mask][:, :2],
                             reduction="sum")
        wh_loss = F.mse_loss(pred_boxes[obj_mask][:, 2:4],
                             torch.sqrt(target_boxes[obj_mask][:, 2:4]),
                             reduction="sum")
        class_loss = F.mse_loss(pred_cls[sig_mask],
                                target_cls[sig_mask],
                                reduction="sum")
        loss = dict(conf_loss=(obj_loss + self.lambda_noobj * noobj_loss) / batch_size,
                    reg_loss=(self.lambda_coord * xy_loss +
                              self.lambda_coord * wh_loss) / batch_size,
                    cls_loss=class_loss / batch_size)
        return loss
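A minimal smoke test of the loss, assuming a hand-built target with a single object in cell (2, 3) and a random tensor standing in for the network output; in training the three terms are summed before backprop:

# hand-built ground-truth grid for a batch of 2
target = torch.zeros(2, 7, 7, 30)
target[:, 2, 3, :5]   = torch.tensor([0.5, 0.625, 0.5, 0.25, 1.0])  # box 1: offsets, w, h, conf
target[:, 2, 3, 5:10] = torch.tensor([0.5, 0.625, 0.5, 0.25, 1.0])  # box 2: same ground truth
target[:, 2, 3, 17]   = 1.0                                          # one-hot class 7

pred = torch.rand(2, 7, 7, 30, requires_grad=True)   # stand-in for the network output
criterion = yoloLoss(num_class=20)
loss = criterion(pred, target)
print({k: round(v.item(), 3) for k, v in loss.items()})
(loss["conf_loss"] + loss["reg_loss"] + loss["cls_loss"]).backward()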