1. Dataset processing
Convert each label into (x center offset, y center offset, width, height), all normalized to [0, 1]. The same arithmetic appears in __getitem__ below.
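As a worked example (a hypothetical annotated box in pixel corner coordinates, assuming a 448x448 input as used at the end of this section), the conversion to normalized center form looks like this:

import numpy as np

# one hypothetical box: x1, y1, x2, y2, class_id (pixel coordinates in a 448x448 image)
box = np.array([[112., 112., 336., 224., 7.]])

# normalize corner coordinates by the input width/height
box[:, [0, 2]] /= 448
box[:, [1, 3]] /= 448
# (x1, y1, x2, y2) -> (cx, cy, w, h)
box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2

print(box)   # [[0.5   0.375 0.5   0.25  7.   ]]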
with open(opt.train_annotation_path) as f:
    train_lines = f.readlines()
with open(opt.val_annotation_path) as f:
    val_lines = f.readlines()
num_train = len(train_lines)
num_val = len(val_lines)

train_dataset = YoloDataset(train_lines, opt.input_shape, train=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=opt.batch_size,
                              num_workers=4, pin_memory=True, drop_last=True)
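As a quick sanity check, one batch from this loader should yield an image tensor of shape (batch_size, 3, H, W) and a target tensor of shape (batch_size, 7, 7, 30); the default collate function is enough here because every sample has the same fixed shapes. A minimal sketch (opt.input_shape and opt.batch_size are placeholders from the training config above):

for images, targets in train_dataloader:
    # images: float tensor in [0, 1], shape (batch_size, 3, input_shape[0], input_shape[1])
    # targets: per-cell labels, shape (batch_size, 7, 7, 30) -> 2 boxes * 5 values + 20 classes
    print(images.shape, targets.shape)
    break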
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset
class YoloDataset(Dataset):
    def __init__(self, annotation_lines, input_shape, train):
        super(YoloDataset, self).__init__()
        self.annotation_lines = annotation_lines
        self.input_shape = input_shape
        self.length = len(self.annotation_lines)
        self.train = train
        self.build_target = build_target()

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        index = index % self.length
        #---------------------------------------------------#
        #   Data augmentation
        #---------------------------------------------------#
        image, boxes = self.get_random_data(self.annotation_lines[index], self.input_shape[0:2], random=self.train)
        image = torch.from_numpy(image.astype(np.float32)).permute(2, 0, 1) / 255.
        boxes = np.array(boxes, dtype=np.float32).reshape(-1, 5)  # keeps shape (0, 5) when there is no box
        if len(boxes) != 0:
            # normalize corner coordinates to [0, 1]
            boxes[:, [0, 2]] = boxes[:, [0, 2]] / self.input_shape[1]
            boxes[:, [1, 3]] = boxes[:, [1, 3]] / self.input_shape[0]
            # (x1, y1, x2, y2) -> (cx, cy, w, h)
            boxes[:, 2:4] = boxes[:, 2:4] - boxes[:, 0:2]
            boxes[:, 0:2] = boxes[:, 0:2] + boxes[:, 2:4] / 2
        image, targets = self.build_target(image, boxes[:, :4], boxes[:, 4])
        return image, targets
    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        line = annotation_line.split()
        image = Image.open(line[0])
        image = image.convert('RGB')
        iw, ih = image.size
        h, w = input_shape
        #------------------------------#
        #   Read the bounding boxes
        #------------------------------#
        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

        if not random:
            scale = min(w/iw, h/ih)
            nw = int(iw*scale)
            nh = int(ih*scale)
            dx = (w-nw)//2
            dy = (h-nh)//2
            #---------------------------------#
            #   Letterbox: pad with gray bars
            #---------------------------------#
            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w, h), (128, 128, 128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)
            #---------------------------------#
            #   Adjust the boxes accordingly
            #---------------------------------#
            if len(box) > 0:
                np.random.shuffle(box)
                box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
                box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
                box[:, 0:2][box[:, 0:2] < 0] = 0
                box[:, 2][box[:, 2] > w] = w
                box[:, 3][box[:, 3] > h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard degenerate boxes
            if len(box) == 0:
                print(line[0])
            return image_data, box

        #------------------------------------------#
        #   Randomly rescale the image and distort its aspect ratio
        #------------------------------------------#
        new_ar = iw/ih * self.rand(1-jitter, 1+jitter) / self.rand(1-jitter, 1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)
        #------------------------------------------#
        #   Letterbox: pad with gray bars
        #------------------------------------------#
        dx = int(self.rand(0, w-nw))
        dy = int(self.rand(0, h-nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image
        #------------------------------------------#
        #   Random horizontal flip
        #------------------------------------------#
        flip = self.rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)
        image_data = np.array(image, np.uint8)
        #---------------------------------#
        #   HSV color-space augmentation
        #---------------------------------#
        r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype = image_data.dtype
        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
        #---------------------------------#
        #   Adjust the boxes accordingly
        #---------------------------------#
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx
            box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
            if flip:
                box[:, [0, 2]] = w - box[:, [2, 0]]
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard degenerate boxes
        return image_data, box

    def rand(self, a=0, b=1):
        return np.random.rand()*(b-a) + a
def build_target():
    return TargetTransform()


class TargetTransform(object):
    def __init__(self, target_shape=(7, 7, 30), class_nums=20, cell_nums=7):
        self.target_shape = target_shape
        self.class_nums = class_nums
        self.cell_nums = cell_nums

    def __call__(self, image, boxes, labels):
        """
        labels = [1, 2, 3, 4]
        boxes  = [0.2, 0.3, 0.4, 0.8]  # normalized cx, cy, w, h
        returns a target of shape [self.S, self.S, self.B*5 + self.C]
        """
        labels = np.array(labels, "int8")
        np_target = np.zeros(self.target_shape)
        np_class = np.zeros((len(boxes), self.class_nums))
        # one-hot encode the class labels
        for i in range(len(labels)):
            np_class[i][labels[i]] = 1
        step = 1.0 / self.cell_nums
        for i in range(len(boxes)):
            box = boxes[i]
            label = np_class[i]
            cx, cy, w, h = box
            # which grid cell the box center falls into
            bx = int(cx // (step + 1e-5))
            by = int(cy // (step + 1e-5))
            # offset of the center from the cell's top-left corner, normalized by the cell size
            cx = (cx % step) / step
            cy = (cy % step) / step
            box = [cx, cy, w, h]
            # both predictor slots share the same ground-truth box and confidence 1
            np_target[by][bx][:4] = box
            np_target[by][bx][4] = 1
            np_target[by][bx][5:9] = box
            np_target[by][bx][9] = 1
            np_target[by][bx][10:] = label
        return image, np_target
if __name__ == "__main__":
    train_path = "/home/users/user1/Documents/AI_Files/gitfies/pytorch-YOLO-v1/2007_train.txt"
    with open(train_path) as f:
        train_lines = f.readlines()
    train_dataset = YoloDataset(train_lines, [448, 448], train=True)
    image, target = train_dataset[3054]
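To make the target encoding concrete, here is a small check of what the transform stores for one hypothetical normalized box (cx=0.5, cy=0.375, w=0.5, h=0.25, class 7). With S=7 the cell size is 1/7 ≈ 0.143, so the center falls into cell bx=3, by=2, and the stored offsets are (0.5 % step)/step = 0.5 and (0.375 % step)/step = 0.625:

transform = build_target()                      # returns the target transform instance
boxes = np.array([[0.5, 0.375, 0.5, 0.25]])     # hypothetical normalized cx, cy, w, h
labels = np.array([7])
_, grid = transform(None, boxes, labels)

print(grid.shape)                  # (7, 7, 30)
print(grid[2, 3, :5])              # [0.5   0.625 0.5   0.25  1.   ] -> offsets, w, h, confidence
print(np.argmax(grid[2, 3, 10:]))  # 7 -> one-hot class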
2. Loss function
Training drives the network to output (x center offset, y center offset, square root of the width, square root of the height), all normalized.
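For reference, the loss below follows the YOLOv1 paper (S = 7 cells per side, B = 2 boxes per cell, lambda_coord = 5, lambda_noobj = 0.5), with the small modification that the confidence target of the responsible box is set to its IoU with the ground truth rather than to 1. Note the width/height term: the network already predicts sqrt(w) and sqrt(h), so in the code the square root is applied to the target instead of the prediction.

$$
\begin{aligned}
\mathcal{L} ={}& \lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(x_i-\hat{x}_i)^2+(y_i-\hat{y}_i)^2\right] \\
&+\lambda_{coord}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}\left[(\sqrt{w_i}-\sqrt{\hat{w}_i})^2+(\sqrt{h_i}-\sqrt{\hat{h}_i})^2\right] \\
&+\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{obj}(C_i-\hat{C}_i)^2
 +\lambda_{noobj}\sum_{i=0}^{S^2}\sum_{j=0}^{B}\mathbb{1}_{ij}^{noobj}(C_i-\hat{C}_i)^2 \\
&+\sum_{i=0}^{S^2}\mathbb{1}_{i}^{obj}\sum_{c\in classes}\left(p_i(c)-\hat{p}_i(c)\right)^2
\end{aligned}
$$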
import numpy as np
import torch
import torch.nn.functional as F
from torch.nn import *
class yoloLoss(Module):
    def __init__(self, num_class=20):
        super(yoloLoss, self).__init__()
        self.lambda_coord = 5
        self.lambda_noobj = 0.5
        self.S = 7
        self.B = 2
        self.C = num_class
        self.step = 1.0 / 7

    def compute_iou(self, box1, box2, index):
        box1 = torch.clone(box1)
        box2 = torch.clone(box2)
        # convert cell-relative (cx, cy, w, h) to image-relative coordinates
        box1 = self.convert_box(box1, index)
        box2 = self.convert_box(box2, index)
        # the prediction holds sqrt(w) and sqrt(h), so square them back
        box1[:, 2] = torch.pow(box1[:, 2], 2)
        box1[:, 3] = torch.pow(box1[:, 3], 2)
        x1, y1, w1, h1 = box1[:, 0] - box1[:, 2] / 2, box1[:, 1] - box1[:, 3] / 2, box1[:, 2], box1[:, 3]
        x2, y2, w2, h2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 1] - box2[:, 3] / 2, box2[:, 2], box2[:, 3]
        inter_w = (w1 + w2) - (torch.max(x1 + w1, x2 + w2) - torch.min(x1, x2))
        inter_h = (h1 + h2) - (torch.max(y1 + h1, y2 + h2) - torch.min(y1, y2))
        inter_h = torch.clamp(inter_h, 0)
        inter_w = torch.clamp(inter_w, 0)
        inter = inter_w * inter_h
        union = w1 * h1 + w2 * h2 - inter
        return inter / union

    def convert_box(self, box, index):
        i, j = index
        box[:, 0], box[:, 1] = [(box[:, 0] + i) * self.step, (box[:, 1] + j) * self.step]
        box = torch.clamp(box, 0)
        return box
    def forward(self, pred, target):
        batch_size = pred.size(0)
        # split the first 10 channels into 2 boxes of 5 values (cx, cy, w, h, conf)
        target_boxes = target[:, :, :, :10].contiguous().reshape((-1, 7, 7, 2, 5))
        pred_boxes = pred[:, :, :, :10].contiguous().reshape((-1, 7, 7, 2, 5))
        target_cls = target[:, :, :, 10:]
        pred_cls = pred[:, :, :, 10:]
        # cells that contain an object
        obj_mask = (target_boxes[..., 4] > 0).byte()
        sig_mask = obj_mask[..., 1].bool()
        index = torch.where(sig_mask == True)
        # img_i is the image index in the batch; y and x are the grid-cell coordinates
        for img_i, y, x in zip(*index):
            img_i, y, x = img_i.item(), y.item(), x.item()
            pbox = pred_boxes[img_i, y, x]
            target_box = target_boxes[img_i, y, x]
            ious = self.compute_iou(pbox[:, :4], target_box[:, :4], [x, y])
            iou, max_i = ious.max(0)
            # the predictor with the larger IoU is responsible: its confidence target becomes that IoU
            target_boxes[img_i, y, x, max_i, 4] = iou.item()
            # the other predictor's confidence target is set to 0
            target_boxes[img_i, y, x, 1 - max_i, 4] = 0
            obj_mask[img_i, y, x, 1 - max_i] = 0
        obj_mask = obj_mask.bool()
        noobj_mask = ~obj_mask
        noobj_loss = F.mse_loss(pred_boxes[noobj_mask][:, 4],
                                target_boxes[noobj_mask][:, 4],
                                reduction="sum")
        obj_loss = F.mse_loss(pred_boxes[obj_mask][:, 4],
                              target_boxes[obj_mask][:, 4],
                              reduction="sum")
        xy_loss = F.mse_loss(pred_boxes[obj_mask][:, :2],
                             target_boxes[obj_mask][:, :2],
                             reduction="sum")
        wh_loss = F.mse_loss(pred_boxes[obj_mask][:, 2:4],
                             torch.sqrt(target_boxes[obj_mask][:, 2:4]),
                             reduction="sum")
        class_loss = F.mse_loss(pred_cls[sig_mask],
                                target_cls[sig_mask],
                                reduction="sum")
        loss = dict(conf_loss=(obj_loss + self.lambda_noobj * noobj_loss) / batch_size,
                    reg_loss=(self.lambda_coord * xy_loss +
                              self.lambda_coord * wh_loss) / batch_size,
                    cls_loss=class_loss / batch_size)
        return loss
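A minimal smoke test of the loss, assuming a hand-built target with a single object in cell (2, 3) and a random tensor standing in for the network output; in training the three terms are summed before backprop:

# hand-built ground-truth grid for a batch of 2
target = torch.zeros(2, 7, 7, 30)
target[:, 2, 3, :5]   = torch.tensor([0.5, 0.625, 0.5, 0.25, 1.0])  # box 1: offsets, w, h, conf
target[:, 2, 3, 5:10] = torch.tensor([0.5, 0.625, 0.5, 0.25, 1.0])  # box 2: same ground truth
target[:, 2, 3, 17]   = 1.0                                          # one-hot class 7

pred = torch.rand(2, 7, 7, 30, requires_grad=True)   # stand-in for the network output
criterion = yoloLoss(num_class=20)
loss = criterion(pred, target)
print({k: round(v.item(), 3) for k, v in loss.items()})
(loss["conf_loss"] + loss["reg_loss"] + loss["cls_loss"]).backward()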