Over the previous sessions we have walked through all of the core code of Faster R-CNN. What comes next is understanding how the various parameters in trainval_net.py, test_net.py, and demo.py are configured, so that we can make targeted changes to adapt the code to our own dataset.
Most of these notes are based on: https://blog.csdn.net/weixin_43872578/article/details/86607742
1. Importing the required libraries
```python
# --------------------------------------------------------
# Tensorflow Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Jiasen Lu, Jianwei Yang, based on code from Ross Girshick
# --------------------------------------------------------

# Python's __future__ module imports features of the next major version
# into the current one
from __future__ import absolute_import
# import true division: without this feature, "/" performs truncating
# division; with it, "/" performs true (floating-point) division
from __future__ import division
# even on Python 2.x, print must be called with parentheses as in Python 3.x
from __future__ import print_function

import _init_paths
import os   # run system commands through the os module
import sys  # sys provides a set of very practical functions and variables
# numpy handles the image data (multi-dimensional arrays); in particular its
# broadcasting lets arrays of different shapes be combined (+, -, *, /, ...)
import numpy as np
import argparse  # wraps selectable command-line options for the script
import pprint    # pretty-prints any Python data structure
import pdb       # the Python debugger
import time
import cv2
import torch
from torch.autograd import Variable  # autograd: Variable wraps a tensor
import torch.nn as nn
import torch.optim as optim
# torchvision makes loading and preprocessing common datasets easy
import torchvision.transforms as transforms
import torchvision.datasets as dset
# image I/O from scipy.misc: imread() returns a numpy.ndarray,
# i.e. numpy's multi-dimensional array object
from scipy.misc import imread

# the wrappers below were all covered in the earlier notes
from roi_data_layer.roidb import combined_roidb
from roi_data_layer.roibatchLoader import roibatchLoader
# essentially all configuration used while demo.py runs lives in config.py;
# these values are used throughout the code that follows
from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir
from model.rpn.bbox_transform import clip_boxes
# from model.nms.nms_wrapper import nms
from model.roi_layers import nms
from model.rpn.bbox_transform import bbox_transform_inv
from model.utils.net_utils import save_net, load_net, vis_detections
from model.utils.blob import im_list_to_blob
from model.faster_rcnn.vgg16 import vgg16
from model.faster_rcnn.resnet import resnet

try:
    xrange          # Python 2
except NameError:
    xrange = range  # Python 3
```
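One caveat about the imports: `from scipy.misc import imread` only works with old SciPy releases, since `imread` was deprecated in SciPy 1.0 and removed in 1.2. If your environment has a newer SciPy, a common workaround (an assumption about your setup, not part of the original code) is to fall back to imageio, whose `imread` also returns a numpy array:

```python
# Sketch of a fallback for newer SciPy versions, where scipy.misc.imread
# no longer exists; imageio.imread likewise returns a numpy ndarray.
try:
    from scipy.misc import imread
except ImportError:
    from imageio import imread
```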
2. Parsing arguments: parse_args()
```python
def parse_args():
    """
    Parse input arguments
    """
    parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
    parser.add_argument('--dataset', dest='dataset',
                        help='training dataset',
                        default='pascal_voc', type=str)
    # the name of the dataset you run on, pascal_voc by default
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file',
                        default='cfgs/vgg16.yml', type=str)
    # config file, located under faster_rcnn.pytorch-pytorch-1.0/cfgs
    parser.add_argument('--net', dest='net',
                        help='vgg16, res50, res101, res152',
                        default='res101', type=str)
    # backbone network type
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)
    # override config keys
    parser.add_argument('--load_dir', dest='load_dir',
                        help='directory to load models',
                        default="/srv/share/jyang375/models")
    # model directory
    parser.add_argument('--image_dir', dest='image_dir',
                        help='directory to load images for demo',
                        default="images")
    # image directory
    parser.add_argument('--cuda', dest='cuda',
                        help='whether use CUDA',
                        action='store_true')
    # whether to use the GPU
    parser.add_argument('--mGPUs', dest='mGPUs',
                        help='whether use multiple GPUs',
                        action='store_true')
    # whether to use multiple GPUs
    parser.add_argument('--cag', dest='class_agnostic',
                        help='whether perform class_agnostic bbox regression',
                        action='store_true')
    # class-agnostic mode regresses only 2 kinds of bounding box, foreground
    # and background; combining each box's per-class scores from the
    # classification branch with a detection threshold still yields
    # detections for every class in the image
    parser.add_argument('--parallel_type', dest='parallel_type',
                        help='which part of model to parallel, 0: all, 1: model before roi pooling',
                        default=0, type=int)
    # which part of the model to parallelize
    # The three check* arguments below name the trained detection model:
    # checksession=1, checkepoch=20, checkpoint=10021 correspond to the
    # model "faster_rcnn_1_20_10021"; only with matching values can that
    # model be loaded.
    parser.add_argument('--checksession', dest='checksession',
                        help='checksession to load model',
                        default=1, type=int)
    parser.add_argument('--checkepoch', dest='checkepoch',
                        help='checkepoch to load network',
                        default=1, type=int)
    parser.add_argument('--checkpoint', dest='checkpoint',
                        help='checkpoint to load network',
                        default=10021, type=int)
    parser.add_argument('--bs', dest='batch_size',
                        help='batch_size',
                        default=1, type=int)
    # batch size
    parser.add_argument('--vis', dest='vis',
                        help='visualization mode',
                        action='store_true')
    # visualization mode
    parser.add_argument('--webcam_num', dest='webcam_num',
                        help='webcam ID number',
                        default=-1, type=int)
    # whether to use a webcam (-1 means no)

    # parse_args() assigns values to the arguments defined by add_argument()
    # above and returns the resulting namespace
    args = parser.parse_args()
    return args

lr = cfg.TRAIN.LEARNING_RATE           # learning rate
momentum = cfg.TRAIN.MOMENTUM          # momentum
weight_decay = cfg.TRAIN.WEIGHT_DECAY  # weight decay
```
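To see how these definitions behave, here is a minimal, self-contained sketch that rebuilds a few of the arguments above and feeds them a demo.py-style command line (the flag values are illustrative only); it shows how the three check* numbers select the checkpoint file name:

```python
import argparse

# A cut-down copy of the parser above, for illustration only.
parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
parser.add_argument('--net', dest='net', default='res101', type=str)
parser.add_argument('--checksession', dest='checksession', default=1, type=int)
parser.add_argument('--checkepoch', dest='checkepoch', default=1, type=int)
parser.add_argument('--checkpoint', dest='checkpoint', default=10021, type=int)
parser.add_argument('--cuda', dest='cuda', action='store_true')

# Simulate: python demo.py --net res101 --checksession 1 --checkepoch 20 \
#                          --checkpoint 10021 --cuda
args = parser.parse_args(['--net', 'res101', '--checksession', '1',
                          '--checkepoch', '20', '--checkpoint', '10021',
                          '--cuda'])
print(args.cuda)  # True (store_true flags default to False)
print('faster_rcnn_{}_{}_{}.pth'.format(
    args.checksession, args.checkepoch, args.checkpoint))
# -> faster_rcnn_1_20_10021.pth, the checkpoint demo.py will try to load
```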
3. The _get_image_blob(im) function
```python
def _get_image_blob(im):
    # this function reads an image, rescales it, and stores the result
    # as a matrix (blob)
    """Converts an image into a network input.
    Arguments:
      im (ndarray): a color image in BGR order
    Returns:
      blob (ndarray): a data blob holding an image pyramid
      im_scale_factors (list): list of image scales (relative to im) used
        in the image pyramid
    """
    # numpy astype: convert the dtype of the array
    im_orig = im.astype(np.float32, copy=True)
    # PIXEL_MEANS is the mean over all R-channel pixels of every training
    # image, and likewise for the G and B channels
    im_orig -= cfg.PIXEL_MEANS

    im_shape = im_orig.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])

    processed_ims = []
    im_scale_factors = []

    for target_size in cfg.TEST.SCALES:  # target_size = 600
        # iterate over the values in the tuple/list cfg.TEST.SCALES
        im_scale = float(target_size) / float(im_size_min)
        # test scale divided by the image's shorter side (min of width/height)
        # Prevent the biggest axis from being more than MAX_SIZE
        # (np.round rounds to the nearest integer)
        if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
            im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
        # resize im_orig
        im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
                        interpolation=cv2.INTER_LINEAR)
        # record the scale factor
        im_scale_factors.append(im_scale)
        # record the resized image
        processed_ims.append(im)

    # Create a blob to hold the input images
    # (im_list_to_blob comes from model.utils.blob)
    blob = im_list_to_blob(processed_ims)

    return blob, np.array(im_scale_factors)
```
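The scaling rule is: scale the shorter side to target_size, unless that would push the longer side past MAX_SIZE, in which case scale the longer side to MAX_SIZE instead. As a sanity check, the arithmetic can be reproduced standalone (a sketch assuming the repo's usual defaults, TEST.SCALES = (600,) and TEST.MAX_SIZE = 1000):

```python
import numpy as np

# Standalone sketch of _get_image_blob's scale computation, assuming the
# default config values TEST.SCALES = (600,) and TEST.MAX_SIZE = 1000.
def compute_scale(h, w, target_size=600, max_size=1000):
    im_size_min, im_size_max = min(h, w), max(h, w)
    im_scale = float(target_size) / float(im_size_min)
    # cap the longer side at max_size
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    return im_scale

print(compute_scale(375, 500))   # 1.6 -> the image becomes 600 x 800
print(compute_scale(375, 1250))  # 0.8 -> the longer side is capped at 1000
```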
4. if __name__ == '__main__':
```python
if __name__ == '__main__':

    # the function defined above
    args = parse_args()

    print('Called with args:')
    print(args)

    if args.cfg_file is not None:  # config file
        # from model.utils.config:
        # load a config file and merge it into the default options
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        # from model.utils.config:
        # set config keys via list (e.g., from command line)
        cfg_from_list(args.set_cfgs)

    # use the GPU implementation of non-maximum suppression
    # depending on whether --cuda was given
    cfg.USE_GPU_NMS = args.cuda

    print('Using config:')
    pprint.pprint(cfg)

    # Seed the random number generator: with the same seed, every run
    # generates the same random numbers; without a seed, every run differs
    np.random.seed(cfg.RNG_SEED)

    # train set
    # -- Note: Use validation set and disable the flipped to enable faster loading.

    input_dir = args.load_dir + "/" + args.net + "/" + args.dataset
    if not os.path.exists(input_dir):
        # Python raises exceptions automatically on errors; raise triggers
        # one explicitly, and nothing after the raise statement executes
        raise Exception('There is no input directory for loading network from ' + input_dir)
    load_name = os.path.join(input_dir,
        'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint))
    # the three check* arguments define the trained model's name,
    # e.g. faster_rcnn_1_20_10021

    # PASCAL classes: 1 background class + 20 object classes.
    # Both array and asarray convert structured data to an ndarray; the main
    # difference is that when the source is already an ndarray, array still
    # copies it into new memory, while asarray does not.
    pascal_classes = np.asarray(['__background__',
                                 'aeroplane', 'bicycle', 'bird', 'boat',
                                 'bottle', 'bus', 'car', 'cat', 'chair',
                                 'cow', 'diningtable', 'dog', 'horse',
                                 'motorbike', 'person', 'pottedplant',
                                 'sheep', 'sofa', 'train', 'tvmonitor'])

    # initilize the network here.
    # class-agnostic mode regresses only 2 kinds of bounding box:
    # foreground and background
    if args.net == 'vgg16':
        fasterRCNN = vgg16(pascal_classes, pretrained=False, class_agnostic=args.class_agnostic)
    elif args.net == 'res101':
        fasterRCNN = resnet(pascal_classes, 101, pretrained=False, class_agnostic=args.class_agnostic)
    elif args.net == 'res50':
        fasterRCNN = resnet(pascal_classes, 50, pretrained=False, class_agnostic=args.class_agnostic)
    elif args.net == 'res152':
        fasterRCNN = resnet(pascal_classes, 152, pretrained=False, class_agnostic=args.class_agnostic)
    else:
        print("network is not defined")
        # execution pauses at pdb.set_trace() and you get the (Pdb) prompt
        pdb.set_trace()

    fasterRCNN.create_architecture()
    # model.faster_rcnn.faster_rcnn.py: builds the model, initializes weights

    print("load checkpoint %s" % (load_name))  # model path
    if args.cuda > 0:
        checkpoint = torch.load(load_name)
    else:
        checkpoint = torch.load(load_name, map_location=(lambda storage, loc: storage))
    fasterRCNN.load_state_dict(checkpoint['model'])  # restore the model
    if 'pooling_mode' in checkpoint.keys():
        cfg.POOLING_MODE = checkpoint['pooling_mode']  # pooling mode

    print('load model successfully!')

    # pdb.set_trace()

    print("load checkpoint %s" % (load_name))

    # initilize the tensor holder here.
    # create some one-element tensors
    im_data = torch.FloatTensor(1)
    im_info = torch.FloatTensor(1)
    num_boxes = torch.LongTensor(1)
    gt_boxes = torch.FloatTensor(1)

    # ship to cuda
    if args.cuda > 0:
        # if using the GPU, move the tensors onto it
        im_data = im_data.cuda()
        im_info = im_info.cuda()
        num_boxes = num_boxes.cuda()
        gt_boxes = gt_boxes.cuda()

    # make variable
    # Variable's volatile attribute defaults to False; if a variable's
    # volatile is set to True, no gradients are computed through that node;
    # volatile takes precedence over requires_grad
    im_data = Variable(im_data, volatile=True)
    im_info = Variable(im_info, volatile=True)
    num_boxes = Variable(num_boxes, volatile=True)
    gt_boxes = Variable(gt_boxes, volatile=True)

    if args.cuda > 0:
        cfg.CUDA = True

    if args.cuda > 0:
        fasterRCNN.cuda()

    # model.eval() puts the model in test mode: dropout and batch
    # normalization behave differently during training and testing, so
    # PyTorch freezes BN and dropout to use the trained statistics rather
    # than batch averages
    fasterRCNN.eval()

    start = time.time()  # time() returns the current time
    max_per_image = 100
    thresh = 0.05
    vis = True

    webcam_num = args.webcam_num
    # Set up webcam or get image directories
    if webcam_num >= 0:
        # decide whether to capture video from the computer's camera:
        # cap = cv2.VideoCapture(0) opens the built-in laptop webcam;
        # cap = cv2.VideoCapture('D:\output.avi') would open a video file
        cap = cv2.VideoCapture(webcam_num)
        num_images = 0
    else:
        # otherwise read the images under the image directory;
        # os.listdir() returns the names of the entries in the given
        # folder as a list, in alphabetical order
        imglist = os.listdir(args.image_dir)
        num_images = len(imglist)  # how many images there are

    print('Loaded Photo: {} images.'.format(num_images))

    while (num_images >= 0):
        total_tic = time.time()  # current time
        if webcam_num == -1:  # not using the webcam
            num_images -= 1

        # Get image from the webcam
        if webcam_num >= 0:
            if not cap.isOpened():  # the webcam failed to open
                raise RuntimeError("Webcam could not open. Please check connection.")
            # ret is True/False: whether a frame was captured;
            # frame is the captured frame itself
            ret, frame = cap.read()
            im_in = np.array(frame)
        # Load the demo image
        else:
            # image path
            im_file = os.path.join(args.image_dir, imglist[num_images])
            # im = cv2.imread(im_file)
            im_in = np.array(imread(im_file))  # the image, as a numpy array
        if len(im_in.shape) == 2:
            # np.newaxis inserts a new axis of length 1 at its position, e.g.:
            #   x1 = np.array([1, 2, 3, 4, 5])  # shape (5,)
            #   x1[:, np.newaxis]               # shape (5, 1)
            #   x1[np.newaxis, :]               # shape (1, 5)
            im_in = im_in[:, :, np.newaxis]
            im_in = np.concatenate((im_in, im_in, im_in), axis=2)
            # array concatenation: with axis=k, all dimensions except
            # shape[k] must match, and k must be smaller than the number
            # of dimensions of the arrays
        # (this can be thought of as batch processing)
        # rgb -> bgr
        im = im_in[:, :, ::-1]

        blobs, im_scales = _get_image_blob(im)
        # the image transform defined above; returns the processed blob
        # and the scale factors
        assert len(im_scales) == 1, "Only single-image batch implemented"
        im_blob = blobs  # processed image; im_info holds height, width, scale
        im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)

        # numpy -> Tensor
        im_data_pt = torch.from_numpy(im_blob)
        # permute reorders the tensor's dimensions
        im_data_pt = im_data_pt.permute(0, 3, 1, 2)
        # the image info also becomes a tensor
        im_info_pt = torch.from_numpy(im_info_np)

        with torch.no_grad():
            # resize the tensors defined above to the given sizes; if the
            # element count exceeds the current storage, the underlying
            # storage grows to match the new number of elements
            im_data.resize_(im_data_pt.size()).copy_(im_data_pt)
            im_info.resize_(im_info_pt.size()).copy_(im_info_pt)
            gt_boxes.resize_(1, 1, 5).zero_()
            num_boxes.resize_(1).zero_()

        # pdb.set_trace()
        det_tic = time.time()  # current time

        rois, cls_prob, bbox_pred, \
        rpn_loss_cls, rpn_loss_box, \
        RCNN_loss_cls, RCNN_loss_bbox, \
        rois_label = fasterRCNN(im_data, im_info, gt_boxes, num_boxes)
        # rois: holds R regions of interest, each a 5-tuple
        #   (n, x1, y1, x2, y2) giving an image batch index n and a
        #   rectangle (x1, y1, x2, y2)
        # cls_prob: class probabilities from the softmax
        # bbox_pred: regression offsets
        # rpn_loss_cls: classification loss, the softmax loss over the
        #   labels and the cls layer's 18 outputs (reshaped in between)
        # rpn_loss_box: the box regression loss

        scores = cls_prob.data        # class probabilities
        boxes = rois.data[:, :, 1:5]  # the rois' coordinates

        if cfg.TEST.BBOX_REG:  # whether bounding-box regressors are used
            # Apply bounding-box regression deltas
            box_deltas = bbox_pred.data  # offsets
            if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
                # Optionally normalize targets by a precomputed mean and stdev
                if args.class_agnostic:
                    if args.cuda > 0:
                        # box_deltas.view reshapes the tensor
                        box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                   + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                    else:
                        box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                   + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                    box_deltas = box_deltas.view(1, -1, 4)
                else:
                    if args.cuda > 0:
                        box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \
                                   + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda()
                    else:
                        box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) \
                                   + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
                    box_deltas = box_deltas.view(1, -1, 4 * len(pascal_classes))

            # model.rpn.bbox_transform: compute proposals from the anchors
            # and offsets; returns top-left and bottom-right corners
            # [x1, y1, x2, y2]
            pred_boxes = bbox_transform_inv(boxes, box_deltas, 1)
            # model.rpn.bbox_transform: clip boxes whose updated coordinates
            # fall outside the image boundary so they stay inside the image
            pred_boxes = clip_boxes(pred_boxes, im_info.data, 1)
        else:
            # Simply repeat the boxes, once for each class
            # numpy's tile() replicates the matrix; here horizontally
            pred_boxes = np.tile(boxes, (1, scores.shape[1]))

        pred_boxes /= im_scales[0]

        # squeeze removes singleton dimensions, i.e. drops every axis of
        # size 1 from the shape
        scores = scores.squeeze()
        pred_boxes = pred_boxes.squeeze()
        det_toc = time.time()
        detect_time = det_toc - det_tic  # detection time
        misc_tic = time.time()
        if vis:
            im2show = np.copy(im)
        for j in xrange(1, len(pascal_classes)):  # every class
            # torch.nonzero returns a tensor of the indices of the nonzero
            # elements of the input, one row per nonzero element; if input
            # has n dimensions, the output has shape z x n, where z is the
            # number of nonzero elements in input
            inds = torch.nonzero(scores[:, j] > thresh).view(-1)
            # the -1 in view means that dimension is inferred from the rest
            # if there is det
            # torch.numel() returns the number of elements in a tensor,
            # i.e. the element count of the matrix
            if inds.numel() > 0:
                cls_scores = scores[:, j][inds]
                # torch.sort(input, dim=None, descending=False, out=None):
                # True means descending order (the default is ascending);
                # here we sort along dim 0, descending
                _, order = torch.sort(cls_scores, 0, True)
                if args.class_agnostic:  # two classes
                    cls_boxes = pred_boxes[inds, :]
                else:
                    cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4]

                # concatenate along dim 1; torch.unsqueeze() adds a dimension
                cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)
                # cls_dets = torch.cat((cls_boxes, cls_scores), 1)
                cls_dets = cls_dets[order]
                # keep = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS)
                keep = nms(cls_boxes[order, :], cls_scores[order], cfg.TEST.NMS)
                cls_dets = cls_dets[keep.view(-1).long()]
                if vis:
                    im2show = vis_detections(im2show, pascal_classes[j], cls_dets.cpu().numpy(), 0.5)

        misc_toc = time.time()
        nms_time = misc_toc - misc_tic

        if webcam_num == -1:
            sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r' \
                             .format(num_images + 1, len(imglist), detect_time, nms_time))
            sys.stdout.flush()

        if vis and webcam_num == -1:
            # cv2.imshow('test', im2show)
            # cv2.waitKey(0)
            result_path = os.path.join(args.image_dir, imglist[num_images][:-4] + "_det.jpg")
            cv2.imwrite(result_path, im2show)
        else:
            im2showRGB = cv2.cvtColor(im2show, cv2.COLOR_BGR2RGB)
            cv2.imshow("frame", im2showRGB)
            total_toc = time.time()
            total_time = total_toc - total_tic
            frame_rate = 1 / total_time
            print('Frame rate:', frame_rate)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    if webcam_num >= 0:
        cap.release()
        cv2.destroyAllWindows()
```
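To see the per-class post-processing (threshold, sort, NMS) in isolation, here is a minimal sketch on dummy tensors. It uses torchvision.ops.nms as a stand-in for the repo's model.roi_layers nms (an assumption on my part: both take (boxes, scores, iou_threshold) and return the indices of the kept boxes), and 0.3 mirrors the usual cfg.TEST.NMS default:

```python
import torch
from torchvision.ops import nms  # stand-in for model.roi_layers.nms

thresh, nms_iou = 0.05, 0.3  # score threshold and NMS IoU threshold

# dummy detections for one class: two heavily overlapping boxes plus one
# box whose score falls below the threshold
boxes = torch.tensor([[10., 10., 100., 100.],
                      [12., 12., 102., 102.],
                      [200., 200., 300., 300.]])
scores = torch.tensor([0.9, 0.8, 0.04])

inds = torch.nonzero(scores > thresh).view(-1)  # keeps indices 0 and 1
cls_scores = scores[inds]
cls_boxes = boxes[inds]
_, order = torch.sort(cls_scores, 0, True)      # sort descending by score
cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1)[order]
keep = nms(cls_boxes[order], cls_scores[order], nms_iou)
print(cls_dets[keep.view(-1).long()])
# only [10, 10, 100, 100, 0.9] survives: the 0.8 box is suppressed by NMS
```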
5. Running demo.py and checking the detection results
There is a huge pitfall here!!!
Do not run demo.py straight away, because the repository does NOT!!! repeat, does NOT!!! include the model!!!
The model in question is not the VGG or ResNet transfer model pretrained on the ImageNet classification dataset, but the model fine-tuned on an object detection dataset such as VOC or COCO. For VOC2007 the trained model is roughly 1 GB, so the code's author did not upload it to GitHub. Either train it yourself or get it from someone who already has; training it yourself by running trainval_net.py is recommended.
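Before running demo.py, it may save you some head-scratching to verify that the checkpoint sits where the code expects it. A small pre-flight sketch (the directory and check* values are just the illustrative defaults used throughout this walkthrough; adjust them to whatever you pass on the command line):

```python
import os

# Hypothetical pre-flight check: demo.py looks for the fine-tuned detection
# model (NOT the ImageNet-pretrained backbone) at
# <load_dir>/<net>/<dataset>/faster_rcnn_<session>_<epoch>_<point>.pth
load_dir, net, dataset = 'models', 'res101', 'pascal_voc'
session, epoch, point = 1, 20, 10021

path = os.path.join(load_dir, net, dataset,
                    'faster_rcnn_{}_{}_{}.pth'.format(session, epoch, point))
if not os.path.exists(path):
    raise SystemExit('No detection model found. Train one with '
                     'trainval_net.py or place a trained checkpoint at: ' + path)
```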
