The common.py script mainly contains four functions: conv_act_layer, legacy_conv_act_layer, multi_layer_feature and multibox_layer. symbol_builder.py calls multi_layer_feature and then multibox_layer, and these two are the important ones: the former builds the feature layers used for multi-scale detection, while the latter builds the classification, regression and anchor-generation layers. So we start by looking at these two functions.
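conv_act_layer and legacy_conv_act_layer are small wrappers and are not reproduced below, but since multi_layer_feature calls conv_act_layer for every extra layer, it helps to know what it roughly does: a Convolution, optionally a BatchNorm, then an Activation. A minimal sketch under that assumption (the exact signature and layer naming may differ from the real common.py):

    import mxnet as mx

    def conv_act_layer(from_layer, name, num_filter, kernel=(1, 1), pad=(0, 0),
                       stride=(1, 1), act_type='relu', use_batchnorm=False):
        """Sketch of the Convolution + (optional) BatchNorm + Activation wrapper."""
        conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, stride=stride,
                                     num_filter=num_filter, name='{}_conv'.format(name))
        if use_batchnorm:
            conv = mx.symbol.BatchNorm(data=conv, name='{}_bn'.format(name))
        return mx.symbol.Activation(data=conv, act_type=act_type,
                                    name='{}_{}'.format(name, act_type))

With that out of the way, the annotated code of the two important functions follows.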

    import mxnet as mx
    import numpy as np

    def multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=128):
        """Wrapper function to extract features from the base network, attaching extra
        layers and SSD-specific layers

        Parameters
        ----------
        from_layers : list of str
            feature extraction layers, use '' to add extra layers
            For example:
            from_layers = ['relu4_3', 'fc7', '', '', '', '']
            which means extract features from relu4_3 and fc7, adding 4 extra layers
            on top of fc7
        num_filters : list of int
            number of filters for extra layers, you can use -1 for extracted features;
            however, if normalization and scaling are applied, the number of filters for
            that layer must be provided.
            For example:
            num_filters = [512, -1, 512, 256, 256, 256]
        strides : list of int
            strides for the appended 3x3 convolutions, -1 can be used for extracted
            feature layers
        pads : list of int
            paddings for the 3x3 convolutions, -1 can be used for extracted layers
        min_filter : int
            minimum number of filters used in the 1x1 convolution

        Returns
        -------
        list of mx.Symbols
        """
        # arguments check
        assert len(from_layers) > 0
        assert isinstance(from_layers[0], str) and len(from_layers[0].strip()) > 0
        assert len(from_layers) == len(num_filters) == len(strides) == len(pads)
        # body is the imported VGG16 symbol; get_internals() returns a grouped symbol that
        # exposes all of its internal layer outputs
        internals = body.get_internals()
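        # For reference, internals.list_outputs() lists every argument and output name of the
        # base symbol (e.g. 'relu4_3_output'), and indexing internals with '<layer>_output'
        # below picks out that layer's output symbol.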
        layers = []
        for k, params in enumerate(zip(from_layers, num_filters, strides, pads)):
            from_layer, num_filter, s, p = params
            # from_layers looks like ['relu4_3', 'relu7', '', '', '', ''], so the trailing empty
            # strings fall through to the else branch
            if from_layer.strip():
                # extract from base network
                # fetch the layer with the name given in from_layer and append it to the layers list
                layer = internals[from_layer.strip() + '_output']
                layers.append(layer)
            else:  # this branch adds new convolution operations on top of the original VGG16 network
                # attach from last feature layer
                assert len(layers) > 0
                assert num_filter > 0
                layer = layers[-1]  # the layer being added takes the previous layer's output as input
                num_1x1 = max(min_filter, num_filter // 2)
                # two convolutions are appended, a 1x1 followed by a 3x3; the 1x1 mainly reduces
                # computation, and the pair is usually treated as a single block (symbol)
                conv_1x1 = conv_act_layer(layer, 'multi_feat_%d_conv_1x1' % (k),
                    num_1x1, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
                conv_3x3 = conv_act_layer(conv_1x1, 'multi_feat_%d_conv_3x3' % (k),
                    num_filter, kernel=(3, 3), pad=(p, p), stride=(s, s), act_type='relu')
                layers.append(conv_3x3)
        # in the end the layers list holds 6 symbols (for a 300x300 VGG16 base model, i.e. the
        # from_layers example above)
        return layers
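Before moving on, here is a hedged usage sketch of multi_layer_feature. The stand-in base network below is not the real VGG16; it only exposes outputs named relu4_3 (38x38) and fc7 (19x19) for a 300x300 input, and the strides/pads for the four extra layers are illustrative values (the from_layers/num_filters lists come from the docstring example above):

    import mxnet as mx

    # Toy stand-in for the VGG16 base (an assumption, not the real network): it only needs to
    # expose 'relu4_3_output' and 'fc7_output' with the usual SSD300 spatial sizes.
    data = mx.symbol.Variable('data')
    x = mx.symbol.Convolution(data=data, num_filter=512, kernel=(3, 3), pad=(1, 1),
                              stride=(8, 8), name='conv4_3')     # 300 -> 38
    x = mx.symbol.Activation(data=x, act_type='relu', name='relu4_3')
    x = mx.symbol.Convolution(data=x, num_filter=1024, kernel=(3, 3), pad=(1, 1),
                              stride=(2, 2), name='conv7')       # 38 -> 19
    body = mx.symbol.Activation(data=x, act_type='relu', name='fc7')

    layers = multi_layer_feature(body,
                                 from_layers=['relu4_3', 'fc7', '', '', '', ''],
                                 num_filters=[512, -1, 512, 256, 256, 256],
                                 strides=[-1, -1, 2, 2, 1, 1],   # illustrative values
                                 pads=[-1, -1, 1, 1, 0, 0])      # illustrative values
    print(len(layers))  # 6 feature symbols, matching the comment above

multibox_layer, shown next, takes exactly such a list of feature symbols and attaches the prediction and anchor-generation layers to each of them.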
    def multibox_layer(from_layers, num_classes, sizes=[.2, .95],
                       ratios=[1], normalization=-1, num_channels=[],
                       clip=False, interm_layer=0, steps=[]):
        """
        the basic aggregation module for SSD detection. Takes in multiple layers,
        generates multiple object detection targets via customized layers

        Parameters:
        ----------
        from_layers : list of mx.symbol
            generate multibox detection from these layers
        num_classes : int
            number of classes excluding background; background is handled
            automatically in this function
        sizes : list or list of list
            [min_size, max_size] for all layers or [[], [], []...] for specific layers
        ratios : list or list of list
            [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers
        normalization : int or list of int
            use this normalization value for all layers or [...] for specific layers,
            -1 indicates no normalization and scaling
        num_channels : list of int
            number of input layer channels, used when normalization is enabled; the
            length of the list should equal the number of normalized layers
        clip : bool
            whether to clip out-of-image boxes
        interm_layer : int
            if > 0, adds an intermediate Convolution layer
        steps : list
            specify steps for each MultiBoxPrior layer; leave empty and it will be
            calculated from the layer dimensions

        Returns:
        ----------
        list of outputs, as [loc_preds, cls_preds, anchor_boxes]
        loc_preds : localization regression prediction
        cls_preds : classification prediction
        anchor_boxes : generated anchor boxes
        """
        assert len(from_layers) > 0, "from_layers must not be empty list"
        assert num_classes > 0, \
            "num_classes {} must be larger than 0".format(num_classes)
        assert len(ratios) > 0, "aspect ratios must not be empty list"
        if not isinstance(ratios[0], list):
            # provided only one ratio list, broadcast to all from_layers
            ratios = [ratios] * len(from_layers)
        assert len(ratios) == len(from_layers), \
            "ratios and from_layers must have same length"
        assert len(sizes) > 0, "sizes must not be empty list"
        if len(sizes) == 2 and not isinstance(sizes[0], list):
            # provided a size range, so compute the sizes for each layer
            assert sizes[0] > 0 and sizes[0] < 1
            assert sizes[1] > 0 and sizes[1] < 1 and sizes[1] > sizes[0]
            tmp = np.linspace(sizes[0], sizes[1], num=(len(from_layers)-1))
            # start_offset is not defined in this excerpt; 0.1 is the value used in the original
            # SSD example (an assumption here), giving the first layer its small anchor scale
            start_offset = 0.1
            min_sizes = [start_offset] + tmp.tolist()
            max_sizes = tmp.tolist() + [tmp[-1]+start_offset]
            sizes = list(zip(min_sizes, max_sizes))  # list() so that len(sizes) also works on Python 3
        assert len(sizes) == len(from_layers), \
            "sizes and from_layers must have same length"
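        # Worked example (assuming 6 from_layers, sizes=[0.2, 0.95] and start_offset=0.1):
        #   np.linspace(0.2, 0.95, num=5) -> [0.2, 0.3875, 0.575, 0.7625, 0.95]
        #   min_sizes = [0.1, 0.2, 0.3875, 0.575, 0.7625, 0.95]
        #   max_sizes = [0.2, 0.3875, 0.575, 0.7625, 0.95, 1.05]
        #   sizes     = [(0.1, 0.2), (0.2, 0.3875), ..., (0.95, 1.05)], one (min, max) pair per layer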
        if not isinstance(normalization, list):
            normalization = [normalization] * len(from_layers)
        assert len(normalization) == len(from_layers)
        assert sum(x > 0 for x in normalization) <= len(num_channels), \
            "must provide number of channels for each normalized layer"
        if steps:
            assert len(steps) == len(from_layers), "provide steps for all layers or leave empty"
        # initialize 3 empty lists that are filled in the loop below
        loc_pred_layers = []
        cls_pred_layers = []
        anchor_layers = []
        num_classes += 1  # always use background as label 0
        # everything above is essentially argument checking; the real work starts here. This big
        # for loop iterates over every feature layer used for detection (for a 300x300 VGG16
        # network, 2 layers come from VGG16 itself and 4 are added on top, so from_layers
        # contains the 6 symbols mentioned earlier)
        for k, from_layer in enumerate(from_layers):
            from_name = from_layer.name
            # normalize
            # normalization is a list; for the 300x300 VGG16 network the default is
            # [20, -1, -1, -1, -1, -1], where -1 means that layer is not normalized. Normalization
            # is implemented here with the L2Normalization symbol plus a learnable scale Variable.
            # With mode="channel", L2Normalization normalizes by the L2 norm of the values in each
            # channel, roughly: for i in 0...N,
            #     out[:, i, :, ..., :] = data[:, i, :, ..., :] / sqrt(sum(data[:, i, :, ..., :] ** 2) + eps)
            # broadcast_mul performs a broadcast multiplication, i.e. an element-wise product of
            # scale and from_layer.
            if normalization[k] > 0:
                from_layer = mx.symbol.L2Normalization(data=from_layer, \
                    mode="channel", name="{}_norm".format(from_name))
                scale = mx.symbol.Variable(name="{}_scale".format(from_name),
                    shape=(1, num_channels.pop(0), 1, 1),
                    init=mx.init.Constant(normalization[k]),
                    attr={'__wd_mult__': '0.1'})
                from_layer = mx.symbol.broadcast_mul(lhs=scale, rhs=from_layer)
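                # e.g. for a conv4_3-like layer with 512 channels, the scale Variable has shape
                # (1, 512, 1, 1) and is initialized to 20; broadcast_mul stretches it to
                # (batch, 512, H, W), so each channel of the normalized map gets its own learned scale.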
            if interm_layer > 0:
                from_layer = mx.symbol.Convolution(data=from_layer, kernel=(3,3), \
                    stride=(1,1), pad=(1,1), num_filter=interm_layer, \
                    name="{}_inter_conv".format(from_name))
                from_layer = mx.symbol.Activation(data=from_layer, act_type="relu", \
                    name="{}_inter_relu".format(from_name))
            # estimate number of anchors per location
            # here I follow the original version in caffe
            # TODO: better way to shape the anchors??
            size = sizes[k]
            assert len(size) > 0, "must provide at least one size"
            size_str = "(" + ",".join([str(x) for x in size]) + ")"
            ratio = ratios[k]
            assert len(ratio) > 0, "must provide at least one ratio"
            ratio_str = "(" + ",".join([str(x) for x in ratio]) + ")"
            # number of anchors per location; note that this is not len(size) * len(ratio)
            num_anchors = len(size) - 1 + len(ratio)
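            # e.g. size=(0.1, 0.141) and ratio=(1, 2, 0.5) give num_anchors = 2 - 1 + 3 = 4:
            # roughly one box per aspect ratio at the first size, plus one ratio-1 box for each
            # additional size.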
            # create location prediction layer
            # this part handles localization, i.e. the regressed box positions. A 3x3 convolution
            # with (num_anchors * 4) filters, attached to from_layer, predicts the coordinates of
            # every anchor; with pad=1 and stride=1 the convolution does not change the feature
            # map size. The new layer is appended to the loc_pred_layers list.
            # each anchor needs 4 coordinate values
            num_loc_pred = num_anchors * 4
            # __lr_mult__ is the factor the base learning rate is multiplied by to obtain this
            # layer's learning rate
            bias = mx.symbol.Variable(name="{}_loc_pred_conv_bias".format(from_name),
                init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0'})
            loc_pred = mx.symbol.Convolution(data=from_layer, bias=bias, kernel=(3,3), \
                stride=(1,1), pad=(1,1), num_filter=num_loc_pred, \
                name="{}_loc_pred_conv".format(from_name))
            # transpose swaps axes: e.g. if loc_pred has shape 32*24*19*19, it becomes 32*19*19*24
            # after the transpose. Flatten then collapses the 4 dimensions into 2, e.g. from
            # 32*19*19*24 to 32*8664. This is done so that the later concatenation across layers
            # is not affected by the differing feature map sizes.
            loc_pred = mx.symbol.transpose(loc_pred, axes=(0,2,3,1))
            loc_pred = mx.symbol.Flatten(data=loc_pred)
            loc_pred_layers.append(loc_pred)
            # create class prediction layer
            # this part handles class prediction; num_cls_pred is the number of filters of the 3x3
            # convolution, where num_classes is the real number of classes + 1 (background counts
            # as one class). The new layer is appended to the cls_pred_layers list.
            num_cls_pred = num_anchors * num_classes
            bias = mx.symbol.Variable(name="{}_cls_pred_conv_bias".format(from_name),
                init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0'})
            cls_pred = mx.symbol.Convolution(data=from_layer, bias=bias, kernel=(3,3), \
                stride=(1,1), pad=(1,1), num_filter=num_cls_pred, \
                name="{}_cls_pred_conv".format(from_name))
            cls_pred = mx.symbol.transpose(cls_pred, axes=(0,2,3,1))
            cls_pred = mx.symbol.Flatten(data=cls_pred)
            cls_pred_layers.append(cls_pred)
            # create anchor generation layer
            # anchors are generated by the dedicated symbol mx.contrib.symbol.MultiBoxPrior, whose
            # input is again from_layer, the feature layer used for detection. size_str gives the
            # anchor sizes as fractions of the input image size, e.g. (0.1,0.141); ratio_str gives
            # the anchor aspect ratios, e.g. (1,2,0.5); clip is False; step, e.g.
            # (0.0266667,0.0266667), is the spacing between anchor centers as a fraction of the
            # input image (roughly 1 / feature map size). The generated anchors form a 3-D array
            # of shape 1 * (h*w*num_anchors) * 4.
            # The new layer is appended to the anchor_layers list.
            if steps:
                step = (steps[k], steps[k])
            else:
                step = '(-1.0, -1.0)'
            anchors = mx.contrib.symbol.MultiBoxPrior(from_layer, sizes=size_str, ratios=ratio_str, \
                clip=clip, name="{}_anchors".format(from_name), steps=step)
            anchors = mx.symbol.Flatten(data=anchors)
            anchor_layers.append(anchors)
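            # e.g. for a 38x38 conv4_3-like map with 4 anchors per location, MultiBoxPrior yields
            # a (1, 38*38*4, 4) = (1, 5776, 4) array, which Flatten turns into (1, 23104).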
        # after the for loop, the per-layer outputs are concatenated into the three symbols
        # loc_preds, cls_preds and anchor_boxes. In the Reshape calls, a 0 in the target shape
        # keeps the original dimension, and a -1 means that dimension is inferred as (product of
        # the original dimensions) / (product of the other specified dimensions), so at most one
        # -1 may appear. After the reshape, cls_preds has shape batch size * number of anchors *
        # num_classes (and is then transposed to batch size * num_classes * number of anchors);
        # likewise, anchor_boxes ends up with shape 1 * number of anchors * 4 (coordinates).
        loc_preds = mx.symbol.Concat(*loc_pred_layers, num_args=len(loc_pred_layers), \
            dim=1, name="multibox_loc_pred")
        cls_preds = mx.symbol.Concat(*cls_pred_layers, num_args=len(cls_pred_layers), \
            dim=1)
        cls_preds = mx.symbol.Reshape(data=cls_preds, shape=(0, -1, num_classes))
        cls_preds = mx.symbol.transpose(cls_preds, axes=(0, 2, 1), name="multibox_cls_pred")
        anchor_boxes = mx.symbol.Concat(*anchor_layers, \
            num_args=len(anchor_layers), dim=1)
        anchor_boxes = mx.symbol.Reshape(data=anchor_boxes, shape=(0, -1, 4), name="multibox_anchors")
        return [loc_preds, cls_preds, anchor_boxes]
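To tie things together, the sketch started after multi_layer_feature can be continued to verify the three output shapes with infer_shape; num_classes, sizes, ratios and normalization below are illustrative values, not necessarily the configuration symbol_factory.py actually uses:

    # `layers` is the list returned by the multi_layer_feature sketch shown earlier.
    loc_preds, cls_preds, anchor_boxes = multibox_layer(layers,
        num_classes=20,                          # 20 foreground classes; background is added inside
        sizes=[0.2, 0.95],                       # size range, expanded to per-layer (min, max) pairs
        ratios=[[1, 2, 0.5]] * 6,                # one ratio list per feature layer (illustrative)
        normalization=[20, -1, -1, -1, -1, -1],  # L2-normalize only the first (conv4_3-like) layer
        num_channels=[512])                      # channel count of that normalized layer
    group = mx.symbol.Group([loc_preds, cls_preds, anchor_boxes])
    _, out_shapes, _ = group.infer_shape(data=(32, 3, 300, 300))
    # Expected pattern: loc_preds (32, total_anchors*4), cls_preds (32, 21, total_anchors),
    # anchor_boxes (1, total_anchors, 4).
    print(out_shapes)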