VGG16 structure and implementation notes
VGG: Very Deep Convolutional Networks for Large-Scale Image Recognition
VGG16 refers to configuration D in the paper's Configuration table; configuration E is VGG19. For convenience, the pooling layers split the network into five blocks, followed by the fully connected layers and a softmax layer. Only the convolutional and fully connected layers carry weights: D has 16 weight layers and E has 19, which is where the names come from.
Convolutional layers are written conv3-X, where X is the number of output channels and 3 means a $3\times3$ kernel. The remaining settings are: SAME padding, stride 1, ReLU activation. The network input is $224\times224$ with 3 channels. Every pooling layer is $2\times2$ max pooling (stride 2), and all hidden layers use ReLU.
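As a quick sanity check of the spatial sizes (a minimal sketch of my own, not part of the original code): the $3\times3$ SAME stride-1 convolutions keep the spatial size, and each $2\times2$ stride-2 max pool halves it, so a $224\times224$ input shrinks block by block.

```python
# Spatial size after each of the five VGG16 blocks, assuming SAME padding,
# stride-1 convolutions and 2x2/stride-2 max pooling: 224 -> 112 -> 56 -> 28 -> 14 -> 7.
size = 224
for block in range(1, 6):
    size //= 2
    print('after block %d: %dx%d' % (block, size, size))
```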
VGG16 implementation
import tensorflow as tf
import tensorflow.contrib as tc
import tensorflow.contrib.layers as tcl
from rbox_functions import *
class VGG16(object):
    def __init__(self, prior_num, para_num, cls_num, is_fpn, use_third_layer, BATCH_SIZE):
        self.name = 'VGG16'
        self.eps = 1e-10
        self.scale = 20.0
        self.para_num = para_num
        self.cls_num = cls_num
        self.prior_num = prior_num
        self.cls_out = cls_num + 1
        self.loc_output_num = prior_num[-1] * para_num
        self.conf_output_num = prior_num[-1] * (cls_num + 1)
        self.fpn = is_fpn
        self.use_third_layer = use_third_layer
        self.BATCH_SIZE = BATCH_SIZE
    '''
    Parameters
    What is scale? It is a normalization scale; SSD uses 20, presumably an empirical value.
    para_num: number of regression parameters per box, here position, size and angle, i.e. 5.
    cls_num: number of object classes, here 1.
    prior_num: number of prior boxes per location; since the aspect ratio is fixed, the number
               of boxes is (number of lengths) x (number of angles).
    cls_out: number of classifier outputs, i.e. the classes plus one blank/background class.
    loc_output_num: number of localization outputs per location.
    conf_output_num: total confidence outputs per location, i.e. prior_num * cls_out.
    '''
    '''Why normalize? See the SSD discussion: <https://github.com/weiliu89/caffe/issues/241>'''
    '''
    conv2d:
    https://tensorflow.google.cn/versions/r1.15/api_docs/python/tf/contrib/layers/conv2d
    inputs: the input tensor
    num_outputs: number of output channels (the conv3-X values in VGG16)
    stride: the sliding step (the lower-level tf.nn.conv2d takes a list
            [batch, height, width, channels]; tcl.conv2d takes a scalar)
    activation_fn defaults to tf.nn.relu, so ReLU is applied unless overridden.
    reshape: a dimension of -1 means it is inferred from the remaining sizes.
    '''
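    # A quick numeric illustration of the head sizes (hypothetical numbers, not taken from
    # the original code): with para_num = 5, cls_num = 1 and prior_num[-1] = 12 priors per
    # location, each spatial position of the last detection layer gets
    #   loc_output_num  = 12 * 5       = 60 output channels
    #   conf_output_num = 12 * (1 + 1) = 24 output channels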
    def __call__(self, inputs, reuse=False):
        with tf.variable_scope(self.name) as scope:
            if reuse:
                scope.reuse_variables()
            c1_1 = tcl.conv2d(inputs, num_outputs=64, kernel_size=3, stride=1, padding='SAME', scope='conv1_1')
            c1_2 = tcl.conv2d(c1_1, num_outputs=64, kernel_size=3, stride=1, padding='SAME', scope='conv1_2')
            p1 = tcl.max_pool2d(inputs=c1_2, kernel_size=2, stride=2, padding='SAME')
            c2_1 = tcl.conv2d(p1, num_outputs=128, kernel_size=3, stride=1, padding='SAME', scope='conv2_1')
            c2_2 = tcl.conv2d(c2_1, num_outputs=128, kernel_size=3, stride=1, padding='SAME', scope='conv2_2')
            p2 = tcl.max_pool2d(inputs=c2_2, kernel_size=2, stride=2, padding='SAME')
            c3_1 = tcl.conv2d(p2, num_outputs=256, kernel_size=3, stride=1, padding='SAME', scope='conv3_1')
            c3_2 = tcl.conv2d(c3_1, num_outputs=256, kernel_size=3, stride=1, padding='SAME', scope='conv3_2')
            c3_3 = tcl.conv2d(c3_2, num_outputs=256, kernel_size=3, stride=1, padding='SAME', scope='conv3_3')
            p3 = tcl.max_pool2d(inputs=c3_3, kernel_size=2, stride=2, padding='SAME')
            c4_1 = tcl.conv2d(p3, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv4_1')
            c4_2 = tcl.conv2d(c4_1, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv4_2')
            c4_3 = tcl.conv2d(c4_2, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv4_3')
            if self.fpn:
                p4 = tcl.max_pool2d(inputs=c4_3, kernel_size=2, stride=2, padding='SAME')
                c5_1 = tcl.conv2d(p4, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv5_1')
                c5_2 = tcl.conv2d(c5_1, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv5_2')
                c5_3 = tcl.conv2d(c5_2, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv5_3')
                c4_3_conv = tcl.conv2d(c4_3, num_outputs=256, kernel_size=1, stride=1, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.1), biases_initializer=tf.constant_initializer(0.1), scope='conv43_conv')
                c5_3_conv = tcl.conv2d(c5_3, num_outputs=256, kernel_size=1, stride=1, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.1), biases_initializer=tf.constant_initializer(0.1), scope='conv53_conv')
                # c5_3_up = tcl.conv2d_transpose(c5_3, num_outputs=512, kernel_size=3, stride=2, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.1), biases_initializer=tf.constant_initializer(0.1), scope='conv53_up')
                '''
                shape   -- returns the tensor's dimensions
                reshape -- here it effectively flattens the prediction maps (an fc-style flattening)
                resize  -- resizes the feature map like an image
                add     -- elementwise addition of same-shaped tensors
                q1: why add the two maps? -- this is the FPN fusion step
                '''
                c4_3_shape = tf.shape(c4_3_conv)
                c5_3_up = tf.image.resize_images(c5_3_conv, c4_3_shape[1:3], method=0)
                c4_3_new = tf.add(c4_3_conv, c5_3_up)
                o4 = self.normalize_channel(c4_3_new)
                loc4 = tcl.conv2d(o4, num_outputs=self.prior_num[1] * self.para_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='loc4')
                conf4 = tcl.conv2d(o4, num_outputs=self.prior_num[1] * self.cls_out, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='conf4')
                if self.use_third_layer:
                    '''First fuse the third and fourth blocks in the FPN manner.
                    The reshape is done manually: slice per image, then concatenate.
                    Tensor layout is [batchsize, height, width, channel].
                    slice(input, begin, size): a size entry of -1 means
                    "take everything remaining in that dimension".
                    concat example:
                    t1 = [[1, 2, 3], [4, 5, 6]]
                    t2 = [[7, 8, 9], [10, 11, 12]]
                    tf.concat([t1, t2], 0)  # [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
                    tf.concat([t1, t2], 1)  # [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]]
                    '''
                    c3_3_conv = tcl.conv2d(c3_3, num_outputs=256, kernel_size=1, stride=1, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.1), biases_initializer=tf.constant_initializer(0.1), scope='conv33_conv')
                    c3_3_shape = tf.shape(c3_3_conv)
                    c4_3_up = tf.image.resize_images(c4_3_new, c3_3_shape[1:3], method=0)
                    c3_3_new = tf.add(c3_3_conv, c4_3_up)
                    o3 = self.normalize_channel(c3_3_new)
                    loc3 = tcl.conv2d(o3, num_outputs=self.prior_num[0] * self.para_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='loc3')
                    conf3 = tcl.conv2d(o3, num_outputs=self.prior_num[0] * self.cls_out, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='conf3')
                    '''Overall: split the batch, reshape each image's maps into rows with one
                    box's parameters each, then merge the layer-3 and layer-4 tensors with
                    concat (why not add? concat keeps both layers' predictions) and stack
                    everything into one tensor -- essentially one big reshape.
                    add is not used here: the fused features already live in layer 4, so the
                    layer-3 predictions are kept separately instead of being added again.
                    '''
                    for idx in range(self.BATCH_SIZE):
                        loc3_i = tf.slice(loc3, [idx, 0, 0, 0], [1, -1, -1, -1])
                        loc3_i_reshape = tf.reshape(loc3_i, [-1, self.para_num])
                        loc4_i = tf.slice(loc4, [idx, 0, 0, 0], [1, -1, -1, -1])
                        loc4_i_reshape = tf.reshape(loc4_i, [-1, self.para_num])
                        if idx == 0:
                            loc_reshape = tf.concat([loc3_i_reshape, loc4_i_reshape], axis=0)
                        else:
                            loc34_i_reshape = tf.concat([loc3_i_reshape, loc4_i_reshape], axis=0)
                            loc_reshape = tf.concat([loc_reshape, loc34_i_reshape], axis=0)
                        conf3_i = tf.slice(conf3, [idx, 0, 0, 0], [1, -1, -1, -1])
                        conf3_i_reshape = tf.reshape(conf3_i, [-1, self.cls_out])
                        conf4_i = tf.slice(conf4, [idx, 0, 0, 0], [1, -1, -1, -1])
                        conf4_i_reshape = tf.reshape(conf4_i, [-1, self.cls_out])
                        if idx == 0:
                            conf_reshape = tf.concat([conf3_i_reshape, conf4_i_reshape], axis=0)
                        else:
                            conf34_i_reshape = tf.concat([conf3_i_reshape, conf4_i_reshape], axis=0)
                            conf_reshape = tf.concat([conf_reshape, conf34_i_reshape], axis=0)
                else:
                    loc_reshape = tf.reshape(loc4, [-1, self.para_num])
                    conf_reshape = tf.reshape(conf4, [-1, self.cls_out])
            else:
                if self.use_third_layer:
                    o3 = self.normalize_channel(c3_3)
                    loc3 = tcl.conv2d(o3, num_outputs=self.prior_num[0] * self.para_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='loc3')
                    conf3 = tcl.conv2d(o3, num_outputs=self.prior_num[0] * self.cls_out, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='conf3')
                    o4 = self.normalize_channel(c4_3)
                    loc4 = tcl.conv2d(o4, num_outputs=self.loc_output_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='loc4')
                    conf4 = tcl.conv2d(o4, num_outputs=self.conf_output_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='conf4')
                    for idx in range(self.BATCH_SIZE):
                        loc3_i = tf.slice(loc3, [idx, 0, 0, 0], [1, -1, -1, -1])
                        loc3_i_reshape = tf.reshape(loc3_i, [-1, self.para_num])
                        loc4_i = tf.slice(loc4, [idx, 0, 0, 0], [1, -1, -1, -1])
                        loc4_i_reshape = tf.reshape(loc4_i, [-1, self.para_num])
                        if idx == 0:
                            loc_reshape = tf.concat([loc3_i_reshape, loc4_i_reshape], axis=0)
                        else:
                            loc34_i_reshape = tf.concat([loc3_i_reshape, loc4_i_reshape], axis=0)
                            loc_reshape = tf.concat([loc_reshape, loc34_i_reshape], axis=0)
                        conf3_i = tf.slice(conf3, [idx, 0, 0, 0], [1, -1, -1, -1])
                        conf3_i_reshape = tf.reshape(conf3_i, [-1, self.cls_out])
                        conf4_i = tf.slice(conf4, [idx, 0, 0, 0], [1, -1, -1, -1])
                        conf4_i_reshape = tf.reshape(conf4_i, [-1, self.cls_out])
                        if idx == 0:
                            conf_reshape = tf.concat([conf3_i_reshape, conf4_i_reshape], axis=0)
                        else:
                            conf34_i_reshape = tf.concat([conf3_i_reshape, conf4_i_reshape], axis=0)
                            conf_reshape = tf.concat([conf_reshape, conf34_i_reshape], axis=0)
                else:
                    o4 = self.normalize_channel(c4_3)
                    # p4 = tcl.max_pool2d(inputs=c4_3, kernel_size=2, stride=2, padding='SAME')
                    # c5_1 = tcl.conv2d(p4, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv5_1')
                    # c5_2 = tcl.conv2d(c5_1, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv5_2')
                    # c5_3 = tcl.conv2d(c5_2, num_outputs=512, kernel_size=3, stride=1, padding='SAME', scope='conv5_3')
                    # p5 = tcl.max_pool2d(inputs=c5_3, kernel_size=2, stride=2, padding='SAME')
                    # fc = tcl.flatten(p5)
                    # outputs = tcl.fully_connected(fc, num_outputs=21, activation_fn=None, scope='fc')
                    loc = tcl.conv2d(o4, num_outputs=self.loc_output_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='loc')
                    conf = tcl.conv2d(o4, num_outputs=self.conf_output_num, kernel_size=3, stride=1, padding='SAME', activation_fn=None, scope='conf')
                    loc_reshape = tf.reshape(loc, [-1, self.para_num])
                    conf_reshape = tf.reshape(conf, [-1, self.cls_num + 1])
            return loc_reshape, conf_reshape
    def normalize_channel(self, inputs):
        norm_data = tf.square(inputs)
        norm_data = tf.reduce_sum(norm_data, axis=3)
        norm_data = tf.sqrt(norm_data + self.eps)
        norm_data = tf.expand_dims(norm_data, axis=3)
        outputs = self.scale * tf.divide(inputs, norm_data)
        return outputs

    def regular_loss(self, lamda):
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)
        loss = []
        for var in var_list:
            loss += [tcl.l2_regularizer(lamda)(var)]
        return loss

    @property
    def vars(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)

    @property
    def vars_train(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='trains')
- Compared with the plain VGG16, the approach here resembles SSD but is more streamlined. Relative to SSD, only the third and fourth blocks are used; the FC layers and the softmax are removed, the first two FC-4096 layers are replaced by conv layers, and two conv heads produce the required detection outputs (the location parameters and the confidences), which are then reshaped and returned. Two switches select the variants: FPN and use_third_layer. There is also a channel-wise normalization step, which the author explains as mainly keeping the feature magnitudes on a consistent scale; it can be regarded as an empirical trick (see the sketch after this list).
- FPN stands for Feature Pyramid Network: the deeper feature map (conv5_x) is upsampled and fused into the shallower one (conv4_x) by elementwise addition, so the two scales are combined into a single map. Before the addition, a $1\times1$ convolution reduces the channel count; the reason is that elementwise addition requires both maps to have the same number of channels, and the smaller channel count also keeps the extra computation cheap. On why FPN helps, the author points out that shallow feature maps carry weak semantics but strong localization, while deep feature maps carry strong semantics but lose precise localization; using both at once combines strong localization with strong semantics and therefore improves the model (a sketch of the lateral connection follows after this list).
- On using the third layer: what is the for loop doing? It slices one image at a time out of the batch, reshapes its layer-3 and layer-4 prediction maps into rows of para_num (for loc) or cls_out (for conf) values, and concatenates them so that all predictions belonging to the same image stay contiguous, image after image; see the comments in the code and the sketch after this list.
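To make the normalization concrete, here is a minimal NumPy sketch (my own illustration, not the author's code) of the channel-wise L2 normalization that `normalize_channel` performs, with the empirical scale of 20:

```python
import numpy as np

def l2_normalize_channels(x, scale=20.0, eps=1e-10):
    # x: feature map of shape [batch, height, width, channels]
    norm = np.sqrt(np.sum(np.square(x), axis=3, keepdims=True) + eps)
    return scale * x / norm

feat = np.random.rand(1, 38, 38, 512).astype(np.float32)
out = l2_normalize_channels(feat)
print(np.linalg.norm(out[0, 0, 0, :]))  # every spatial position now has L2 norm ~ 20
```

Next, a minimal NumPy sketch of the FPN lateral connection, under simplifying assumptions of my own: nearest-neighbour upsampling instead of the bilinear resize used in the code, and the $1\times1$ convolutions are assumed to have already matched the channel counts:

```python
import numpy as np

def lateral_merge(shallow, deep):
    # shallow: [H, W, C], deep: [H/2, W/2, C]; channels already matched by the 1x1 convs
    up = deep.repeat(2, axis=0).repeat(2, axis=1)  # nearest-neighbour 2x upsampling
    return shallow + up                            # elementwise fusion, as in tf.add

c4 = np.random.rand(28, 28, 256).astype(np.float32)
c5 = np.random.rand(14, 14, 256).astype(np.float32)
print(lateral_merge(c4, c5).shape)  # (28, 28, 256)
```

Finally, a small NumPy sketch of the ordering produced by the per-image loop (toy shapes of my own choosing): within each image, all layer-3 boxes come first, then all layer-4 boxes, and images are stacked in batch order:

```python
import numpy as np

batch, para_num = 2, 5
loc3 = np.arange(batch * 2 * 2 * para_num, dtype=np.float32).reshape(batch, 2, 2, para_num)  # 2x2 map, 1 prior
loc4 = np.arange(batch * 1 * 1 * para_num, dtype=np.float32).reshape(batch, 1, 1, para_num)  # 1x1 map, 1 prior

rows = []
for idx in range(batch):                          # mirrors the for loop in __call__
    rows.append(loc3[idx].reshape(-1, para_num))  # all layer-3 boxes of image idx ...
    rows.append(loc4[idx].reshape(-1, para_num))  # ... followed by its layer-4 boxes
loc_reshape = np.concatenate(rows, axis=0)
print(loc_reshape.shape)  # (2 * (4 + 1), 5): image 0's boxes first, then image 1's
```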