Enable BatchNorm to use global mean and variance during training (#14630)

* Enable BatchNorm to use global mean and variance during training
* Update doc and follow comments.
f7c96f079b
qingqing01 6 years ago committed by GitHub
parent 400cf19f14
commit 731d45a39a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -69,7 +69,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))

@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
bool global_stats = is_test || use_global_stats;
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
T *batch_mean_data = nullptr;
T *batch_variance_data = nullptr;
if (!is_test) {
if (!global_stats) {
batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
}
auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto propagation = global_stats == true
? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto src_tz = paddle::framework::vectorize2int(x->dims());
auto scale_tz = paddle::framework::vectorize2int(scale->dims());
@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
shift->data<T>() + ic, &scaleshift_data);
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
if (global_stats) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// keys for backward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, is_test, input_format,
src_tz, epsilon, flags, global_stats, input_format,
ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
std::shared_ptr<batch_norm_fwd> batch_norm_p;
if (is_test) {
if (global_stats) {
// create mkldnn memory for stats (as input)
std::shared_ptr<memory> mean_memory =
handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*batch_norm_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
if (!is_test) {
if (!global_stats) {
// mkldnn only compute stats for current batch
// so we need compute momentum stats via Eigen lib
EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);

File diff suppressed because it is too large Load Diff

@ -2300,7 +2300,8 @@ def batch_norm(input,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False,
fuse_with_relu=False):
fuse_with_relu=False,
use_global_stats=False):
"""
**Batch Normalization Layer**
@ -2327,6 +2328,19 @@ def batch_norm(input,
\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
When use_global_stats = True, the :math:`\\mu_{\\beta}`
and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
They are global (or running) statistics. (It usually got from the
pre-trained model.)
The training and testing (or inference) have the same behavior:
.. math::
\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
\\sigma_{\\beta}^{2} + \\epsilon}} \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta
Args:
input(variable): The input variable which is a LoDTensor.
act(string, Default None): Activation type, linear|relu|prelu|...
@ -2349,6 +2363,11 @@ def batch_norm(input,
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
use_global_stats(bool, Default False): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period.
Returns:
Variable: A tensor variable which is the result after applying batch normalization on the input.
@ -2381,9 +2400,15 @@ def batch_norm(input,
shape=param_shape,
dtype=dtype,
default_initializer=Constant(1.0))
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.param_attr.learning_rate == 0.:
scale.stop_gradient = True
bias = helper.create_parameter(
    attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
# setting stop_gradient=True to reduce computation
# BUG FIX: this guard checks `bias_attr.learning_rate`, so it must freeze
# `bias` — the original line re-froze `scale`, leaving `bias` trainable.
if use_global_stats and helper.bias_attr.learning_rate == 0.:
    bias.stop_gradient = True
mean = helper.create_parameter(
attr=ParamAttr(
@ -2439,7 +2464,8 @@ def batch_norm(input,
"epsilon": epsilon,
"is_test": is_test,
"use_mkldnn": False,
"fuse_with_relu": fuse_with_relu
"fuse_with_relu": fuse_with_relu,
"use_global_stats": use_global_stats
})
return helper.append_activation(batch_norm_out)

@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
return y
def _cal_mean_variance(x, epsilon, data_format):
assert data_format in ['NCHW', 'NHWC']
x_square = x * x
axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2)
C = x.shape[1] if data_format == 'NCHW' else x.shape[-1]
x_square_sum = np.sum(x_square, axis)
x_sum = np.sum(x, axis=axis)
element_count = np.size(x) / C
mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean
return mean, var
def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape
@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase):
self.use_mkldnn = False
self.fuse_with_relu = False
self.data_formats = ["NCHW", "NHWC"]
self.momentum = 0.9
self.epsilon = 0.00001
self.init_kernel_type()
self.init_test_case()
def init_test_case(self):
    # Default configuration: statistics are computed from the mini-batch
    # (no frozen global stats) and gradients flow to every input/parameter.
    self.use_global_stats = False
    self.no_grad_set = set()
    # All forward outputs and all gradients are compared against the
    # numpy reference implementation.
    self.fetch_list = [
        'y',
        'mean',
        'variance',
        'saved_mean',
        'saved_variance',
        'x@GRAD',
        'scale@GRAD',
        'bias@GRAD',
    ]
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
np.allclose(np.array(tensor), np_array, atol=atol)
@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase):
return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
def set_mean_variance(self, scale_shape, x, data_layout):
    """Build the initial running mean/variance that are fed to the op.

    With `use_global_stats` enabled, simulate a single momentum update of
    the running statistics so the "global" stats reflect the batch `x`.
    """
    mean = np.zeros(scale_shape).astype(np.float32)
    variance = np.ones(scale_shape).astype(np.float32)
    if self.use_global_stats:
        momentum = self.momentum
        batch_mean, batch_var = _cal_mean_variance(x, self.epsilon,
                                                   data_layout)
        # running = (1 - momentum) * batch + momentum * running
        mean = batch_mean * (1. - momentum) + momentum * mean
        variance = batch_var * (1. - momentum) + momentum * variance
    return mean, variance
def test_forward_backward(self):
def test_with_place(place, data_layout, shape):
# attr
epsilon = 0.00001
momentum = 0.9
epsilon = self.epsilon
momentum = self.momentum
if data_layout == "NCHW":
n, c, h, w = shape[0], shape[1], shape[2], shape[3]
else:
@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
x = np.random.random_sample(shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
y_grad = np.random.random_sample(shape).astype(np.float32)
y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase):
var_dict = locals()
var_dict['y@GRAD'] = y_grad
var_dict['x@GRAD'] = x_grad
var_dict['scale@GRAD'] = scale_grad
var_dict['bias@GRAD'] = bias_grad
var_names = [
'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
},
outputs={
"Y": block.var('y'),
"MeanOut": block.var('mean'), # share the same memory
"VarianceOut":
block.var('variance'), # share the same memory
"MeanOut": block.var('mean'), # share memory
"VarianceOut": block.var('variance'), # share memory
"SavedMean": block.var('saved_mean'),
"SavedVariance": block.var('saved_variance')
},
@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase):
"is_test": False,
"data_layout": data_layout,
"use_mkldnn": self.use_mkldnn,
"fuse_with_relu": self.fuse_with_relu
"fuse_with_relu": self.fuse_with_relu,
"use_global_stats": self.use_global_stats
})
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
# generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
bn_op.desc, set(), [])
bn_op.desc, self.no_grad_set, [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase):
for name in
['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
},
fetch_list=[
'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
'x@GRAD', 'scale@GRAD', 'bias@GRAD'
])
self.__assert_close(y, out[0], "y")
self.__assert_close(mean_out, out[1], "mean")
self.__assert_close(variance_out, out[2], "variance", 1e-3)
self.__assert_close(saved_mean, out[3], "saved_mean")
self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
self.__assert_close(x_grad, out[5], "x_grad")
self.__assert_close(scale_grad, out[6], "scale_grad")
self.__assert_close(bias_grad, out[7], "bias_grad")
fetch_list=self.fetch_list)
for id, name in enumerate(self.fetch_list):
self.__assert_close(var_dict[name], out[id], name)
print("op test forward passed: ", str(place), data_layout)
places = [core.CPUPlace()]
@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase):
pass
class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
    """Train-mode batch norm with use_global_stats=True (frozen statistics).

    The op normalizes with the provided running mean/variance instead of
    the mini-batch statistics, so the numpy reference here does the same.
    """

    def init_test_case(self):
        self.use_global_stats = True
        self.no_grad_set = set()
        # Saved mean/variance are not meaningful when stats are frozen,
        # so they are excluded from the comparison.
        self.fetch_list = [
            'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD'
        ]

    def reference_grad(self, x, y_grad, scale, mean, var, epsilon,
                       data_format):
        # Work in NHWC so the per-channel stats broadcast along the
        # trailing axis.
        if data_format == "NCHW":
            x = np.transpose(x, (0, 2, 3, 1))
            y_grad = np.transpose(y_grad, (0, 2, 3, 1))

        # With frozen stats, y = scale * (x - mean) / sqrt(var + eps) + bias,
        # so the gradients are simple elementwise/reduction expressions.
        x_grad = scale * y_grad / np.sqrt(var + epsilon)
        grad_scale = np.sum(
            y_grad * (x - mean) / np.sqrt(var + epsilon), axis=(0, 1, 2))
        grad_offset = np.sum(y_grad, axis=(0, 1, 2))

        # Transpose back to N, C, H, W.
        if data_format == "NCHW":
            x_grad = np.transpose(x_grad, (0, 3, 1, 2))
            x = np.transpose(x, (0, 3, 1, 2))
            y_grad = np.transpose(y_grad, (0, 3, 1, 2))

        return x_grad, grad_scale, grad_offset

    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
                             epsilon, momentum, shape, data_layout):
        if data_layout != "NCHW" and data_layout != "NHWC":
            raise ValueError("Unknown data order.")

        if data_layout == "NCHW":
            x = np.transpose(x, (0, 2, 3, 1))

        # Forward: normalize with the provided (frozen) global statistics.
        normalized = (x - mean) / np.sqrt(variance + epsilon)
        y = normalized * scale + bias

        # Transpose back to N, C, H, W.
        if data_layout == "NCHW":
            x = np.transpose(x, (0, 3, 1, 2))
            y = np.transpose(y, (0, 3, 1, 2))

        # Frozen stats: the running mean/variance pass through unchanged.
        mean_out = mean
        variance_out = variance
        saved_variance = 1. / np.sqrt(variance + epsilon)

        # Backward pass through the frozen-stats reference gradients.
        x_grad, scale_grad, bias_grad = self.reference_grad(
            x, y_grad, scale, mean, variance, epsilon, data_layout)

        return (y, mean_out, variance_out, mean, saved_variance, x_grad,
                scale_grad, bias_grad)
class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
        TestBatchNormOpFreezeStatsTraining):
    """Frozen statistics plus frozen scale/bias parameters.

    Only the forward outputs and the input gradient are compared; the
    scale/bias gradients are placed in the backward no-grad set.
    """

    def init_test_case(self):
        self.use_global_stats = True
        self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
if __name__ == '__main__':
unittest.main()

@ -955,6 +955,15 @@ class TestBook(unittest.TestCase):
print(str(program))
def test_batch_norm(self):
    """Smoke test: layers.batch_norm composes into a program without error."""
    prog = Program()
    with program_guard(prog):
        # 3-D feature map input (C, H, W); batch dim is implicit.
        inp = layers.data(
            name='data', shape=[32, 128, 128], dtype="float32")
        layers.batch_norm(inp)
    print(str(prog))
if __name__ == '__main__':
unittest.main()

Loading…
Cancel
Save