add fuse_bn_act op (#27230)

* add fused_bn_add_relu op
Zhang Ting 5 years ago committed by GitHub
parent 5034d181f3
commit 906e7f921e

@@ -127,7 +127,8 @@ function(op_library TARGET)
      "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
      "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
      "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-     "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
+     "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
+     "fused_bn_add_activation_op")
    if ("${TARGET}" STREQUAL "${manual_pybind_op}")
      set(pybind_flag 1)
    endif()

@@ -8,7 +8,8 @@ register_operators(EXCLUDES
    multihead_matmul_op
    fused_embedding_eltwise_layernorm_op
    fusion_group_op
-   fusion_gru_op)
+   fusion_gru_op
+   fused_bn_add_activation_op)
# fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op)
@@ -47,4 +48,9 @@ if (WITH_GPU)
        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n")
        cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
    endif()
+   # fused_bn_add_activation
+   if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
+       op_library(fused_bn_add_activation_op)
+       file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n")
+   endif()
endif()
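
Note the gate above: the fused op is compiled only when cuDNN >= 7.4.1 is present. For completeness, a minimal runtime sketch of the same check, assuming the Paddle 2.x helpers paddle.is_compiled_with_cuda() and paddle.device.get_cudnn_version() (neither helper is part of this diff):

import paddle

def fused_bn_add_act_available():
    # True when the fused_bn_add_activation CUDA kernel could have been built.
    # Assumption: get_cudnn_version() returns an int such as 7401 for cuDNN
    # 7.4.1, or None when cuDNN is unavailable.
    if not paddle.is_compiled_with_cuda():
        return False
    version = paddle.device.get_cudnn_version()
    return version is not None and version >= 7401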

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,106 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_type_inference.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

class FusedBatchNormAddActOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

template <typename T>
class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    op->SetType(this->ForwardOpType() + "_grad");
    op->SetInput("X", this->Input("X"));
    op->SetInput("Z", this->Input("Z"));
    op->SetInput("Y", this->Output("Y"));
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
    op->SetInput("Scale", this->Input("Scale"));
    op->SetInput("Bias", this->Input("Bias"));
    op->SetInput("SavedMean", this->Output("SavedMean"));
    op->SetInput("SavedVariance", this->Output("SavedVariance"));
    op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
    op->SetAttrMap(this->Attrs());
    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z"));
    op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
    op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
  }
};

class FusedBatchNormAddActOpInferVarType
    : public framework::PassInDtypeAndVarTypeToOutput {
 protected:
  std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
      const override {
    static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
    return m;
  }
};

template <typename DeviceContext, typename T>
class FusedBatchNormAddActKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

template <typename DeviceContext, typename T>
class FusedBatchNormAddActGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override;
};

}  // namespace operators
}  // namespace paddle

@@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
from paddle.fluid import core
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry
from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_
@@ -57,7 +58,7 @@ __all__ = [
    'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
    'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention',
    'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice',
-   'correlation'
+   'correlation', 'fused_bn_add_act'
]
@@ -1625,3 +1626,191 @@ def correlation(x,
        },
        outputs={"Output": output})
    return output
def fused_bn_add_act(x,
                     y,
                     momentum=0.9,
                     epsilon=1e-05,
                     param_attr=None,
                     bias_attr=None,
                     moving_mean_name=None,
                     moving_variance_name=None,
                     act=None,
                     name=None):
    """
    This Op performs batch norm on input x, and adds the result to input y. Then
    it performs activation on the sum. The data format of the inputs must be NHWC
    `[batch, in_height, in_width, in_channels]`.

    Args:
        x(Tensor): The rank of the input tensor can be 2, 3, 4, 5. The data type
            is float16.
        y(Tensor): The rank of the input tensor can be 2, 3, 4, 5. The data type
            is float16.
        momentum(float|Tensor, optional): The value used for the moving_mean and
            moving_var computation. This should be a float number or a tensor with
            shape [1] and data type as float32. The update formula is:
            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
            Default is 0.9.
        epsilon(float, optional): A value added to the denominator for
            numerical stability. Default is 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for the Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as param_attr, and the name of the scale can be set in
            ParamAttr. If the Initializer of the param_attr is not set, the parameter is
            initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr, and the name of the bias can be set in
            ParamAttr. If the Initializer of the bias_attr is not set, the bias is
            initialized as zero. Default: None.
        moving_mean_name(str, optional): The name of the moving_mean which stores the global
            mean. If it is set to None, batch_norm will save the global mean under a random
            name; otherwise, it will be saved under the given name.
        moving_variance_name(str, optional): The name of the moving_variance which stores the
            global variance. If it is set to None, batch_norm will save the global variance
            under a random name; otherwise, it will be saved under the given name.
        act(string, optional): Activation type, linear|relu|prelu|...
        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
            Usually name does not need to be set and is None by default.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid

            def build_program(main_program, startup_program):
                with fluid.program_guard(main_program, startup_program):
                    x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
                    y = fluid.layers.data(name="y", shape=[1], dtype='int64')
                    conv1_1 = fluid.layers.conv2d(
                        input=x,
                        filter_size=3,
                        num_filters=32,
                        stride=1,
                        padding=1,
                        act=None,
                        bias_attr=False,
                        data_format='NHWC')
                    conv1_2 = fluid.layers.conv2d(
                        input=x,
                        filter_size=3,
                        num_filters=32,
                        stride=1,
                        padding=1,
                        act=None,
                        bias_attr=False,
                        data_format='NHWC')
                    bn = fluid.layers.batch_norm(
                        input=conv1_1,
                        act=None,
                        data_layout='NHWC')
                    fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn)
                    prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax')
                    loss = fluid.layers.cross_entropy(input=prediction, label=y)
                    loss = fluid.layers.mean(loss)
                    sgd = fluid.optimizer.SGD(learning_rate=0.001)
                    sgd = fluid.contrib.mixed_precision.decorate(
                        sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
                    sgd.minimize(loss)

                return x, y, loss

            iters = 5
            batch_size = 16
            support_gpu = fluid.is_compiled_with_cuda()
            if support_gpu:
                main_program = fluid.Program()
                startup_program = fluid.Program()
                place = fluid.CUDAPlace(0)
                x, y, loss = build_program(main_program, startup_program)

                feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
                train_reader = paddle.batch(
                    paddle.dataset.mnist.train(), batch_size=batch_size)
                exe = fluid.Executor(place)
                scope = fluid.Scope()
                with fluid.scope_guard(scope):
                    exe.run(startup_program)
                    for _ in range(iters):
                        data = next(train_reader())
                        loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])
    """
    helper = LayerHelper('fused_bn_add_act', **locals())
    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
                             'fused_bn_add_act')
    check_variable_and_dtype(y, 'input', ['float16', 'float32', 'float64'],
                             'fused_bn_add_act')
    bn_param_dtype = core.VarDesc.VarType.FP32

    x_shape = x.shape
    channel_num = x_shape[-1]
    param_shape = [channel_num]

    # create parameter
    scale = helper.create_parameter(
        attr=helper.param_attr,
        shape=param_shape,
        dtype=bn_param_dtype,
        default_initializer=Constant(1.0))
    bias = helper.create_parameter(
        attr=helper.bias_attr,
        shape=param_shape,
        dtype=bn_param_dtype,
        is_bias=True)
    mean = helper.create_parameter(
        attr=ParamAttr(
            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
        shape=param_shape,
        dtype=bn_param_dtype)
    mean.stop_gradient = True
    variance = helper.create_parameter(
        attr=ParamAttr(
            name=moving_variance_name,
            initializer=Constant(1.0),
            trainable=False),
        shape=param_shape,
        dtype=bn_param_dtype)
    variance.stop_gradient = True

    # create output
    # mean and mean_out share the same memory
    mean_out = mean
    # variance and variance_out share the same memory
    variance_out = variance
    saved_mean = helper.create_variable_for_type_inference(
        dtype=bn_param_dtype, stop_gradient=True)
    saved_variance = helper.create_variable_for_type_inference(
        dtype=bn_param_dtype, stop_gradient=True)
    reserve_space = helper.create_variable_for_type_inference(
        dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
    batch_norm_out = helper.create_variable_for_type_inference(
        core.VarDesc.VarType.FP16)

    inputs = {
        "X": x,
        "Z": y,
        "Scale": scale,
        "Bias": bias,
    }
    attrs = {"epsilon": epsilon, 'momentum': momentum}

    outputs = {
        "Y": batch_norm_out,
        "MeanOut": mean_out,
        "VarianceOut": variance_out,
        "SavedMean": saved_mean,
        "SavedVariance": saved_variance,
        "ReserveSpace": reserve_space
    }

    helper.append_op(
        type="fused_bn_add_activation",
        inputs=inputs,
        outputs=outputs,
        attrs=attrs)

    return batch_norm_out
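
For intuition, here is a hedged NumPy reference of the forward computation this layer requests: training-mode batch norm over an NHWC tensor, add the residual input, then ReLU. It ignores the moving-average updates and ReserveSpace, and sketches the math rather than the cuDNN kernel's actual implementation:

import numpy as np

def fused_bn_add_relu_reference(x, z, scale, bias, epsilon=1e-5):
    # x, z: arrays of shape [N, H, W, C]; scale, bias: arrays of shape [C].
    mean = x.mean(axis=(0, 1, 2))                # per-channel batch mean
    var = x.var(axis=(0, 1, 2))                  # per-channel batch variance
    x_hat = (x - mean) / np.sqrt(var + epsilon)  # normalize
    bn = scale * x_hat + bias                    # scale and shift
    return np.maximum(bn + z, 0.0)               # add residual, apply ReLU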

@@ -135,6 +135,7 @@ gray_list = {
    'get_tensor_from_selected_rows',
    'sign',
    'cast',
+   'fused_bn_add_activation',
}
'''
# The set of ops that don't support fp16 calculation
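
For context, gray-list ops are neither forced to fp16 (white list) nor kept in fp32 (black list); they follow the precision of their surroundings. An illustrative sketch of that decision rule (hypothetical helper, not the actual AMP pass in fp16_utils.py):

def decide_op_dtype(op_type, input_dtypes, white_list, black_list, gray_list):
    if op_type in white_list:
        return 'float16'
    if op_type in black_list:
        return 'float32'
    if op_type in gray_list:
        # Gray ops run in fp16 only when every input is already fp16.
        if input_dtypes and all(d == 'float16' for d in input_dtypes):
            return 'float16'
    return 'float32'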

@@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
    ]
    for in_name in op.input_names:
-        if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm':
-            if in_name != 'X':
+        if src_dtype == core.VarDesc.VarType.FP32 and op.type in [
+                'batch_norm', 'fused_bn_add_activation'
+        ]:
+            if in_name not in {'X', 'Z'}:
                continue
        for in_var_name in op.input(in_name):
            in_var = block.var(in_var_name)
@@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
        op._set_attr('in_dtype', dest_dtype)
    if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16:
        for out_name in op.output_names:
-            if op.type == 'batch_norm' and out_name != 'Y':
+            if op.type in ['batch_norm', 'fused_bn_add_activation'
+                           ] and out_name != 'Y':
                continue
            for out_var_name in op.output(out_name):
                out_var = block.var(out_var_name)
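
The two hunks above make _insert_cast_op treat fused_bn_add_activation like batch_norm: only the data inputs X and Z receive an fp32-to-fp16 cast (Scale, Bias, and the statistics stay fp32), and on the output side only Y is re-typed. A standalone illustration of the input-side rule (hypothetical helper, not Paddle's actual function):

BN_STYLE_CAST_INPUTS = {
    'batch_norm': {'X'},
    'fused_bn_add_activation': {'X', 'Z'},
}

def inputs_to_cast(op_type, input_names):
    # Which input slots get an fp32->fp16 cast inserted before the op.
    if op_type in BN_STYLE_CAST_INPUTS:
        allowed = BN_STYLE_CAST_INPUTS[op_type]
        return [name for name in input_names if name in allowed]
    return list(input_names)  # non-BN ops: every input slot is cast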

@@ -0,0 +1,215 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import core


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "Paddle core is not compiled with CUDA")
class TestFusedBnAddActAPI(unittest.TestCase):
    def setUp(self):
        self.conv_param_attr1 = fluid.ParamAttr(
            name='conv2d_1.weight',
            initializer=fluid.initializer.Xavier(uniform=False),
            learning_rate=0.001)
        self.conv_param_attr2 = fluid.ParamAttr(
            name='conv2d_2.weight',
            initializer=fluid.initializer.Xavier(uniform=False),
            learning_rate=0.001)
        self.bn_param_attr1 = fluid.ParamAttr(
            name='batch_norm_w_1',
            initializer=fluid.initializer.Constant(value=1.0))
        self.bn_bias_attr1 = fluid.ParamAttr(
            name='batch_norm_b_1',
            initializer=fluid.initializer.Constant(value=0.0))
        self.bn_param_attr2 = fluid.ParamAttr(
            name='batch_norm_w_2',
            initializer=fluid.initializer.Constant(value=1.0))
        self.bn_bias_attr2 = fluid.ParamAttr(
            name='batch_norm_b_2',
            initializer=fluid.initializer.Constant(value=0.0))
        self.fc_param_attr = fluid.ParamAttr(
            name='fc.weight',
            initializer=fluid.initializer.Xavier(uniform=False))

    def build_fused_program(self,
                            main_program,
                            startup_program,
                            use_cuda,
                            seed=1):
        with fluid.program_guard(main_program, startup_program):
            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
            conv1_1 = fluid.layers.conv2d(
                input=x,
                filter_size=3,
                num_filters=32,
                stride=1,
                padding=1,
                act=None,
                param_attr=self.conv_param_attr1,
                bias_attr=False,
                data_format='NHWC')
            conv1_2 = fluid.layers.conv2d(
                input=x,
                filter_size=3,
                num_filters=32,
                stride=1,
                padding=1,
                act=None,
                param_attr=self.conv_param_attr2,
                bias_attr=False,
                data_format='NHWC')
            bn = fluid.layers.batch_norm(
                input=conv1_1,
                param_attr=self.bn_param_attr1,
                bias_attr=self.bn_bias_attr1,
                act=None,
                data_layout='NHWC')
            fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(
                conv1_2,
                bn,
                param_attr=self.bn_param_attr2,
                bias_attr=self.bn_bias_attr2)
            prediction = fluid.layers.fc(input=fused_bn_add_act,
                                         size=10,
                                         act='softmax',
                                         param_attr=self.fc_param_attr)
            loss = fluid.layers.cross_entropy(input=prediction, label=y)
            loss = fluid.layers.mean(loss)
            sgd = fluid.optimizer.SGD(learning_rate=0.001)
            sgd = fluid.contrib.mixed_precision.decorate(
                sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
            sgd.minimize(loss)

        return x, y, loss

    def build_origin_program(self,
                             main_program,
                             startup_program,
                             use_cuda,
                             seed=1):
        with fluid.program_guard(main_program, startup_program):
            x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
            y = fluid.layers.data(name="y", shape=[1], dtype='int64')
            conv1_1 = fluid.layers.conv2d(
                input=x,
                filter_size=3,
                num_filters=32,
                stride=1,
                padding=1,
                act=None,
                param_attr=self.conv_param_attr1,
                bias_attr=False,
                data_format='NHWC')
            conv1_2 = fluid.layers.conv2d(
                input=x,
                filter_size=3,
                num_filters=32,
                stride=1,
                padding=1,
                act=None,
                param_attr=self.conv_param_attr2,
                bias_attr=False,
                data_format='NHWC')
            bn1 = fluid.layers.batch_norm(
                input=conv1_1,
                param_attr=self.bn_param_attr1,
                bias_attr=self.bn_bias_attr1,
                act=None,
                data_layout='NHWC')
            bn2 = fluid.layers.batch_norm(
                input=conv1_2,
                param_attr=self.bn_param_attr2,
                bias_attr=self.bn_bias_attr2,
                act=None,
                data_layout='NHWC')
            out = bn1 + bn2
            out = fluid.layers.relu(out)
            prediction = fluid.layers.fc(input=out,
                                         size=10,
                                         act='softmax',
                                         param_attr=self.fc_param_attr)
            loss = fluid.layers.cross_entropy(input=prediction, label=y)
            loss = fluid.layers.mean(loss)
            sgd = fluid.optimizer.SGD(learning_rate=0.001)
            sgd = fluid.contrib.mixed_precision.decorate(
                sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
            sgd.minimize(loss)

        return x, y, loss

    def check(self, place, use_cuda):
        paddle.manual_seed(1)
        paddle.framework.random._manual_program_seed(1)
        iters = 5
        batch_size = 16

        # build_fused_program
        main_program = fluid.Program()
        startup_program = fluid.Program()
        x, y, loss = self.build_fused_program(main_program, startup_program,
                                              use_cuda)
        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size)
        exe = fluid.Executor(place)
        loss_vals_fused = []
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup_program)
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(main_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                loss_vals_fused.append(loss_v[0][0])

        # build_origin_program
        main_program = fluid.Program()
        startup_program = fluid.Program()
        x, y, loss = self.build_origin_program(main_program, startup_program,
                                               use_cuda)
        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=batch_size)
        loss_vals = []
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup_program)
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(main_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                loss_vals.append(loss_v[0][0])

        # check loss
        for i in range(iters):
            self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)

    def test_fuse_bn_add_act(self):
        place = fluid.CUDAPlace(0)
        self.check(place, use_cuda=True)


if __name__ == '__main__':
    unittest.main()