Add bn and relu fuse pass (#22048)

* add bn and relu fuse pass

* add op attr assert and dtype assert

* fix some inputs&&outputs bugs for the fused op and pattern.

* add the unittest for fuse_bn_act_pass. test=develop

* use normative enforce statements. test=develop

* add the cpu test. test=develop

* add the support of batch_size=1 for the bn with relu op. test=develop

* add the error type for paddle throws. test=develop

* add fused_batch_norm_act and fused_batch_norm_act_grad to op_has_unsed_vars_white_list. test=develop
revert-22710-feature/integrated_ps_api
Zhen Wang 6 years ago committed by Zeng Jinle
parent 0d82baf837
commit 46189b166d

@ -118,7 +118,7 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op")
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()

@ -100,7 +100,7 @@ endif()
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_elewise_add_act_pass fuse_bn_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
lock_free_optimize_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass

@ -167,6 +167,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
"fuse_relu_depthwise_conv_pass");
AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
"fuse_elewise_add_act_pass");
AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass");
// for single card training, fuse_all_reduce_ops is unnecessary.
// coalesce_grad_tensor_pass should be before of MultiDevPass.
AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
@ -369,6 +370,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
"GPU, skipped.";
continue;
}
} else if (pass->Type() == "fuse_bn_act_pass") {
if (!use_cuda) {
LOG(WARNING) << "fuse_bn_act_pass is only supported on "
"GPU, skipped.";
continue;
}
} else if (pass->Type() == "mkldnn_placement_pass") {
pass->Set("mkldnn_enabled_op_types",
new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
@ -394,6 +401,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(fuse_bn_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(reduce_mode_multi_devices_pass);

@ -87,6 +87,7 @@ struct BuildStrategy {
// TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
// cycle.
bool fuse_elewise_add_act_ops_{false};
bool fuse_bn_act_ops_{false};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types
boost::optional<bool> fuse_all_optimizer_ops_{false};

@ -106,6 +106,7 @@ if(WITH_NGRAPH)
set(INFER_IR_PASSES ${INFER_IR_PASSES} ngraph_subgraph_pass CACHE INTERNAL "")
endif()
cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )

File diff suppressed because it is too large Load Diff

@ -0,0 +1,64 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the BatchNorm and activation.
*/
class FuseBatchNormActPass : public FusePassBase {
public:
virtual ~FuseBatchNormActPass() {}
protected:
void ApplyImpl(ir::Graph *graph) const override;
ir::Graph *FuseBatchNormAct(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
ir::Graph *FuseBatchNormActGrad(
ir::Graph *graph,
const std::unordered_set<std::string> &act_grad_types) const;
std::vector<Node *> ReplaceNode(Node *cur_node, Node *new_node,
const std::vector<Node *> &nodes) const;
void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1,
Node *op_2, Node *fused_op) const;
Node *CreateFusedBatchNormActNode(
Graph *g, const Node *act, const Node *bn, const std::string &bn_x_n,
const std::string &bn_scale_n, const std::string &bn_bias_n,
const std::string &bn_variance_n, const std::string &bn_mean_n,
const std::string &bn_mean_out_n, const std::string &bn_variance_out_n,
const std::string &bn_saved_variance_n,
const std::string &bn_saved_mean_n, const std::string &bn_reserve_space_n,
const std::string &act_out_n) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle

@ -383,6 +383,13 @@ PDNode *PDNode::assert_is_var() {
return this;
}
PDNode *PDNode::assert_var_dtype(proto::VarType::Type dtype) {
assert_is_var();
asserts_.emplace_back(
[dtype](Node *x) { return x->Var()->GetDataType() == dtype; });
return this;
}
PDNode *PDNode::assert_is_not_ctrl_var() {
asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); });
return this;
@ -476,6 +483,7 @@ PDNode *PDNode::assert_is_op_output(const std::string &op_type,
assert_is_op_nth_output(op_type, argument, 0);
return this;
}
PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
assert_is_var();
asserts_.emplace_back([=](Node *x) {
@ -489,6 +497,16 @@ PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
return this;
}
PDNode *PDNode::assert_is_not_op_input(const std::string &argument) {
assert_is_op();
asserts_.emplace_back([=](Node *x) {
auto &ins = x->Op()->Inputs();
auto iter = ins.find(argument);
return iter == ins.end() || iter->second.empty();
});
return this;
}
PDNode *PDNode::assert_is_op_input(const std::string &op_type,
const std::string &argument) {
assert_is_var();
@ -1048,6 +1066,117 @@ PDNode *patterns::ActElewiseAdd::operator()(
return elewise_add_out;
}
PDNode *patterns::BatchNormAct::operator()(
paddle::framework::ir::PDNode *bn_x_var,
std::unordered_set<std::string> act_types) {
auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
->assert_is_op_input("batch_norm", "Scale");
auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
->assert_is_op_input("batch_norm", "Bias");
auto *bn_variance_var = pattern->NewNode(bn_variance_repr())
->assert_is_op_input("batch_norm", "Variance");
auto *bn_mean_var = pattern->NewNode(bn_mean_repr())
->assert_is_op_input("batch_norm", "Mean");
auto *bn = pattern->NewNode(batch_norm_repr())
->assert_is_op("batch_norm")
->assert_is_not_op_input("MomentumTensor")
->assert_op_attr<bool>("is_test", false)
->assert_op_attr<bool>("use_global_stats", false)
->assert_op_attr<std::string>("data_layout", "NHWC");
auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr())
->assert_is_op_output("batch_norm", "MeanOut");
auto *bn_variance_out_var =
pattern->NewNode(bn_variance_out_repr())
->assert_is_op_output("batch_norm", "VarianceOut");
auto *bn_saved_variance_var =
pattern->NewNode(bn_saved_variance_repr())
->assert_is_op_output("batch_norm", "SavedVariance");
auto *bn_saved_mean_var =
pattern->NewNode(bn_saved_mean_repr())
->assert_is_op_output("batch_norm", "SavedMean");
auto *bn_reserve_space =
pattern->NewNode(bn_reserve_space_repr())
->assert_is_op_output("batch_norm", "ReserveSpace");
auto *bn_out_var = pattern->NewNode(bn_out_repr())
->assert_is_op_output("batch_norm", "Y")
->assert_has_n_outputs(1);
bn_out_var->AsIntermediate()->assert_is_ops_input(act_types);
auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
auto *act_out_var =
pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out");
bn->LinksFrom(
{bn_x_var, bn_scale_var, bn_bias_var, bn_variance_var, bn_mean_var})
.LinksTo({bn_mean_out_var, bn_variance_out_var, bn_saved_variance_var,
bn_saved_mean_var, bn_reserve_space, bn_out_var});
act->LinksFrom({bn_out_var}).LinksTo({act_out_var});
return act_out_var;
}
PDNode *patterns::BatchNormActGrad::operator()(
paddle::framework::ir::PDNode *d_act_out_var,
std::unordered_set<std::string> act_grad_types) {
auto *act_grad =
pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types);
auto *bn_grad = pattern->NewNode(batch_norm_grad_repr())
->assert_is_op("batch_norm_grad")
->assert_op_attr<bool>("use_global_stats", false)
->assert_op_attr<std::string>("data_layout", "NHWC");
auto *act_out_var = pattern->NewNode(act_out_repr())
->assert_is_ops_input(act_grad_types, "Out");
auto *d_intermediate_var =
pattern->NewNode(d_itermediate_out_repr())
->assert_is_ops_output(act_grad_types, GradVarName("X"))
->assert_has_n_outputs(1);
auto *bn_x_var = pattern->NewNode(bn_x_repr())
->assert_is_op_input("batch_norm_grad", "X")
->assert_var_dtype(proto::VarType::FP16);
auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
->assert_is_op_input("batch_norm_grad", "Scale");
auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
->assert_is_op_input("batch_norm_grad", "Bias");
auto *bn_saved_mean_var =
pattern->NewNode(bn_saved_mean_repr())
->assert_is_op_input("batch_norm_grad", "SavedMean");
auto *bn_saved_variance_var =
pattern->NewNode(bn_saved_variance_repr())
->assert_is_op_input("batch_norm_grad", "SavedVariance");
// ReserveSpace as the output is equal to:
// data_layout == 'NHWC' && FLAGS_cudnn_batchnorm_spatial_persistent == true
auto *bn_reserve_space =
pattern->NewNode(bn_reserve_space_repr())
->assert_is_op_input("batch_norm_grad", "ReserveSpace");
auto *d_bn_x_var =
pattern->NewNode(d_bn_x_repr())
->assert_is_not_ctrl_var()
->assert_is_op_output("batch_norm_grad", GradVarName("X"));
auto *d_bn_scale_var =
pattern->NewNode(d_bn_scale_repr())
->assert_is_not_ctrl_var()
->assert_is_op_output("batch_norm_grad", GradVarName("Scale"));
auto *d_bn_bias_var =
pattern->NewNode(d_bn_bias_repr())
->assert_is_not_ctrl_var()
->assert_is_op_output("batch_norm_grad", GradVarName("Bias"));
act_grad->LinksFrom({d_act_out_var, act_out_var})
.LinksTo({d_intermediate_var});
bn_grad
->LinksFrom({bn_x_var, d_intermediate_var, bn_scale_var, bn_bias_var,
bn_saved_mean_var, bn_saved_variance_var, bn_reserve_space})
.LinksTo({d_bn_x_var, d_bn_scale_var, d_bn_bias_var});
return bn_grad;
}
PDNode *patterns::ElewiseAddAct::operator()(
paddle::framework::ir::PDNode *ele_x_var,
std::unordered_set<std::string> act_types) {

@ -27,6 +27,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h"
@ -100,6 +101,7 @@ struct PDNode {
PDNode* assert_is_op();
PDNode* assert_is_op(const std::string& op_type);
PDNode* assert_is_var();
PDNode* assert_var_dtype(proto::VarType::Type dtype);
PDNode* assert_is_not_ctrl_var();
PDNode* assert_var_not_persistable();
PDNode* assert_is_persistable_var();
@ -111,6 +113,7 @@ struct PDNode {
const std::string& argument);
PDNode* assert_is_op_nth_input(const std::string& op_type,
const std::string& argument, int nth);
PDNode* assert_is_not_op_input(const std::string& argument);
PDNode* assert_is_op_nth_output(const std::string& op_type,
const std::string& argument, int nth);
PDNode* assert_is_only_input_of_op(const std::string& op_type);
@ -590,6 +593,64 @@ struct GRU : public PatternBase {
PATTERN_DECL_NODE(Hidden);
};
// The following pattern is used to fuse batch_norm and act
// formula: act(bn(x))
// op: batch_norm + act
struct BatchNormAct : public PatternBase {
BatchNormAct(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "bn_act") {}
PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
// declare operator node's name
PATTERN_DECL_NODE(batch_norm);
PATTERN_DECL_NODE(act);
// declare variable node's name
// BN inputs
PATTERN_DECL_NODE(bn_scale);
PATTERN_DECL_NODE(bn_bias);
PATTERN_DECL_NODE(bn_variance);
PATTERN_DECL_NODE(bn_mean);
// BN outputs
PATTERN_DECL_NODE(bn_mean_out);
PATTERN_DECL_NODE(bn_variance_out);
PATTERN_DECL_NODE(bn_saved_variance);
PATTERN_DECL_NODE(bn_saved_mean);
PATTERN_DECL_NODE(bn_reserve_space);
PATTERN_DECL_NODE(bn_out);
// ACT output
PATTERN_DECL_NODE(act_out);
};
// the backward of act(bn(x))
// op: batch_norm_grad + act_grad
struct BatchNormActGrad : public PatternBase {
BatchNormActGrad(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "bn_act_grad") {}
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// bn_grad: in["X", "Y@GRAD", "Scale", "Bias", "SavedMean", "SavedVariance",
// "ReserveSpace"],
// out["X@GRAD", "Scale@GRAD", "Bias@GRAD"]
PDNode* operator()(PDNode* x, std::unordered_set<std::string> act_grad_types);
// declare operator node's name
PATTERN_DECL_NODE(act_grad);
PATTERN_DECL_NODE(batch_norm_grad);
// declare variable node's name
PATTERN_DECL_NODE(act_out);
PATTERN_DECL_NODE(d_itermediate_out);
PATTERN_DECL_NODE(bn_x);
PATTERN_DECL_NODE(bn_scale);
PATTERN_DECL_NODE(bn_bias);
PATTERN_DECL_NODE(bn_saved_mean);
PATTERN_DECL_NODE(bn_saved_variance);
PATTERN_DECL_NODE(bn_reserve_space);
PATTERN_DECL_NODE(d_bn_x);
PATTERN_DECL_NODE(d_bn_scale);
PATTERN_DECL_NODE(d_bn_bias);
};
// The following patterns are used to fuse elewise_add and act
// formula: act(ele_add(x, y))
// op: elementwise_add + act

@ -30,6 +30,8 @@ const std::unordered_set<std::string> op_has_unsed_vars_white_list = {
"auc",
"batch_norm",
"batch_norm_grad",
"fused_batch_norm_act",
"fused_batch_norm_act_grad",
"sync_batch_norm_grad",
"center_loss_grad",
"crop",

@ -1,5 +1,6 @@
include(operators)
register_operators(EXCLUDES
fused_bn_activation_op
conv_fusion_op
fusion_transpose_flatten_concat_op
fusion_conv_inception_op
@ -8,6 +9,11 @@ register_operators(EXCLUDES
fusion_group_op)
if (WITH_GPU)
# fused_bn_activation_op needs cudnn 7.4.1 above
if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
op_library(fused_bn_activation_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_batch_norm_act);\n")
endif()
# conv_fusion_op needs cudnn 7 above
if (NOT ${CUDNN_VERSION} VERSION_LESS 7100)
op_library(conv_fusion_op)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,109 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class FusedBatchNormActOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override;
};
class FusedBatchNormActGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusedBatchNormActOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
template <typename T>
class FusedBatchNormActGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
std::unique_ptr<T> Apply() const override {
std::unique_ptr<T> op(new T());
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Y", this->Output("Y"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
return op;
}
};
class FusedBatchNormActOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
}
};
template <typename DeviceContext, typename T>
class FusedBatchNormActKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class FusedBatchNormActGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle

@ -1994,6 +1994,26 @@ All parameter, weight, gradient are variables in Paddle.
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_elewise_add_act_ops = True
)DOC")
.def_property(
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_EQ(!self.IsFinalized(), true,
platform::errors::PreconditionNotMet(
"BuildStrategy is finlaized."));
self.fuse_bn_act_ops_ = b;
},
R"DOC((bool, optional): fuse_bn_act_ops indicate whether
to fuse batch_norm and activation_op,
it may make the execution faster. Default is False.
Examples:
.. code-block:: python
import paddle.fluid as fluid
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_bn_act_ops = True
)DOC")
.def_property(
"fuse_relu_depthwise_conv",
[](const BuildStrategy &self) {

@ -182,6 +182,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
if (APPLE OR WIN32)
list(REMOVE_ITEM TEST_OPS test_dataset)
@ -301,6 +302,7 @@ py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_
py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu)
py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
py_test_modules(test_data_norm_op MODULES test_data_norm_op)
py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
if(NOT WIN32)
py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer)
@ -330,5 +332,5 @@ set_tests_properties(test_parallel_executor_test_while_train test_parallel_execu
test_parallel_executor_crf test_sync_batch_norm_op
test_parallel_executor_feed_persistable_var
test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
test_data_norm_op test_imperative_using_non_zero_gpu
test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")

@ -0,0 +1,121 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import unittest
class TestFuseBatchNormActPass(unittest.TestCase):
def build_program(self, main_program, startup_program, use_cuda, seed=1):
main_program.random_seed = seed
startup_program.random_seed = seed
with fluid.program_guard(main_program, startup_program):
x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32')
y = fluid.layers.data(name="y", shape=[1], dtype='int64')
hidden1 = fluid.layers.conv2d(
input=x,
filter_size=3,
num_filters=32,
stride=1,
padding=1,
act=None,
bias_attr=False,
data_format='NHWC')
param_attr = fluid.ParamAttr(
name='batch_norm_w',
initializer=fluid.initializer.Constant(value=1.0))
bias_attr = fluid.ParamAttr(
name='batch_norm_b',
initializer=fluid.initializer.Constant(value=0.0))
hidden2 = fluid.layers.batch_norm(
input=hidden1,
param_attr=param_attr,
bias_attr=bias_attr,
act='relu',
data_layout='NHWC')
hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu')
hidden4 = fluid.layers.batch_norm(
input=hidden3, act='relu', data_layout='NHWC')
prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=y)
loss = fluid.layers.mean(loss)
sgd = fluid.optimizer.SGD(learning_rate=0.001)
if use_cuda:
sgd = fluid.contrib.mixed_precision.decorate(
sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
sgd.minimize(loss)
return x, y, loss
def check(self, place, use_cuda):
main_program = fluid.Program()
startup_program = fluid.Program()
x, y, loss = self.build_program(main_program, startup_program, use_cuda)
exe = fluid.Executor(place)
iters = 10
batch_size = 16
feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
# close fused_bn_act_ops
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_bn_act_ops = False
binary = fluid.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
loss_vals = []
scope = fluid.Scope()
with fluid.scope_guard(scope):
exe.run(startup_program)
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(binary,
feed=feeder.feed(data),
fetch_list=[loss])
loss_vals.append(loss_v[0][0])
# open fused_bn_act_ops
build_strategy_fused = fluid.BuildStrategy()
build_strategy_fused.fuse_bn_act_ops = True
binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy_fused)
train_reader_fused = paddle.batch(
paddle.dataset.mnist.train(), batch_size=batch_size)
loss_vals_fused = []
scope_fused = fluid.Scope()
with fluid.scope_guard(scope_fused):
exe.run(startup_program)
for _ in range(iters):
data = next(train_reader_fused())
loss_v = exe.run(binary_fused,
feed=feeder.feed(data),
fetch_list=[loss])
loss_vals_fused.append(loss_v[0][0])
# check loss
for i in range(iters):
self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5)
def test_fuse_bn_act_pass_cpu(self):
place = fluid.CPUPlace()
self.check(place, use_cuda=False)
def test_fuse_bn_act_pass_cuda(self):
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
self.check(place, use_cuda=True)
if __name__ == '__main__':
unittest.main()
Loading…
Cancel
Save