Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev_op_tensor_support

7 years ago · 24d51de022
parent 27df3a9f2b b2435a3a11
commit 24d51de022
36 changed files with 2131 additions and 88 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -264,6 +264,8 @@ function(cc_test TARGET_NAME)
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    if (${cc_test_SERIAL})
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
    # ENVIRONMENT holds a list of NAME=VALUE entries, and a set_property() call
    # without APPEND replaces the previous value. Pass every flag in one call
    # so that none of them is overwritten by a later call.
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
                 FLAGS_cpu_deterministic=true
                 FLAGS_init_allocated_mem=true
                 FLAGS_cudnn_deterministic=true)
    endif()
@ -330,6 +332,8 @@ function(nv_test TARGET_NAME)
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
    # ENVIRONMENT holds a list of NAME=VALUE entries, and a set_property() call
    # without APPEND replaces the previous value. Pass every flag in one call
    # so that none of them is overwritten by a later call.
    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT
                 FLAGS_cpu_deterministic=true
                 FLAGS_init_allocated_mem=true
                 FLAGS_cudnn_deterministic=true)
    endif()
@ -580,6 +584,7 @@ function(py_test TARGET_NAME)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
             FLAGS_cpu_deterministic=true
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
--- a/doc/survey/op_fusion_design.md
+++ b/doc/survey/op_fusion_design.md
@ -0,0 +1,20 @@
 # Operator fusion  
 Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.   
 There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to certain rules; for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.   
 ## Challenge
 The challenge of fusing operators is:
  - how to make the rules.
  - how to implement these rules efficiently.
 ### How to make the rules?
 The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of the DL model, we found that there are two groups of operators that can be fused explicitly: one is the simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`, and the other is the operators that have the same function, for example, a series of `SGD` or `Momentum`. They usually appear in the model in large numbers. So we should think about how to fuse them separately first.
 ### How to implement these rules efficiently?
 #### How to fuse the adjacent operations efficiently?
 Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
 #### How to fuse the operators that have the same function efficiently?
 We take the SGD operator as an example. The training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor will execute this expression hundreds of times on the CPU or other specialized accelerators. If we can fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute it once. For some accelerators, the time of launching a kernel is not negligible, so the time of launching and executing a kernel hundreds of times may be larger than that of launching and executing it only once. There are usually many operators similar to `SGD` in the DL model, such as `AllReduce` and `FC`.
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -336,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@ -5,5 +5,7 @@ add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(string)
 add_subdirectory(recordio)
-# NOTE: please add subdirectory inference at last.
+if(WITH_INFERENCE)
-add_subdirectory(inference)
+  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
 endif()
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@ -21,6 +21,26 @@ namespace framework {
 namespace details {
 struct BuildStrategy {
  // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
  // kReduce, for CPU and GPU. If you use kAllReduce, different threads
  // optimize their parameters separately. If you use kReduce, the optimizations
  // of parameters are distributed to different threads.
  // For example, a model has 100 parameters and is running with four threads,
  // if you choose kAllReduce, every thread is to optimize 100 parameters
  // separately, if you choose kReduce, every thread is to optimize 25
  // parameters.
  // Of particular note is, if you use kReduce when using CPU training,
  // all the parameters are shared between different threads. This feature will
  // save memory.
  // FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may not
  // be equal for GPU, because summing in a different order can give a
  // different result; for example, the result of `a+b+c+d` may differ from
  // the result of `c+a+b+d`.
  // For GPU, both kAllReduce and kReduce are implemented with NCCL,
  // so the results of kAllReduce and kReduce may not be equal.
  // For CPU, if you want to fix the order of summing so that the results
  // of kAllReduce and kReduce are identical, you can add
  // `FLAGS_cpu_deterministic=true` to env.
  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
  enum class GradientScaleStrategy {
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@ -275,7 +275,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
      if (strategy_.gradient_scale_ !=
          BuildStrategy::GradientScaleStrategy::kCustomized) {
        // TODO(paddle-dev): Why is there no input for this op_handle?
-        CreateScaleLossGradOp(&result);
+        auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
        CreateScaleLossGradOp(&result, loss_grad_name);
      }
      // This assumes the backward generating code will ensure IsScaleLossOp
      // is true only for the op that scale the final scalar loss.
@ -535,7 +536,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
  return got == sharded_var_device.end() ? -1 : got->second;
 }
-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
    ir::Graph *result, const std::string &loss_grad_name) const {
  for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
@ -558,9 +560,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const {
    // loss->pending_ops_.emplace_back(op_handle);
    // op_handle->inputs_.emplace_back(loss);
-    CreateOpOutput(result, op_handle,
+    CreateOpOutput(
-                   result->CreateEmptyNode(GradVarName(loss_var_name_),
+        result, op_handle,
-                                           ir::Node::Type::kVariable),
+        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
        places_[i], i);
  }
 }
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@ -75,7 +75,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                              size_t num_places) const;
-  void CreateScaleLossGradOp(ir::Graph *result) const;
+  void CreateScaleLossGradOp(ir::Graph *result,
                             const std::string &loss_grad_name) const;
  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                            int dst_dev_id) const;
  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@ -18,6 +18,10 @@
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/platform/profiler.h"
 DEFINE_bool(
    cpu_deterministic, false,
    "Whether to make the result of computation deterministic in CPU side.");
 namespace paddle {
 namespace framework {
 namespace details {
@ -91,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
  } else {
    std::vector<const LoDTensor *> lod_tensors =
        GetInputValues<LoDTensor>(in_var_handles, var_scopes);
    if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
      this->RunAndRecordEvent([&] {
        // FIXME(zcd): The order of summing is important,
        // especially when the type of data is float or double.
        // For example, the result of `a+b+c+d` may be different
        // with the result of `c+a+b+d`, so the summing order should be fixed.
        if (!FLAGS_cpu_deterministic) {
          ReduceLoDTensor func(lod_tensors,
                               out_var->GetMutable<framework::LoDTensor>());
          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
        } else {
          // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
          // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
          auto &reduce_sum_trg = *this->local_scopes_[0]
                                      ->FindVar(kLocalExecScopeName)
                                      ->Get<Scope *>()
                                      ->FindVar(out_var_handle->name_)
                                      ->GetMutable<framework::LoDTensor>();
          ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
          auto trg = out_var->GetMutable<framework::LoDTensor>();
          if (reduce_sum_trg.data<void>() != trg->data<void>()) {
            TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
          }
        }
      });
    } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -778,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
  auto& scope = ctx.scope();
  int data_type = -1;
  std::string last_input_name;
  for (auto& input : this->inputs_) {
    for (auto& ipt_name : input.second) {
      auto* var = scope.FindVar(ipt_name);
@ -794,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
          int tmp = static_cast<int>(ToDataType(t->type()));
          PADDLE_ENFORCE(
              tmp == data_type || data_type == -1,
-              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+              "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
-              data_type, tmp);
+              Type(), last_input_name, data_type, ipt_name, tmp);
          data_type = tmp;
          last_input_name = ipt_name;
        }
      }
    }
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@ -24,7 +24,7 @@
 namespace paddle {
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
            "Enable subgraph to TensorRT engine for acceleration");
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
    // TODO(Superjomn) set the key with pass reprs.
    AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
    if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
-      auto trt_teller = [](const Node* node) {
+      auto trt_teller = [&](const Node* node) {
        std::unordered_set<std::string> teller_set(
            {"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
        if (!node->IsFunction()) return false;
-        return static_cast<const Function*>(node)->func_type() == "mul";
+
        const auto* func = static_cast<const Function*>(node);
        if (teller_set.count(func->func_type()))
          return true;
        else {
          return false;
        }
      };
      AddPass("tensorrt-subgraph-marker",
              new TensorRTSubgraphNodeMarkPass(trt_teller));
      AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@ -23,7 +23,7 @@
 namespace paddle {
 namespace inference {
-DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
 DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
 namespace analysis {
@ -87,34 +87,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
 }
 void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
-                       const framework::proto::BlockDesc &block) {
+                       framework::proto::BlockDesc *block) {
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
  auto *func = static_cast<FunctionBlock *>(node);
  // collect inputs
-  std::vector<std::string> io;
+  std::unordered_set<std::string> input_names;
  for (auto *x : func->inlinks) {
-    io.push_back(x->name());
+    input_names.insert(x->name());
  }
-  desc.SetInput("Xs", io);
+  desc.SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
-  // collect outputs
+  std::unordered_set<std::string> output_names;
  io.clear();
  for (auto *x : func->outlinks) {
-    io.push_back(x->name());
+    output_names.insert(x->name());
  }
-  desc.SetOutput("Ys", io);
+
  std::vector<std::string> output_temp(output_names.begin(),
                                       output_names.end());
  desc.SetOutput("Ys", output_temp);
  desc.SetType("tensorrt_engine");
-  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
+  std::unordered_map<std::string, std::string> output_name_map;
  // The following procedure is used to rename all the intermediate
  // variables and the output variables of the subgraph.
  // Why we do this?
  // During the transition from fluid OP to tensorrt OP, we map
  // the input and output Tensor(fluid data structure) of fluid OP
  // to the corresponding ITensor (trt data structure) through the
  // Tensor name. When we set up ITensor for an variable, we must
  // ensure that it has not been set before.
  // If there is variable in the fluid graph, which is not only the
  // input of a OP, but also the output of a Op, there will be problems.
  // So we have to rename the variable in the subgraph to make sure
  // it is either an OP's input or an OP's output.
  auto subgraph_nodes = func->subgraph;
  for (int index = 0; index < block->ops_size(); index++) {
    framework::proto::OpDesc *op = block->mutable_ops(index);
    auto correspond_node = subgraph_nodes[index];
    PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
    std::unordered_map<std::string, size_t> var2id;
    for (auto *in_var : correspond_node->inlinks) {
      var2id[in_var->name()] = in_var->id();
    }
    // rename for the input variables of op inside subgraph
    for (int i = 0; i < op->inputs_size(); i++) {
      framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
      std::vector<std::string> replaced_names;
      for (int k = 0; k < in_var->arguments_size(); k++) {
        std::string arg_value = in_var->arguments(k);
        if (input_names.count(arg_value)) {
          replaced_names.push_back(arg_value);
        } else {
          replaced_names.push_back(arg_value +
                                   std::to_string(var2id[arg_value]));
        }
      }
      in_var->clear_arguments();
      for (size_t k = 0; k < replaced_names.size(); k++) {
        in_var->add_arguments(replaced_names[k]);
      }
    }
    var2id.clear();
    for (auto out_var : correspond_node->outlinks) {
      var2id[out_var->name()] = out_var->id();
    }
    // rename for the output variables of op inside subgraph
    for (int i = 0; i < op->outputs_size(); i++) {
      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
      std::vector<std::string> replaced_names;
      for (int k = 0; k < out_var->arguments_size(); k++) {
        std::string arg_value = out_var->arguments(k);
        if (output_names.count(arg_value)) {
          output_name_map[arg_value] =
              arg_value + std::to_string(var2id[arg_value]);
        }
        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
      }
      out_var->clear_arguments();
      for (size_t k = 0; k < replaced_names.size(); k++) {
        out_var->add_arguments(replaced_names[k]);
      }
    }
  }
  // When tensorrt engine runs at the end of the operation,
  // output_mapping help us copy the data from the renamed ITensor
  // to Tensor.
  std::vector<std::string> output_mapping;
  for (auto name : output_names) {
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
  }
  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
  // Set attrs
-  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
+  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
  node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
@ -146,15 +225,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  LOG(INFO) << "transformed variable size: "
            << block_desc.Proto()->vars().size();
  // copy ops.
  for (auto *node : block_node->subgraph) {
    auto *op = block_desc.AppendOp();
    PADDLE_ENFORCE(!node->pb_msg().empty());
    op->Proto()->ParseFromString(node->pb_msg());
  }
  *block_desc.Proto()->mutable_vars() =
      argument_->origin_program_desc->blocks(0).vars();
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
+  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
  std::vector<Node *> marked_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
    if (node.attr(kMarkerAttrName).Bool()) {
      marked_nodes.push_back(&node);
    }
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@ -1,6 +1,7 @@
 # Add TRT tests
 nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
 activation_op.cc
  DEPS tensorrt_engine operator scope framework_proto op_registry)
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@ -55,7 +55,6 @@ class OpConverter {
        it = Registry<OpConverter>::Lookup("fc");
      }
    }
    if (op_desc.Type().find("elementwise") != std::string::npos) {
      static std::unordered_set<std::string> add_tensor_op_set{
          "add", "mul", "sub", "div", "max", "min", "pow"};
@ -72,6 +71,8 @@ class OpConverter {
                       "Unsupported elementwise type" + op_type);
        it =
            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
        PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
                                op_desc.Type());
      } else {
        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
                       "Unsupported elementwise type" + op_type);
--- a/paddle/fluid/operators/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
@ -0,0 +1,221 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/operators/fused_elemwise_activation_op.h"
 namespace paddle {
 namespace operators {
 // Forward operator definition for fused_elemwise_activation: it provides the
 // shape inference and the kernel-type selection for the op.
 class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  // Verifies that Input(X), Input(Y) and Output(Out) exist and that the rank
  // of X is not smaller than the rank of Y, then gives Out the dims of X and
  // shares the LoD of X with Out.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(
        ctx->HasInput("X"),
        "Input(X) of FusedElemwiseActivationOp op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("Y"),
        "Input(Y) of FusedElemwiseActivationOp op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("Out"),
        "Output(Out) of FusedElemwiseActivationOp op should not be null.");
    auto x_dim = ctx->GetInputDim("X");
    auto y_dim = ctx->GetInputDim("Y");
    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
                      "Rank of first input must >= rank of second input.");
    // The output always takes the shape of X (the input with the larger or
    // equal rank).
    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
 protected:
  // Requires X and Y to have the same element type and builds the kernel type
  // from the data type of X and the place of the execution context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
                      ctx.Input<framework::Tensor>("Y")->type(),
                      "The element's type of input should be the same.");
    auto input_data_type =
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
 };
 // Declares the inputs, outputs and attributes of fused_elemwise_activation
 // and attaches a custom checker that validates the `functor_list` attribute.
 class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(vector<Tensor>)");
    AddInput("Y", "(vector<Tensor>)");
    AddOutput("Out", "vector<Tensor>");
    AddAttr<int>("axis",
                 "axis is used by elementwise_op, the default value is -1.")
        .SetDefault(-1);
    AddAttr<float>("scale",
                   "scale is used by scale_op, the default value is 0.0.")
        .SetDefault(0.0);
    AddAttr<bool>("recomputation",
                  "Whether to recompute the Out."
                  "fused_elemwise_activation_grad has two methods to get the "
                  "dx and dy, one "
                  "is to use the 'Out', and the other is not to use it. "
                  "The former method will save the time of recomputing the "
                  "'Out', but it must occupy the memory to store the 'out'. "
                  "While, the later method can avoid occupying the memory, "
                  "but it must recompute the 'Out'. The default value is true.")
        .SetDefault(true);
    // The checker lambda captures nothing: ValidCheck is static, so the
    // stored callback holds no reference to this maker object (whose lifetime
    // relative to the registered checker is not visible here).
    AddAttr<std::vector<std::string>>("functor_list",
                                      "The functors that should be fused.")
        .AddCustomChecker([](const std::vector<std::string> &functor_list) {
          PADDLE_ENFORCE(ValidCheck(functor_list));
        });
    AddComment(R"DOC(
 FusedElemwiseActivation Operator.
 At present, FusedElemwiseActivation only supports Two kinds of compound
 operators (elementwise_op and activation_op):
    Z = Binary(X, Unary(Y))
    Z = Unary(Binary(X, Y))
 The attributions of activation_op can be get from fused_elemwise_activation_op's
 attributions. functor_list records the functors to be fused, for example
 "scale,elementwise_add".
 )DOC");
  }
 private:
  // Validates the functor list: one of the first two entries must be a
  // supported binary functor and the other a supported unary functor.
  // Returns true on success and throws (via PADDLE_ENFORCE*/PADDLE_THROW)
  // otherwise.
  static bool ValidCheck(const std::vector<std::string> &functors) {
    // functors[0] and functors[1] are both read below; reject shorter lists
    // up front instead of indexing out of bounds.
    PADDLE_ENFORCE_GE(functors.size(), 2UL,
                      "functor_list should contain at least two functors.");
    std::unordered_set<std::string> unary_fun = {"scale", "relu"};
    std::unordered_set<std::string> binary_fun = {"elementwise_add"};
    std::string unary_fun_str;
    // Whichever position holds the binary functor, the other position must
    // hold the unary one.
    if (binary_fun.count(functors[0])) {
      unary_fun_str = functors[1];
    } else if (binary_fun.count(functors[1])) {
      unary_fun_str = functors[0];
    } else {
      PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
                   functors[1]);
    }
    PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
                      "%s is not included in fused_list.", unary_fun_str);
    return true;
  }
 };
 // Builds the OpDesc of the gradient op for fused_elemwise_activation.
 class FusedElemwiseActivationGradMaker
    : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 protected:
  // Creates a "<forward type>_grad" op description that
  //  - takes every forward input, every forward output and every forward
  //    output gradient as input,
  //  - produces the gradient of every forward input as output,
  //  - copies the forward attributes, with "_grad" appended to the first two
  //    names in `functor_list`.
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Own the descriptor from the start so that it is released if any of the
    // calls below throws.
    std::unique_ptr<framework::OpDesc> op_desc_ptr(new framework::OpDesc());
    op_desc_ptr->SetType(this->ForwardOpType() + "_grad");
    for (auto &input_param : this->InputNames()) {
      op_desc_ptr->SetInput(input_param, this->Input(input_param));
      op_desc_ptr->SetOutput(framework::GradVarName(input_param),
                             this->InputGrad(input_param, true));
    }
    for (auto &output_param : this->OutputNames()) {
      op_desc_ptr->SetInput(output_param, this->Output(output_param));
      op_desc_ptr->SetInput(framework::GradVarName(output_param),
                            this->OutputGrad(output_param));
    }
    op_desc_ptr->SetAttrMap(this->Attrs());
    std::vector<std::string> functor_names =
        boost::get<std::vector<std::string>>(
            op_desc_ptr->GetAttr("functor_list"));
    // Both of the first two entries are rewritten below, so make sure they
    // exist before indexing.
    PADDLE_ENFORCE_GE(functor_names.size(), 2UL,
                      "functor_list should contain at least two functors.");
    functor_names[0] += "_grad";
    functor_names[1] += "_grad";
    op_desc_ptr->SetAttr("functor_list", functor_names);
    return op_desc_ptr;
  }
 };
 // Gradient operator definition for fused_elemwise_activation: it provides
 // the shape inference and the kernel-type selection for the grad op.
 class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  // Verifies that X, Y and Out@GRAD are present and that the rank of X is not
  // smaller than the rank of Y, then gives X@GRAD / Y@GRAD (when they are
  // requested as outputs) the dims of X / Y respectively.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                      "Rank of first input must >= rank of second input.");
    auto x_grad_name = framework::GradVarName("X");
    auto y_grad_name = framework::GradVarName("Y");
    // Each gradient output is optional; only set the dims of those present.
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
    }
    if (ctx->HasOutput(y_grad_name)) {
      ctx->SetOutputDim(y_grad_name, y_dims);
    }
  }
 protected:
  // Requires X, Y and Out@GRAD to share one element type and builds the
  // kernel type from that type and the place of the execution context.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    auto input_data_type_index = ctx.Input<framework::Tensor>("X")->type();
    PADDLE_ENFORCE_EQ(input_data_type_index,
                      ctx.Input<framework::Tensor>("Y")->type(),
                      "The element's type of input should be the same.");
    PADDLE_ENFORCE_EQ(
        input_data_type_index,
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
        "The element's type of input should be the same.");
    auto input_data_type = framework::ToDataType(input_data_type_index);
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
 };
 }  // namespace operators
 }  // namespace paddle
 // Short alias used by the registration macros below.
 namespace ops = paddle::operators;
 // Register the forward op together with its proto/checker maker and the
 // maker that generates its gradient op description.
 REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp,
                  ops::FusedElemwiseActivationMaker,
                  ops::FusedElemwiseActivationGradMaker);
 // Register the gradient op itself.
 REGISTER_OPERATOR(fused_elemwise_activation_grad,
                  ops::FusedElemwiseActivationOpGrad);
 // CPU kernels of the forward op for float and double.
 REGISTER_OP_CPU_KERNEL(
    fused_elemwise_activation,
    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
                                       float>,
    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
                                       double>);
 // CPU kernels of the gradient op for float and double.
 REGISTER_OP_CPU_KERNEL(
    fused_elemwise_activation_grad,
    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
                                           float>,
    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
                                           double>);
--- a/paddle/fluid/operators/fused_elemwise_activation_op.cu
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu
@ -0,0 +1,30 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/fused_elemwise_activation_op.h"
 // Short alias for the operator namespace, used by the registrations below.
 namespace ops = paddle::operators;
 // CUDA kernels of the forward operator, instantiated for float and double.
 // The kernel templates are the same ones named in the CPU registration; only
 // the device context argument differs.
 REGISTER_OP_CUDA_KERNEL(
    fused_elemwise_activation,
    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
                                       float>,
    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
                                       double>);
 // CUDA kernels of the backward operator, instantiated for float and double.
 REGISTER_OP_CUDA_KERNEL(
    fused_elemwise_activation_grad,
    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
                                           float>,
    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
                                           double>);
--- a/paddle/fluid/operators/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
--- a/paddle/fluid/operators/math/functors.h
+++ b/paddle/fluid/operators/math/functors.h
@ -0,0 +1,71 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 namespace paddle {
 namespace operators {
 namespace math {
 // AddFunctor
 // Binary functor that returns the sum of its two arguments.
 template <typename T>
 struct AddFunctor {
  // out = x + y;
  // Const-qualified: the call reads no state, so the functor must be usable
  // through a const object or a const reference.
  inline HOSTDEVICE T operator()(T x, T y) const { return x + y; }
 };
 // Gradient companion of the add functor: every overload returns 1, which is
 // the partial derivative of x + y with respect to either operand. All
 // arguments are ignored.
 template <typename T>
 struct AddGradFunctor {
  // Two-argument form. Const-qualified to match the three-argument overload,
  // so both can be called through a const object.
  inline HOSTDEVICE T operator()(T x, T y) const { return 1; }
  // Three-argument form that additionally receives (and ignores) out.
  inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
 };
 // Unary functor that multiplies its argument by a coefficient fixed at
 // construction time.
 template <typename T>
 struct ScaleFunctor {
  // Stores the multiplier later applied by operator().
  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
  // Returns ele * coeff_. Const-qualified because the call only reads coeff_,
  // which lets the functor be invoked through a const object or reference.
  inline HOSTDEVICE T operator()(T ele) const { return ele * coeff_; }
 private:
  T coeff_;
 };
 // Gradient companion of the scale functor: every overload returns the
 // coefficient given at construction, which is the derivative of x * coeff
 // with respect to x. The call arguments are ignored.
 template <typename T>
 struct ScaleGradFunctor {
  // Stores the coefficient returned by operator().
  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
  // One-argument form. Const-qualified because it only reads coeff_.
  inline HOSTDEVICE T operator()(T x) const { return coeff_; }
  // Two-argument form that additionally receives (and ignores) out.
  inline HOSTDEVICE T operator()(T x, T out) const { return coeff_; }
 private:
  T coeff_;
 };
 // Unary functor computing the rectified linear unit: x for non-negative x,
 // 0 for negative x.
 template <typename T>
 struct ReluFunctor {
  // Uses a comparison-based select rather than x * (x > 0): with floating
  // point types the product form evaluates -inf * 0, which is NaN, whereas
  // relu(-inf) must be 0. A NaN input fails the comparison and is returned
  // unchanged, so NaN still propagates exactly as the product form did.
  // Const-qualified so the functor can be called through a const object.
  inline HOSTDEVICE T operator()(T x) const {
    return x < static_cast<T>(0) ? static_cast<T>(0) : x;
  }
 };
 // Gradient companion of the ReLU functor: returns 1 when x is strictly
 // positive and 0 otherwise (including x == 0 and NaN, for which x > 0 is
 // false).
 template <typename T>
 struct ReluGradFunctor {
  // One-argument form. Const-qualified: the functor holds no state, so it
  // must be callable through a const object or reference.
  inline HOSTDEVICE T operator()(T x) const { return x > 0 ? 1 : 0; }
  // Two-argument form; out is accepted but the decision uses x only.
  inline HOSTDEVICE T operator()(T x, T out) const { return x > 0 ? 1 : 0; }
 };
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@ -163,8 +163,7 @@ class ParallelDoOp : public framework::OperatorBase {
      auto &place = places[place_idx];
      auto *cur_scope = sub_scopes[place_idx];
-      workers.emplace_back(
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
          framework::Async([program, cur_scope, place, block, place_idx] {
        framework::Executor executor(place);
        executor.Run(*program, cur_scope, block->ID(),
                     false /*create_local_scope*/);
@ -239,8 +238,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
      auto *cur_scope = sub_scopes[i];
      // execute
-      workers.emplace_back(
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
          framework::Async([program, cur_scope, place, block, i] {
        framework::Executor executor(place);
        executor.Run(*program, cur_scope, block->ID(),
                     false /*create_local_scope*/);
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
 namespace operators {
@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase {
            .GetMutable<framework::ReaderHolder>();
    std::vector<std::string> out_arg_names = Outputs("Out");
    std::vector<framework::LoDTensor> ins;
    // For profiling
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(dev_place);
    platform::RecordEvent record_event(Type(), &ctx);
    reader->ReadNext(&ins);
    if (ins.empty()) {
      if (Attr<bool>("throw_eof_exp")) {
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
-
+  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
-  switch (shape.size()) {
+  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
    case 2:
      return nvinfer1::Dims2(1, shape[1]);
    case 3:
      return nvinfer1::Dims3(1, shape[1], shape[2]);
    case 4:
      return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
    default:
      return nvinfer1::Dims();
  }
  return nvinfer1::Dims();
 }
 }  // namespace
@ -86,6 +76,9 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
    parameters.insert(param);
  }
  std::vector<std::string> output_maps =
      context.Attr<std::vector<std::string>>("output_name_mapping");
  // TODO(Superjomn) replace this with a different stream
  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
@ -97,6 +90,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
  // Add inputs
  VLOG(4) << "declare inputs";
  for (auto &input : context.Inputs("Xs")) {
    if (parameters.count(input)) continue;
    VLOG(4) << "declare input " << input;
    auto *var = block.FindVar(input);
    // TensorRT engine need to create parameters. The parameter's description
@ -122,7 +116,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
      block_desc, parameters, context.scope(), engine);
  // Add outputs
-  for (auto &output : context.Outputs("Ys")) {
+  for (auto &output : output_maps) {
    engine->DeclareOutput(output);
  }
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
                      context.Attr<int>("max_batch"));
    std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");
    auto params = context.Attr<std::vector<std::string>>("parameters");
    std::unordered_set<std::string> parameters;
    for (const auto& param : params) {
      parameters.insert(param);
    }
    // Convert input tensor from fluid to engine.
    for (const auto& x : context.Inputs("Xs")) {
      if (parameters.count(x)) continue;
      // convert input and copy to TRT engine's buffer
      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
          context.scope(), x);
@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // Execute the engine.
    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
    engine->Execute(FLAGS_tensorrt_engine_batch_size);
    // Convert output tensor from engine to fluid
    int output_index = 0;
    for (const auto& y : context.Outputs("Ys")) {
      // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
      auto dims = trt_t->getDimensions();
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      // TODO(Superjomn) change this float to dtype size.
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
                  FLAGS_tensorrt_engine_batch_size;
-      engine->GetOutputInCPU(y,
+      engine->GetOutputInCPU(output_maps[output_index],
                             fluid_t->mutable_data<float>(platform::CPUPlace()),
                             size * sizeof(float));
      //} else {
@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
      // size * sizeof(float));
      //}
      output_index += 1;
    }
    cudaStreamSynchronize(*engine->stream());
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@ -103,6 +103,9 @@ TEST(TensorRTEngineOp, manual) {
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                    std::vector<std::string>({}));
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
                                    "output_name_mapping",
                                    std::vector<std::string>({"z0"}));
  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
@ -196,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
                                    "output_name_mapping",
                                    std::vector<std::string>({"z3"}));
  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
  // Execute them.
--- a/python/paddle/fluid/init.py
+++ b/python/paddle/fluid/init.py
@ -123,7 +123,8 @@ def __bootstrap__():
    read_env_flags = [
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads'
+        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
        'cpu_deterministic'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')
--- a/Show More
+++ b/Show More