Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refine_trt

test=develop
nhzlx 6 years ago
commit 2a84054372

.gitignore

@@ -4,6 +4,7 @@ paddle/operators/tensor.save
 python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
 python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
 python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
+paddle/fluid/operators/distributed/send_recv.proto
 *.DS_Store
 *.vs
 build/
@@ -28,4 +29,5 @@ third_party/
 build_*
 # clion workspace.
 cmake-build-*
+paddle/fluid/operators/distributed/send_recv.proto
 model_test

@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+if (ON_INFER)
+  message(STATUS "On inference mode, will take place some specific optimization.")
+  add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
@@ -312,10 +320,3 @@ if(WITH_DOC)
   find_python_module(recommonmark REQUIRED)
   add_subdirectory(doc)
 endif()
-if (ON_INFER)
-  message(STATUS "On inference mode, will take place some specific optimization.")
-else()
-  #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-  message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()

@@ -0,0 +1,219 @@
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same
# as cc_library, but it additionally handles splitting GPU/CPU sources and
# linking the common libraries that ops depend on.
set(cc_srcs)
set(cu_srcs)
set(hip_cu_srcs)
set(miopen_hip_cc_srcs)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(pybind_flag 0)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${op_library_SRCS_len} EQUAL 0)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif()
string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_AMD_GPU)
string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
endif()
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.hip.cu$")
list(APPEND hip_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
list(APPEND miopen_hip_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
if (WIN32)
# Remove ops unsupported on Windows, because Windows has no nccl, warpctc, or similar dependencies.
foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
"crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
"fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
return()
endif()
endforeach()
endif(WIN32)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs")
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to add just one operator to pybind in a *_op.cc file.
# For detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation uses a macro to register its kernels, so set USE_OP manually.
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "fake_quantize")
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only has mkldnn and cpu kernels, which would mismatch the cpu-only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
endif()
endfunction()
function(register_operators)
set(options "")
set(oneValueArgs "")
set(multiValueArgs EXCLUDES DEPS)
cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" OPS "${OPS}")
string(REPLACE ".cc" "" OPS "${OPS}")
list(REMOVE_DUPLICATES OPS)
list(LENGTH register_operators_DEPS register_operators_DEPS_len)
foreach(src ${OPS})
list(FIND register_operators_EXCLUDES ${src} _index)
if (${_index} EQUAL -1)
if (${register_operators_DEPS_len} GREATER 0)
op_library(${src} DEPS ${register_operators_DEPS})
else()
op_library(${src})
endif()
endif()
endforeach()
endfunction()
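For orientation, a hedged sketch of how the two helpers above are typically invoked from an operators CMakeLists.txt. The DEPS names (concat_and_split, naive_executor) and the excluded op are illustrative assumptions, not taken from this commit:

    # Hypothetical call sites for op_library/register_operators as defined above.
    op_library(concat_op DEPS concat_and_split)    # build one op library with extra deps
    register_operators(EXCLUDES warpctc_op         # glob every *_op.cc in this directory,
                       DEPS naive_executor)        # skipping excluded ops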

@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"

@@ -30,6 +30,8 @@ class ExceptionHolder {
       Catch(exp);
     } catch (platform::EnforceNotMet exp) {
       Catch(exp);
+    } catch (std::exception& ex) {
+      LOG(FATAL) << "std::exception caught, " << ex.what();
     } catch (...) {
       LOG(FATAL) << "Unknown exception caught";
     }

@@ -418,11 +418,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
                           &(ctx->cur_ref_cnts_));
     }
-    if (FLAGS_benchmark) {
-      VLOG(20) << "Memory used after operator " + op->Type() + " running: "
-               << memory::memory_usage(place_);
-    }
   }
   if (gc != nullptr) {
@@ -444,13 +439,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       scope->DropKids();
     }
   }
-  if (FLAGS_benchmark) {
-    VLOG(20) << "-------------------------------------------------------";
-    VLOG(20) << "Memory used after deleting local scope: "
-             << memory::memory_usage(place_);
-    VLOG(20) << "-------------------------------------------------------";
-  }
 }
 void Executor::RunPreparedContext(

@@ -15,24 +15,119 @@
 #pragma once
 #include <string>
+#include <tuple>
+#include <utility>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include <boost/optional.hpp>
 namespace paddle {
 namespace framework {
 namespace ir {
-class ConvElementwiseAddMKLDNNFusePass : public FusePassBase {
+using graph_ptr = std::unique_ptr<ir::Graph>;
+using GraphWithStats = std::pair<ir::Graph*, int>;
+void CorrectGraphEdges(Graph* graph, Node* from, Node* to);
+bool IsReachable(ir::Graph* graph, Node* from, Node* to);
+boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name);
+class ResidualConnectionMKLDNNFusePass : public FusePassBase {
+ private:
+  GraphWithStats FuseConvAsX(const std::string& name_scope,
+                             const GraphWithStats& graph_with_stats) const;
+  GraphWithStats FuseConvAsY(const std::string& name_scope,
+                             const GraphWithStats& graph_with_stats) const;
+  GraphWithStats FuseProjectionConv(
+      const std::string& name_scope,
+      const GraphWithStats& graph_with_stats) const;
+  template <typename RetType>
+  using GetNodeFunc =
+      std::function<RetType(const GraphPatternDetector::subgraph_t& subgraph)>;
+  using IdentityConvFunc = GetNodeFunc<std::tuple<Node*, Node*, Node*, Node*>>;
+  using IdentityElementwiseAddFunc =
+      GetNodeFunc<std::tuple<Node*, Node*, Node*>>;
+  using ProjectionConvFunc = IdentityConvFunc;
+  using ProjectionElementwiseAddFunc = GetNodeFunc<std::tuple<Node*, Node*>>;
+  using CanFuseFunc = std::function<bool(Node*, Node*)>;
+  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromConv(
+      const patterns::Conv& conv_pattern,
+      const GraphPatternDetector::subgraph_t& subgraph) const;
+  std::tuple<Node*, Node*, Node*, Node*> GetNodesFromProjectionConv(
+      const patterns::Conv& conv_pattern,
+      const GraphPatternDetector::subgraph_t& subgraph) const;
+  template <typename HandleType, typename... OpFuncs>
+  GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd,
+                                      const GraphWithStats& graph_with_stats,
+                                      OpFuncs&&... op_funcs) const {
+    ir::Graph* graph;
+    int stats;
+    std::tie(graph, stats) = graph_with_stats;
+    auto can_fuse = [this](Node* op1, Node* op2) -> bool {
+      return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN;
+    };
+    auto fuse_handle = HandleType{can_fuse, std::forward<OpFuncs>(op_funcs)...};
+    (*gpd)(graph, fuse_handle);
+    return std::make_pair(graph, stats + fuse_handle.get_stats());
+  }
+  struct IdentityFuseHandle {
+    IdentityFuseHandle(
+        const CanFuseFunc& can_fuse_func,
+        const IdentityConvFunc& get_node_from_conv_op,
+        const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op);
+    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
+                    Graph* graph);
+    int get_stats() const { return *fusion_stats; }
+   private:
+    std::shared_ptr<int> fusion_stats;
+    CanFuseFunc can_fuse_func;
+    IdentityConvFunc get_node_from_conv_op;
+    IdentityElementwiseAddFunc get_node_from_elementwise_add_op;
+  };
+  struct ProjectionFuseHandle {
+    ProjectionFuseHandle(
+        const CanFuseFunc& can_fuse_func,
+        const ProjectionConvFunc& get_node_from_conv_x_op,
+        const ProjectionConvFunc& get_node_from_conv_y_op,
+        const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op);
+    void operator()(const GraphPatternDetector::subgraph_t& subgraph,
+                    Graph* graph);
+    int get_stats() const { return *fusion_stats; }
+   private:
+    std::shared_ptr<int> fusion_stats;
+    CanFuseFunc can_fuse_func;
+    ProjectionConvFunc get_node_from_conv_x_op;
+    ProjectionConvFunc get_node_from_conv_y_op;
+    ProjectionElementwiseAddFunc get_node_from_elementwise_add_op;
+  };
  public:
-  virtual ~ConvElementwiseAddMKLDNNFusePass() {}
+  virtual ~ResidualConnectionMKLDNNFusePass() {}
 protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
-  const std::string name_scope_{"residual_connections_fuse_pass"};
+  std::unique_ptr<ir::Graph> ApplyImpl(graph_ptr graph) const;
+  const std::string name_scope_{"residual_connection_fuse_pass"};
 };
 } // namespace ir
 } // namespace framework
 } // namespace paddle
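The new pass threads a (graph, fusion count) pair through successive fuse steps via ExecuteHandleOnGraph; a stripped-down sketch of that accumulation pattern, with all names illustrative rather than Paddle's:

    #include <iostream>
    #include <utility>

    using GraphWithStats = std::pair<int*, int>;  // stand-in: graph handle + running count

    // Each fuse step returns the same graph together with its own match count added.
    GraphWithStats FuseStep(const GraphWithStats& in, int matches_found) {
      return std::make_pair(in.first, in.second + matches_found);
    }

    int main() {
      int graph = 0;  // dummy graph object
      GraphWithStats state{&graph, 0};
      state = FuseStep(state, 2);  // say FuseConvAsX matched twice
      state = FuseStep(state, 1);  // say FuseConvAsY matched once
      std::cout << "total fusions: " << state.second << "\n";  // prints 3
    }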

@@ -40,7 +40,7 @@ void SetOp(ProgramDesc* prog, const std::string& type,
     op->SetOutput(output.first, {output.second});
 }
-struct IsReachable {
+struct TestIsReachable {
   using func = std::function<bool(const std::string&, const std::string&)>;
   auto operator()(const std::unique_ptr<ir::Graph>& graph) -> func {
@@ -89,7 +89,9 @@ struct IsReachable {
   }
 };
-void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
+void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph,
+                    int expected_conv_count,
+                    int expected_elementwise_add_count = 0) {
   int conv_count = 0;
   int elementwise_add_count = 0;
@@ -101,8 +103,8 @@ void AssertOpsCount(const std::unique_ptr<ir::Graph>& graph) {
       ++elementwise_add_count;
     }
   }
-  EXPECT_EQ(conv_count, 1);
-  EXPECT_EQ(elementwise_add_count, 0);
+  EXPECT_EQ(conv_count, expected_conv_count);
+  EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count);
 }
 ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
@@ -127,22 +129,13 @@ ProgramDesc BuildProgramDesc(const std::vector<std::string>& transient_vars,
   return prog;
 }
-} // namespace
-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
-  auto prog =
-      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
-  SetOp(&prog, "conv2d",
-        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "b"});
-  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  IsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+void RunPassAndAssert(ProgramDesc* prog, const std::string& from,
+                      const std::string& to, int expected_conv_num) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(*prog));
+  TestIsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)(from, to));
   auto pass =
       PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@@ -150,82 +143,87 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionWithElementwiseAddRelu) {
   graph = pass->Apply(std::move(graph));
   int current_nodes_num = graph->Nodes().size();
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
+  EXPECT_TRUE(is_reachable(graph)(from, to));
   EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
             current_nodes_num);
-  AssertOpsCount(graph);
+  AssertOpsCount(graph, expected_conv_num);
 }
+} // namespace
-TEST(ConvElementwiseAddMKLDNNFusePass,
-     ConvolutionWithElementwiseAddReluNoBias) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
-  SetOp(&prog, "conv2d", {{"Input", "a"}, {"Filter", "weights"}},
-        {"Output", "b"});
-  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
-  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  IsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
-  auto pass =
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
-  int original_nodes_num = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
-  int current_nodes_num = graph->Nodes().size();
-  EXPECT_TRUE(is_reachable(graph)("a", "relu"));
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
-  AssertOpsCount(graph);
-}
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  RunPassAndAssert(&prog, "a", "relu", 1);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionAsYWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  RunPassAndAssert(&prog, "a", "relu", 1);
+}
-TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionElementwiseAdd) {
-  auto prog = BuildProgramDesc({"a", "b", "c", "d"}, {"bias", "weights"});
-  SetOp(&prog, "conv2d",
-        {{"Input", "a"}, {"Bias", "bias"}, {"Filter", "weights"}},
-        {"Output", "b"});
-  SetOp(&prog, "elementwise_add", {{"X", "b"}, {"Y", "c"}}, {"Out", "d"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  IsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "d"));
-  auto pass =
-      PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
-  int original_nodes_num = graph->Nodes().size();
-  graph = pass->Apply(std::move(graph));
-  int current_nodes_num = graph->Nodes().size();
-  EXPECT_FALSE(is_reachable(graph)("a", "d"));
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
-  AssertOpsCount(graph);
-}
+TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d",
+        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  RunPassAndAssert(&prog, "a", "relu", 1);
+}
+TEST(ConvElementwiseAddMKLDNNFusePass,
+     ConvolutionAsXWithElementwiseAddReluNoBias) {
+  auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"});
+  SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
+        {"Output", "c"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"});
+  SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"});
+  RunPassAndAssert(&prog, "a", "relu", 1);
+}
-TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
+TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
   auto prog =
-      BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, {"bias", "weights"});
+      BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"});
   SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"});
-  SetOp(&prog, "conv2d",
-        {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}},
+  SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}},
         {"Output", "c"});
-  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "d"}}, {"Out", "e"});
-  SetOp(&prog, "relu", {{"X", "e"}}, {"Out", "f"});
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
-  IsReachable is_reachable;
-  EXPECT_TRUE(is_reachable(graph)("a", "f"));
+  SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}},
+        {"Output", "e"});
+  SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"});
+  SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"});
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  TestIsReachable is_reachable;
+  EXPECT_TRUE(is_reachable(graph)("a", "g"));
   auto pass =
       PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass");
@@ -233,11 +231,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, SigmoidConvolutionAddElementwiseRelu) {
   graph = pass->Apply(std::move(graph));
   int current_nodes_num = graph->Nodes().size();
-  EXPECT_TRUE(is_reachable(graph)("a", "f"));
-  EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added,
-            current_nodes_num);
-  AssertOpsCount(graph);
+  EXPECT_TRUE(is_reachable(graph)("a", "g"));
+  EXPECT_EQ(original_nodes_num, current_nodes_num);
+  AssertOpsCount(graph, 2, 1);
 }
 } // namespace ir

@@ -15,8 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include <algorithm>
 #include <deque>
+#include <fstream>
+#include <iosfwd>
+#include <ostream>
 #include <unordered_set>
+DEFINE_string(print_sub_graph_dir, "",
+              "FLAGS_print_sub_graph_dir is used "
+              "to print the nodes of sub_graphs.");
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -164,12 +171,15 @@ size_t GraphNum(const Graph &graph) {
     graph_nodes.emplace_back(g_nodes);
   }
-  if (VLOG_IS_ON(100)) {
-    VLOG(100) << "graph_num: " << graph_nodes.size();
-    for (auto &g_n : graph_nodes) {
-      VLOG(100) << "graph_nodes: " << g_n.size();
-      if (g_n.size() < 10) {
-        std::stringstream out;
+  if (FLAGS_print_sub_graph_dir.size()) {
+    if (graph_nodes.size() > 1) {
+      std::stringstream out;
+      for (auto &g_n : graph_nodes) {
+        out << "graph_nodes: " << g_n.size() << "\n";
+      }
+      out << "\n\n";
+      for (auto &g_n : graph_nodes) {
+        out << "graph_nodes: " << g_n.size();
         for (auto &node : g_n) {
           out << "\nNode: " << node->Name() << " in [";
           for (auto &n : node->inputs) {
@@ -181,8 +191,12 @@ size_t GraphNum(const Graph &graph) {
           }
           out << "]";
         }
-        VLOG(100) << out.str();
+        out << "\n\n\n";
       }
+      std::unique_ptr<std::ostream> fout(
+          new std::ofstream(FLAGS_print_sub_graph_dir));
+      PADDLE_ENFORCE(fout->good());
+      *fout << out.str();
     }
   }

@@ -1084,16 +1084,12 @@ PDNode *patterns::Conv::operator()() {
   return output_var;
 }
-PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var) {
+PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
   auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
                                 ->assert_is_op("elementwise_add");
-  x_var->assert_is_op_input("elementwise_add", "X");
-  auto y_var = pattern->NewNode(elementwise_add_x_repr())
-                   ->AsInput()
-                   ->assert_is_op_input("elementwise_add", "Y");
+  x_var->AsInput()->assert_is_op_input("elementwise_add", "X");
+  y_var->AsInput()->assert_is_op_input("elementwise_add", "Y");
   auto out_var = pattern->NewNode(elementwise_add_out_repr())
                      ->AsOutput()
                      ->assert_is_op_output("elementwise_add", "Out");

@@ -664,7 +664,7 @@ struct ElementwiseAdd : public PatternBase {
   ElementwiseAdd(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "elementwise_add") {}
-  PDNode* operator()(PDNode* x_var);
+  PDNode* operator()(PDNode* x_var, PDNode* y_var);
   PATTERN_DECL_NODE(elementwise_add_op);
   PATTERN_DECL_NODE(elementwise_add_x);

@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
  public:
   LoDTensor() : Tensor() {}
-  /* Constructor with place should only be used in pybind */
-  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
   explicit LoDTensor(const LoD& lod) : lod_(lod) {}
   void set_lod(const LoD& lod) { lod_ = lod; }

@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "glog/logging.h"
@@ -31,46 +32,6 @@ namespace paddle {
 namespace framework {
 #if defined(PADDLE_WITH_CUDA)
-namespace details {
-struct CUDABuffer {
-  void *data_{nullptr};
-  size_t size_{0};
-  platform::CUDAPlace place_;
-  CUDABuffer() {}
-  CUDABuffer(platform::Place place, size_t size)
-      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-    data_ = memory::Alloc(place_, size);
-  }
-  ~CUDABuffer() { ClearMemory(); }
-  CUDABuffer(const CUDABuffer &o) = delete;
-  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-  void Resize(platform::Place place, size_t size) {
-    ClearMemory();
-    place_ = boost::get<platform::CUDAPlace>(place);
-    data_ = memory::Alloc(place_, size);
-    PADDLE_ENFORCE_NOT_NULL(data_);
-    size_ = size;
-  }
-  void Swap(CUDABuffer &o) {
-    std::swap(data_, o.data_);
-    std::swap(place_, o.place_);
-    std::swap(size_, o.size_);
-  }
- private:
-  void ClearMemory() const {
-    if (data_ != nullptr) {
-      memory::Free(place_, data_);
-    }
-  }
-};
-}  // namespace details
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -103,8 +64,6 @@ class Vector {
     o.ImmutableCPU();
     cpu_ = o.cpu_;
    flag_ = kDataInCPU;
-    details::CUDABuffer null;
-    gpu_.Swap(null);
     return *this;
   }
@@ -199,7 +158,7 @@ class Vector {
     PADDLE_ENFORCE(platform::is_gpu_place(place),
                    "CUDA Data must on CUDA place");
     ImmutableCUDA(place);
-    return reinterpret_cast<T *>(gpu_.data_);
+    return reinterpret_cast<T *>(gpu_->ptr());
   }
   // get cuda ptr. mutable
@@ -234,13 +193,11 @@ class Vector {
   std::mutex &Mutex() const { return mtx_; }
-  std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
-    if (gpu_.data_ == nullptr) {
-      return nullptr;
-    } else {
-      return std::unique_ptr<platform::CUDAPlace>(
-          new platform::CUDAPlace(gpu_.place_));
-    }
+  boost::optional<platform::CUDAPlace> CUDAPlace() const {
+    return gpu_ == nullptr
+               ? boost::none
+               : boost::optional<platform::CUDAPlace>(
+                     boost::get<platform::CUDAPlace>(gpu_->place()));
   }
  private:
@@ -254,13 +211,12 @@ class Vector {
   void CopyToCPU() const {
     // COPY GPU Data To CPU
     auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-        platform::DeviceContextPool::Instance().Get(
-            platform::Place(gpu_.place_)));
+        platform::DeviceContextPool::Instance().Get(gpu_->place()));
     auto stream = dev_ctx->stream();
-    void *src = gpu_.data_;
+    void *src = gpu_->ptr();
     void *dst = cpu_.data();
-    memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                 stream);
+    memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
+                 gpu_->size(), stream);
     dev_ctx->Wait();
   }
@@ -277,8 +233,7 @@ class Vector {
       CopyCPUDataToCUDA(place);
       UnsetFlag(kDirty);
       SetFlag(kDataInCUDA);
-    } else if (IsInCUDA() &&
-               !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+    } else if (IsInCUDA() && !(place == gpu_->place())) {
       PADDLE_THROW("This situation should not happen");
       // Still dirty
     } else {
@@ -290,7 +245,7 @@ class Vector {
       // Even data is not dirty. However, data is not in CUDA. Copy data.
      CopyCPUDataToCUDA(place);
       SetFlag(kDataInCUDA);
-    } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
+    } else if (!(place == gpu_->place())) {
       PADDLE_THROW("This situation should not happen.");
     } else {
       // Not Dirty && DataInCUDA && Device is same
@@ -301,13 +256,13 @@ class Vector {
   void CopyCPUDataToCUDA(const platform::Place &place) const {
     void *src = cpu_.data();
-    gpu_.Resize(place, cpu_.size() * sizeof(T));
+    gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
-    void *dst = gpu_.data_;
+    void *dst = gpu_->ptr();
     auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     auto stream = dev_ctx->stream();
-    memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
-                 stream);
+    memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
+                 gpu_->size(), stream);
   }
   void ImmutableCPU() const {
@@ -329,7 +284,7 @@ class Vector {
   bool IsInCPU() const { return flag_ & kDataInCPU; }
   mutable std::vector<T> cpu_;
-  mutable details::CUDABuffer gpu_;
+  mutable memory::AllocationPtr gpu_;
   mutable int flag_;
   mutable std::mutex mtx_;
@@ -428,8 +383,8 @@ class Vector {
     auto &mtx = m_.Data().Mutex();
     std::lock_guard<std::mutex> guard(mtx);
     auto cuda_place = m_.Data().CUDAPlace();
-    if (cuda_place == nullptr ||
-        *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+    if (cuda_place == boost::none ||
+        cuda_place == boost::get<platform::CUDAPlace>(place)) {
       return m_.Data().CUDAData(place);
     }
   }
@@ -444,8 +399,8 @@ class Vector {
     auto &mtx = m_.Data().Mutex();
     std::lock_guard<std::mutex> guard(mtx);
     auto cuda_place = m_.Data().CUDAPlace();
-    if (cuda_place == nullptr ||
-        *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+    if (cuda_place == boost::none ||
+        cuda_place == boost::get<platform::CUDAPlace>(place)) {
      return m_.MutableData()->CUDAMutableData(place);
     }
   }
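The unique_ptr null check above becomes a boost::optional comparison; a self-contained sketch of the semantics relied on (an int stands in for platform::CUDAPlace):

    #include <boost/optional.hpp>
    #include <iostream>

    int main() {
      boost::optional<int> cuda_place;  // disengaged: no GPU buffer allocated yet
      std::cout << (cuda_place == boost::none) << "\n";  // 1
      cuda_place = 0;  // pretend device 0 now holds the data
      std::cout << (cuda_place == 0) << "\n";            // 1: engaged optional == value
      std::cout << (cuda_place == boost::none) << "\n";  // 0
      return 0;
    }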

@@ -171,8 +171,17 @@ ParallelExecutor::ParallelExecutor(
   }
   // If the loss_var_name is given, the number of graph should be only one.
   if (loss_var_name.size()) {
-    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-                      "The number of graph should be only one");
+    size_t graph_num = ir::GraphNum(*graph);
+    if (graph_num > 1) {
+      LOG(WARNING)
+          << "The number of graph should be only one, "
+             "but the current graph has "
+          << ir::GraphNum(*graph)
+          << " sub_graphs. If you want to see the nodes of the "
+             "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
+             "to specify the output dir. NOTES: if you not do training, "
+             "please don't pass loss_var_name.";
+    }
   }
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {

@@ -32,10 +32,9 @@ size_t Tensor::memory_size() const {
 }
 void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                           memory::Allocator::Attr attr,
                            size_t requested_size) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
+  type_ = type;
   PADDLE_ENFORCE_GE(numel(), 0,
                     "When calling this method, the Tensor's numel must be "
                     "equal or larger than zero. "
@@ -48,35 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
   /* some versions of boost::variant don't have operator!= */
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-    if (platform::is_gpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type));
-    } else if (platform::is_cuda_pinned_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-          boost::get<platform::CUDAPinnedPlace>(place), size, type));
-    }
-  }
-#endif
+    holder_ = memory::AllocShared(place, size, attr);
     offset_ = 0;
   }
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }
-void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
+                           size_t requested_size) {
   PADDLE_ENFORCE(this->holder_ != nullptr,
                  "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type(), requested_size);
+  return mutable_data(place, type_, attr, requested_size);
 }
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -101,6 +83,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
     Tensor dst;
     dst.holder_ = holder_;
     dst.set_layout(layout_);
+    dst.type_ = type_;
     DDim dst_dims = dims_;
     dst_dims[0] = end_idx - begin_idx;
     dst.Resize(dst_dims);

@@ -67,12 +67,7 @@ class Tensor {
   friend struct EigenVector;
  public:
-  Tensor() : offset_(0) {}
-  /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place) : offset_(0) {
-    holder_->set_place(place);
-  }
+  Tensor() : type_(typeid(float)), offset_(0) {}
   /*! Return a pointer to mutable memory block. */
   template <typename T>
@@ -89,12 +84,17 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  T* mutable_data(platform::Place place, size_t requested_size = 0);
+  T* mutable_data(platform::Place place,
+                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                  size_t requested_size = 0);
   void* mutable_data(platform::Place place, std::type_index type,
+                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
                      size_t requested_size = 0);
-  void* mutable_data(platform::Place place, size_t requested_size = 0);
+  void* mutable_data(platform::Place place,
+                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                     size_t requested_size = 0);
   /**
    * @brief Return a pointer to mutable memory block.
@@ -106,7 +106,9 @@ class Tensor {
    * @note If not exist, then allocation.
    */
   template <typename T>
-  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
+  T* mutable_data(DDim dims, platform::Place place,
+                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                  size_t requested_size = 0);
   /*! Return the dimensions of the memory block. */
   const DDim& dims() const;
@@ -139,7 +141,7 @@ class Tensor {
   std::type_index type() const {
     PADDLE_ENFORCE_NOT_NULL(
         holder_, "Tensor not initialized yet when Tensor::type() is called.");
-    return holder_->type();
+    return type_;
   }
   // memory size returns the holding memory size in byte.
@@ -153,56 +155,13 @@ class Tensor {
   void clear() { holder_ = nullptr; }
- private:
-  /**
-   * @note Placeholder hides type T, so it doesn't appear as a template
-   * parameter of Variable.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() = default;
-    virtual void* ptr() const = 0;
-    virtual size_t size() const = 0;
-    virtual std::type_index type() const = 0;
-    virtual platform::Place place() const = 0;
-    virtual void set_type(std::type_index type) = 0;
-    virtual void set_place(platform::Place place) = 0;
-  };
-  template <typename Place>
-  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
-               memory::PODDeleter<uint8_t, Place>(place)),
-          place_(place),
-          size_(size),
-          type_(type) {
-      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
-                              (is_cpu_place(place_) ? "CPU" : "GPU"));
-    }
-    virtual size_t size() const { return size_; }
-    virtual platform::Place place() const { return place_; }
-    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual std::type_index type() const { return type_; }
-    virtual void set_type(std::type_index type) { type_ = type; }
-    virtual void set_place(platform::Place place) { place_ = place; }
-    /*! the pointer of memory block. */
-    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
-    /*! the place of memory block. */
-    platform::Place place_;
-    /*! the size of memory block. */
-    size_t size_;
-    /* the current type of memory */
-    std::type_index type_;
-  };
+  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
+  size_t offset() const { return offset_; }
+ private:
   /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
+  std::shared_ptr<memory::Allocation> holder_;
+  std::type_index type_;
   /**
    * @brief points to elements dimensions.
    *

@@ -23,10 +23,10 @@ namespace framework {
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
-               holder_->type() == std::type_index(typeid(T));
+  bool valid =
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
   PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -37,26 +37,30 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
 template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
-  bool valid = std::is_same<T, void>::value ||
-               holder_->type() == std::type_index(typeid(T));
+  bool valid =
+      std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
   PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
-                 this->holder_->type().name());
+                 type_.name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
 template <typename T>
 inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               memory::Allocator::Attr attr,
                                size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
   Resize(dims);
-  return mutable_data<T>(place, requested_size);
+  return mutable_data<T>(place, attr, requested_size);
 }
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+inline T* Tensor::mutable_data(platform::Place place,
+                               memory::Allocator::Attr attr,
+                               size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
+  return reinterpret_cast<T*>(
+      mutable_data(place, typeid(T), attr, requested_size));
 }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
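A hedged usage sketch of the new signatures (not part of the commit): because the added attr parameter defaults to memory::Allocator::kDefault, existing call sites such as mutable_data<T>(place) compile unchanged; the explicit form is shown for comparison.

    #include "paddle/fluid/framework/tensor.h"

    void Demo() {
      paddle::framework::Tensor t;
      t.Resize(paddle::framework::make_ddim({2, 3}));
      // default attr, exactly as call sites looked before the change:
      float* p = t.mutable_data<float>(paddle::platform::CPUPlace());
      // equivalent, with the allocator attribute spelled out:
      float* q = t.mutable_data<float>(paddle::platform::CPUPlace(),
                                       paddle::memory::Allocator::kDefault);
      (void)p;
      (void)q;
    }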

@@ -379,7 +379,9 @@ TEST(Tensor, FromAndToStream) {
     TensorToStream(oss, gpu_tensor, gpu_ctx);
     std::istringstream iss(oss.str());
-    TensorFromStream(iss, &dst_tensor, gpu_ctx);
+    TensorFromStream(
+        iss, &dst_tensor,
+        *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
     int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
     for (int i = 0; i < 6; ++i) {

@@ -13,7 +13,7 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
     SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)

@@ -114,7 +114,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // it is either an OP's input or an OP's output.
   auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); index++) {
+  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
     framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
     auto correspond_node = subgraph_nodes[index];
     PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());

@@ -45,7 +45,8 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
     std::unordered_set<std::string> teller_set(
         {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
          "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-         "elementwise_add", "dropout", "split", "prelu", "conv2d_transpose"});
+         "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
+         "conv2d_transpose"});
     if (!node->IsOp()) return false;
     if (teller_set.count(node->Op()->Type())) {

@@ -1,4 +1,4 @@
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
+nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(plugin)

@@ -1,39 +1,40 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
   batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
   pad_op.cc split_op.cc prelu_op.cc
   DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
-  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+  ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL)
 nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op conv_transpose_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL)
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
+  elementwise_add_op elementwise_mul_op SERIAL)
 nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL)
 nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op SERIAL)
 nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op SERIAL)
 nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op SERIAL)
 nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine pad_op SERIAL)
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL)
 nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
   split_op concat_op SERIAL)
 nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
-  DEPS ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_plugin
+  DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
   prelu_op SERIAL)

@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,11 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
 namespace paddle {
 namespace inference {
 namespace tensorrt {
+static bool CheckDims(const nvinfer1::Dims& dims_x,
+                      const nvinfer1::Dims& dims_y) {
+  if (dims_x.nbDims != dims_y.nbDims) {
+    return false;
+  }
+  for (int i = 0; i < dims_x.nbDims; i++) {
+    if (dims_x.d[i] != dims_y.d[i]) {
+      return false;
+    }
+  }
+  return true;
+}
 class ElementwiseWeightOpConverter : public OpConverter {
  public:
   ElementwiseWeightOpConverter() {}
@@ -26,7 +40,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
@@ -106,10 +120,12 @@ class ElementwiseTensorOpConverter : public OpConverter {
   ElementwiseTensorOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
+    auto op_pair = ops.find(op_type_);
+    PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!");
     // Here the two nullptr looks strange, that's because the
     // framework::OpDesc's constructor is strange.
     framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "convert a fluid elementwise op to tensorrt IScaleLayer";
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
@@ -120,29 +136,35 @@ class ElementwiseTensorOpConverter : public OpConverter {
     nvinfer1::Dims dims_x = X->getDimensions();
     nvinfer1::Dims dims_y = Y->getDimensions();
-    // The two input tensor should have the same dims
-    PADDLE_ENFORCE(dims_x.nbDims >= 3);
-    if (dims_x.nbDims == dims_y.nbDims) {
-      for (int i = 0; i < dims_x.nbDims; i++) {
-        if (dims_x.d[i] != dims_y.d[i])
-          PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
-      }
-    } else {
-      PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
-    }
-    auto op_pair = ops.find(op_type_);
-    if (op_pair == ops.end()) {
-      PADDLE_THROW("Wrong elementwise op type!");
-    }
-    nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
-        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
-    auto output_name = op_desc.Output("Out")[0];
-    layer->setName(("elementwise (Output: " + output_name + ")").c_str());
-    layer->getOutput(0)->setName(output_name.c_str());
-    engine_->SetITensor(output_name, layer->getOutput(0));
+    int axis = boost::get<int>(op_desc.GetAttr("axis"));
+    auto output_name = op_desc.Output("Out")[0];
+    if (CheckDims(dims_x, dims_y)) {
+      // The two input tensor should have the same dims
+      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
+      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
+          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
+          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
+      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+      layer->getOutput(0)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(0));
+    } else {
+      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
+                 "ElementWisePluginLayer";
+      plugin::ElementWisePlugin* plugin =
+          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
+      plugin->AddInput(X);
+      plugin->AddInput(Y);
+      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
+          const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
+          reinterpret_cast<plugin::PluginTensorRT*>(plugin));
+      layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+      layer->getOutput(0)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(0));
+    }
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
       engine_->DeclareOutput(output_name);

Some files were not shown because too many files have changed in this diff.
