Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into accelerate_ddpg

test=develop
revert-15207-remove_op_handle_lock_and_fix_var
minqiyang 6 years ago
commit a318a490ab

@@ -57,46 +57,43 @@ int main()
return 0;
}" SSE3_FOUND)
# disable AVX by default on windows
if(NOT WIN32)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" AVX_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" AVX2_FOUND)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
endif(NOT WIN32)
# Check AVX512F
set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
13, -5, 6, -7, 9, 2, -6, 3);
__m512i result = _mm512_abs_epi32 (a);
return 0;
}" AVX512F_FOUND)
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
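Note: the CHECK_CXX_SOURCE_RUNS probes above only verify that the build machine can compile and execute each intrinsic set; consuming code still guards on compiler macros. A minimal sketch of such a guard, assuming the compiler defines __AVX2__ when the AVX2 flags are in effect (hypothetical example, not part of this diff):

// Hypothetical sketch: guard AVX2 code on the compiler-provided macro,
// mirroring what the configure-time probe above detects.
#include <cstdio>
#ifdef __AVX2__
#include <immintrin.h>
#endif

int main() {
#ifdef __AVX2__
  __m256i a = _mm256_set_epi32(-1, 2, -3, 4, -1, 2, -3, 4);
  __m256i r = _mm256_abs_epi32(a);  // element-wise absolute value
  alignas(32) int out[8];
  _mm256_store_si256(reinterpret_cast<__m256i *>(out), r);
  std::printf("AVX2 path, lane 0 = %d\n", out[0]);
#else
  std::printf("scalar fallback: AVX2 not enabled at compile time\n");
#endif
  return 0;
}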

@@ -7,27 +7,17 @@ function(windows_symbolic TARGET)
cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH})
foreach(src ${windows_symbolic_SRCS})
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
message(FATAL_ERROR "${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
# only copy xx.cc to .xx.cu when the content is modified
set(copy_flag 1)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
if (SOURCE_STR STREQUAL TARGET_STR)
set(copy_flag 0)
endif()
endif()
if (copy_flag)
add_custom_command(OUTPUT .${src}.cu
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
endif(copy_flag)
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
get_filename_component(src ${src} NAME_WE)
if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu)
message(FATAL_ERROR "${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
endif()
file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc)
add_custom_command(OUTPUT ${final_path}/.${src}.cu
COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu"
COMMENT "create hidden file of ${src}.cu")
add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
endforeach()
endfunction()
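Note: the rewrite drops the hand-rolled read/compare/copy logic in favor of cmake -E copy_if_different, which only rewrites the target when contents differ, so its timestamp (and dependent targets) stay untouched. A hedged C++ sketch of that compare-then-copy behavior, purely illustrative of what copy_if_different does:

// Illustrative sketch of copy_if_different semantics: rewrite dst only when
// its content differs from src, so an unchanged dst keeps its timestamp.
#include <fstream>
#include <sstream>
#include <string>

static std::string ReadAll(const std::string &path) {
  std::ifstream in(path, std::ios::binary);
  std::ostringstream ss;
  ss << in.rdbuf();
  return ss.str();
}

bool CopyIfDifferent(const std::string &src, const std::string &dst) {
  const std::string content = ReadAll(src);
  std::ifstream existing(dst, std::ios::binary);
  if (existing && ReadAll(dst) == content) {
    return false;  // identical content: leave dst (and its mtime) alone
  }
  std::ofstream out(dst, std::ios::binary | std::ios::trunc);
  out << content;
  return true;
}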
@@ -78,18 +68,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash)
cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto)
if (WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function)
DEPS operator op_registry device_context math_function scope)
if(WITH_GPU)
if (WIN32)

@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"

@@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() {
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name);
framework::ToTypeName(var->Type()), name);
}
}

@@ -25,7 +25,7 @@ struct ExecutionStrategy {
size_t num_threads_{0};
bool use_cuda_{true};
bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{1};
size_t num_iteration_per_drop_scope_{100};
ExecutorType type_{kDefault};
bool dry_run_{false};
};
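Note: raising num_iteration_per_drop_scope_ from 1 to 100 trades higher peak memory (local-scope temporaries live longer) for far fewer scope teardowns per run. A minimal sketch of how such a counter is typically consumed, with hypothetical names (the real consumer is ScopeBufferedSSAGraphExecutor, shown further down):

// Hypothetical sketch: local scopes are torn down once every N iterations
// instead of every iteration, which is what the 1 -> 100 change tunes.
#include <cstddef>

struct ScopeDropper {
  explicit ScopeDropper(std::size_t n) : drop_every_(n) {}

  // Call once per executor iteration; true means "drop local scopes now".
  bool Tick() {
    if (++counter_ < drop_every_) return false;
    counter_ = 0;
    return true;
  }

  std::size_t counter_ = 0;
  std::size_t drop_every_;  // e.g. num_iteration_per_drop_scope_
};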

File diff suppressed because it is too large.

@@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
#endif
int GetVarDeviceID(
const ir::Graph &graph, const std::string &varname,
const std::string &varname,
const std::unordered_map<std::string, int> &sharded_var_device) const;
bool IsScaleLossOp(ir::Node *node) const;
@@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
ir::Graph *result, ir::Node *node,
std::unordered_map<std::string, int> *sharded_var_device) const;
std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) const;
std::vector<std::string> FindDistTrainRecvVars(
const std::vector<ir::Node *> &nodes) const;
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
@@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
int dev_id) const;
int GetOpDeviceID(
const ir::Graph &graph, ir::Node *node,
ir::Node *node,
const std::unordered_map<std::string, int> &sharded_var_device) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
@@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void SetCommunicationContext(OpHandleBase *op_handle,
const platform::Place &p) const;
std::vector<ir::Node *> SortForReduceMode(
const std::vector<ir::Node *> &) const;
int GetOpDeviceID(
ir::Node *node,
const std::unordered_map<std::string, int> &shared_var_device,
std::unordered_map<std::string, std::vector<ir::Node *>> *delay_ops)
const;
mutable std::string loss_var_name_;
mutable std::vector<platform::Place> places_;
mutable std::vector<Scope *> local_scopes_;

@@ -85,7 +85,6 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
drop_scope_counter_ = 0;
}
if (eptr) {
std::rethrow_exception(eptr);
} else {

@@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) {
} else if (var->IsType<SelectedRows>()) {
(*func)(var->GetMutable<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", var->Type().name());
PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
}
}
@@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) {
} else if (var.IsType<SelectedRows>()) {
(*func)(var.Get<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", var.Type().name());
PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
}
}

@@ -119,7 +119,7 @@ static void DeleteUnusedTensors(
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
var->Type().name(), name);
framework::ToTypeName(var->Type()), name);
}
}
}
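Note: several hunks here replace var->Type().name() — a std::type_info name, which is implementation-defined and usually mangled — with framework::ToTypeName(var->Type()), which resolves the new integer type id to a readable name. A tiny standalone illustration of the mangling problem (hypothetical stand-in type):

// Why a registered name helper beats std::type_info::name(): the latter is
// implementation-defined and typically mangled on GCC/Clang.
#include <iostream>
#include <typeinfo>

namespace demo {
struct LoDTensor {};  // hypothetical stand-in
}  // namespace demo

int main() {
  // Prints something like "N4demo9LoDTensorE" rather than "LoDTensor".
  std::cout << typeid(demo::LoDTensor).name() << std::endl;
  return 0;
}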

@@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc(
const std::string& output) {
auto proto = base_desc;
framework::OpDesc desc(proto, nullptr);
desc.SetType("conv2d_fusion");
desc.SetInput("Bias", {bias});
desc.SetInput("ResidualData", {bias1});
desc.SetAttr("activation", activation);
desc.SetOutput("Output", {output});
desc.SetAttr("is_test", true);
desc.SetAttr("use_cudnn", false);
desc.Flush();
return *desc.Proto();
}
std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
const std::string pattern_name = "conv_elementwise_add_act_fuse";
const std::string pattern_name = "conv_elementwise_add2_act_fuse";
FusePassBase::Init(pattern_name, graph.get());
GraphPatternDetector gpd;
@@ -76,22 +78,23 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
framework::OpDesc new_op_desc(new_op_proto, nullptr);
// Create a new node for the fused op.
graph->CreateOpNode(&new_op_desc);
auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
// Link inputs and outputs.
PADDLE_ENFORCE(subgraph.count(x));
auto* conv_in_node = subgraph.at(x);
IR_NODE_LINK_TO(conv_in_node, conv_op); // Input
IR_NODE_LINK_TO(conv_filter, conv_op); // Filter
IR_NODE_LINK_TO(conv_op, conv_out); // Output
IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias
IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input
IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter
IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias
IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias
IR_NODE_LINK_TO(new_conv_op, act_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(),
{conv_op, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out});
GraphSafeRemoveNodes(
graph.get(),
{conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out, elementwise_add_out_1, act_op});
};
gpd(graph.get(), handler);
return graph;
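Note: the fix first materializes the fused node (new_conv_op = graph->CreateOpNode(...)), relinks every input and the final output to it, and only then removes the matched subgraph; the old code linked to the stale conv_op and left nodes such as conv_out and act_op dangling. A toy sketch of the relink-then-delete order, with hypothetical types rather than the real ir::Graph API:

// Toy sketch: create the fused node, relink every edge of the old node to
// it, and only then delete the old node -- the order this fix enforces.
#include <algorithm>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node *> inputs, outputs;
};

void Link(Node *from, Node *to) {
  from->outputs.push_back(to);
  to->inputs.push_back(from);
}

// fused takes over all edges of old_op; old_op can be deleted afterwards.
void ReplaceWithFused(Node *old_op, Node *fused) {
  for (Node *in : old_op->inputs) {
    std::replace(in->outputs.begin(), in->outputs.end(), old_op, fused);
    fused->inputs.push_back(in);
  }
  for (Node *out : old_op->outputs) {
    std::replace(out->inputs.begin(), out->inputs.end(), old_op, fused);
    fused->outputs.push_back(out);
  }
  old_op->inputs.clear();
  old_op->outputs.clear();
}

int main() {
  Node conv{"conv2d"}, add{"elementwise_add"}, fused{"conv2d_fusion"};
  Link(&conv, &add);
  ReplaceWithFused(&add, &fused);  // conv now feeds fused, not add
  return 0;
}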

@@ -20,102 +20,11 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
DEFINE_bool(enforce_when_check_program, true,
"Checking whether the program is correct or not. We will log "
"errors rather than throwing exceptions if this flag turned off");
namespace paddle {
namespace framework {
namespace ir {
namespace {
void CheckProgram(const ProgramDesc &program) {
#define _INT(role) static_cast<int>(role)
std::map<int, bool> visit;
for (OpDesc *op : program.Block(0).AllOps()) {
// For backward compatibility, some programs don't have roles added.
if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
int role_id =
boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
visit[role_id] = true;
switch (role_id) {
case _INT(OpRole::kForward):
if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
LOG(ERROR) << "Cannot add backward operator before forward operator "
<< op->Type();
}
break;
case _INT(OpRole::kBackward):
case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
if (!FLAGS_enforce_when_check_program) {
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add backward operator %s after optimize operator.",
op->Type());
} else {
if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
LOG(ERROR)
<< "Cannot add backward operator %s after optimize operator."
<< op->Type();
}
}
break;
case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
if (!FLAGS_enforce_when_check_program) {
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
_INT(OpRole::kLoss)) == visit.end(),
"Cannot add backward|loss operator before "
"forward|loss operator %s.",
op->Type());
PADDLE_ENFORCE(
visit.find(_INT(OpRole::kOptimize)) == visit.end(),
"Cannot add forward|loss operator %s after optimize operator.",
op->Type());
} else {
if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) !=
visit.end()) {
LOG(ERROR) << "Cannot add backward|loss operator before "
<< "forward|loss operator %s." << op->Type();
}
if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
LOG(ERROR) << "Cannot add forward|loss operator %s after optimize "
"operator."
<< op->Type();
}
}
break;
case _INT(OpRole::kOptimize):
case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
if (!FLAGS_enforce_when_check_program) {
PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
"Optimize operators %s must follow backward operator.",
op->Type());
} else {
if (visit.find(_INT(OpRole::kBackward)) == visit.end()) {
LOG(ERROR) << "Optimize operators %s must follow backward operator."
<< op->Type();
}
}
break;
case _INT(OpRole::kLRSched):
case _INT(OpRole::kDist):
case _INT(OpRole::kRPC):
case _INT(OpRole::kNotSpecified):
break;
default:
LOG(FATAL) << "Unknown operator role. Don't add new role because "
"you don't know what you are doing.";
}
}
#undef _INT
}
} // namespace
Graph::Graph(const ProgramDesc &program) : program_(program) {
CheckProgram(program_);
auto var_nodes = InitFromProgram(program_);
ResolveHazard(var_nodes);
}

@@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
return out_var;
}
std::unordered_set<std::string> conv_act_set({"identity", "sigmoid", "relu",
"relu6", "relux", "tanh",
"band_pass"});
std::unordered_set<std::string> conv_act_set({"identity", "relu"});
PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) {
conv_in->AsInput();
@@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
->AsInput();
auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
->assert_is_op_output("elementwise_add")
->assert_is_op_input("elementwise_add", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr())
->assert_is_op("elementwise_add");
auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_is_op_input("elementwise_add", "X")
->AsInput();
auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr())
->assert_is_op_output("elementwise_add")
@@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out});
elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y})
.LinksTo({elementwise_add_out});
elementwise_add_op_1->LinksFrom(
{elementwise_add_out, elementwise_add_in_y_1});
elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1})
.LinksTo({elementwise_add_out_1});
act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out});
return act_out;
}

@@ -215,8 +215,8 @@ class Vector {
auto stream = dev_ctx->stream();
void *src = gpu_->ptr();
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream);
paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream);
dev_ctx->Wait();
}
@@ -261,8 +261,8 @@ class Vector {
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream);
paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream);
}
void ImmutableCPU() const {
@@ -284,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable memory::AllocationPtr gpu_;
mutable paddle::memory::AllocationPtr gpu_;
mutable int flag_;
mutable std::mutex mtx_;
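Note: qualifying memory::Copy as paddle::memory::Copy sidesteps name-lookup ambiguity when another namespace named memory becomes visible (for instance via a using-directive in some translation units). A minimal hypothetical reproduction:

// Hypothetical minimal reproduction: with two namespaces named `memory`
// reachable, an unqualified memory::Copy fails to compile as ambiguous.
namespace paddle {
namespace memory {
inline void Copy() {}
}  // namespace memory
}  // namespace paddle

namespace memory {
inline void Copy() {}
}  // namespace memory

using namespace paddle;  // e.g. pulled in somewhere in a .cc file

int main() {
  // memory::Copy();      // error: reference to 'memory' is ambiguous
  paddle::memory::Copy();  // fully qualified: always unambiguous
  return 0;
}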

@@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namescope.")
.SetDefault("");
AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
"Callstack for Op Creatation.")
.SetDefault({});
Validate();
}

@@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker {
static const char *OpRoleAttrName() { return "op_role"; }
static const char *OpRoleVarAttrName() { return "op_role_var"; }
static const char *OpNamescopeAttrName() { return "op_namescope"; }
static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);

@@ -23,7 +23,8 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include "glog/logging.h" // For VLOG()
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "glog/logging.h" // For VLOG()
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/details/op_registry.h"
#include "paddle/fluid/framework/framework.pb.h"

@@ -16,15 +16,10 @@ limitations under the License. */
#include <glog/logging.h>
#include <algorithm>
#include <sstream>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
@@ -162,59 +157,27 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
}
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try {
if (VLOG_IS_ON(4)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
}
if (platform::is_gpu_place(place)) {
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot run operator on place %s", place);
PADDLE_THROW("Cannot run operator on place %s", place);
#else
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
auto dev_id = boost::get<platform::CUDAPlace>(place).device;
platform::SetDeviceId(dev_id);
#endif
}
// The profiler has a process-wide mutex, which causes a serious
// performance issue in concurrency scenarios.
// An `if` is used here to avoid taking it; please do not remove the
// `if`, and ask @Superjomn if there is any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
if (VLOG_IS_ON(3)) {
VLOG(3) << place << " " << DebugStringEx(&scope);
}
} catch (platform::EnforceNotMet exception) {
if (Attrs().count("sub_block") != 0) {
throw exception;
}
auto& callstack = Attr<std::vector<std::string>>(
OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
}
if (callstack.empty()) {
throw exception;
}
std::ostringstream sout;
sout << "Invoke operator " << Type() << " error.\n";
sout << "Python Callstacks: \n";
for (auto& line : callstack) {
sout << line;
}
sout << "C++ Callstacks: \n";
sout << exception.err_str_;
exception.err_str_ = sout.str();
throw exception;
} catch (...) {
std::rethrow_exception(std::current_exception());
// The profiler has a process-wide mutex, which causes a serious performance
// issue in concurrency scenarios. An `if` is used here to avoid it.
// Please do not remove the `if`; ask @Superjomn if there is any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place);
} else {
RunImpl(scope, place);
}
VLOG(3) << place << " " << DebugStringEx(&scope);
}
bool OperatorBase::HasInputs(const std::string& name) const {
@@ -417,7 +380,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
return &(var.Get<SelectedRows>().value());
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var.Type().name());
ToTypeName(var.Type()));
}
}
@@ -428,7 +391,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
return var->GetMutable<SelectedRows>()->mutable_value();
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
ToTypeName(var->Type()));
}
}
@@ -522,7 +485,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"should be LoDTensor, but the received type is %s",
var->Type().name());
ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
@@ -541,7 +504,7 @@ const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, var->Type().name());
sub_name, ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
@@ -570,7 +533,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, var->Type().name());
sub_name, ToTypeName(var->Type()));
return var->GetMutable<LoDTensor>();
});
return res;
@@ -812,7 +775,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
PADDLE_THROW(
"Only LoDTensor/SelectedRows support 'GetDim', but Variables "
"type_id is %s.",
var->Type().name());
ToTypeName(var->Type()));
}
}
@@ -835,7 +798,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else {
PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
ToTypeName(var->Type()));
}
}

@@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@";
/// e.g. Variable "x@GRAD" is the gradient of variable "x".
constexpr char kGradVarSuffix[] = "@GRAD";
constexpr size_t kGradVarSuffixSize = 5U;
/// Variables with this suffix are supposed to be filled up with zeros.
constexpr char kZeroVarSuffix[] = "@ZERO";
@@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
std::string result;
result.reserve(var_name.size() + kGradVarSuffixSize);
result += var_name;
result += kGradVarSuffix;
return result;
}
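Note: the new GradVarName reserves the exact final length before appending, so the result string is allocated once instead of potentially twice via operator+. The pattern in isolation (a sketch mirroring the code above, with stand-in names):

// Sketch of the single-allocation append pattern used by GradVarName:
// reserve the exact final size, then append both pieces.
#include <cstddef>
#include <string>

constexpr char kSuffix[] = "@GRAD";      // stand-in for kGradVarSuffix
constexpr std::size_t kSuffixSize = 5U;  // strlen("@GRAD")

std::string GradName(const std::string &var_name) {
  std::string result;
  result.reserve(var_name.size() + kSuffixSize);
  result += var_name;
  result += kSuffix;  // capacity already sufficient: no reallocation
  return result;
}

int main() { return GradName("x").size() == 6 ? 0 : 1; }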
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
@@ -110,8 +116,8 @@ class OperatorBase {
bool HasAttr(const std::string& name) const { return attrs_.count(name); }
template <typename T>
inline const T& Attr(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name);
PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(),
"%s should be in AttributeMap", name);
return boost::get<T>(attrs_.at(name));
}
const AttributeMap& Attrs() const { return attrs_; }

@@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices(
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
std::vector<void *> buffers;
buffers.reserve(member_->places_.size());
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
for (size_t i = 0; i < member_->places_.size(); ++i) {
@@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices(
#endif
} else {
platform::CPUPlace cpu;
for (size_t i = 0; i < member_->places_.size(); ++i) {
if (i == 0) continue;
for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();

@@ -47,15 +47,9 @@ DEFINE_bool(fast_eager_deletion_mode, false,
// the mutex causes a serious performance issue.
// So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE
#define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#define SCOPE_LOCK_GUARD
#else
#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
#endif
namespace paddle {
@@ -73,69 +67,64 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
Scope* child = new Scope(this);
{
SCOPE_KIDS_WRITER_LOCK
kids_.push_back(child);
}
return *child;
SCOPE_LOCK_GUARD
kids_.push_back(new Scope(this));
return *kids_.back();
}
Variable* Scope::Var(const std::string& name) {
SCOPE_VARS_WRITER_LOCK
SCOPE_LOCK_GUARD
return VarInternal(name);
}
Variable* Scope::Var(std::string* name) {
SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) {
*name = new_name;
}
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name);
}
Variable* Scope::FindVar(const std::string& name) const {
SCOPE_VARS_READER_LOCK
SCOPE_LOCK_GUARD
return FindVarInternal(name);
}
Variable* Scope::FindLocalVar(const std::string& name) const {
SCOPE_VARS_READER_LOCK
SCOPE_LOCK_GUARD
return FindVarLocally(name);
}
const Scope* Scope::FindScope(const Variable* var) const {
SCOPE_VARS_READER_LOCK
SCOPE_LOCK_GUARD
return FindScopeInternal(var);
}
void Scope::DropKids() {
SCOPE_KIDS_WRITER_LOCK
SCOPE_LOCK_GUARD
for (Scope* s : kids_) delete s;
kids_.clear();
}
bool Scope::HasKid(const Scope* scope) const {
SCOPE_KIDS_READER_LOCK
SCOPE_LOCK_GUARD
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end();
}
std::vector<std::string> Scope::LocalVarNames() const {
SCOPE_LOCK_GUARD
std::vector<std::string> known_vars;
{
SCOPE_VARS_READER_LOCK
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
known_vars.emplace_back(p.first);
}
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
known_vars.emplace_back(p.first);
}
return known_vars;
}
void Scope::DeleteScope(Scope* scope) const {
SCOPE_KIDS_WRITER_LOCK
SCOPE_LOCK_GUARD
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope);
@@ -149,8 +138,8 @@ void Scope::DeleteScope(Scope* scope) const {
}
void Scope::EraseVars(const std::vector<std::string>& var_names) {
SCOPE_LOCK_GUARD
std::set<std::string> var_set(var_names.begin(), var_names.end());
SCOPE_VARS_WRITER_LOCK
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
it = vars_.erase(it);
@@ -162,12 +151,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const {
SCOPE_VARS_WRITER_LOCK
SCOPE_LOCK_GUARD
RenameInternal(origin_name, new_name);
}
std::string Scope::Rename(const std::string& origin_name) const {
SCOPE_VARS_WRITER_LOCK
SCOPE_LOCK_GUARD
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name);
return new_name;
@@ -176,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const {
Variable* Scope::VarInternal(const std::string& name) {
auto* v = FindVarLocally(name);
if (v != nullptr) return v;
v = new Variable();
vars_[name].reset(v);
vars_.emplace(name, std::unique_ptr<Variable>(v));
VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v;
}

@@ -14,19 +14,12 @@ limitations under the License. */
#pragma once
extern "C" {
#include <xxhash.h>
}
#include <functional>
#include <list>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h"
@@ -138,8 +131,7 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope);
private:
mutable RWLock kids_lock_;
mutable RWLock vars_lock_;
mutable std::mutex mutex_;
};
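Note: the two reader-writer locks (kids_lock_, vars_lock_) collapse into one std::mutex, taken through SCOPE_LOCK_GUARD, which compiles away under PADDLE_ON_INFERENCE. A reduced sketch of that pattern with a hypothetical MiniScope (the real class is Scope above):

// Reduced sketch of the new locking scheme: one mutex behind a macro that
// compiles away in single-threaded inference builds.
#include <mutex>
#include <string>
#include <unordered_map>

#ifdef PADDLE_ON_INFERENCE
#define SCOPE_LOCK_GUARD
#else
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
#endif

class MiniScope {
 public:
  int *Var(const std::string &name) {
    SCOPE_LOCK_GUARD
    return &vars_[name];  // creates the entry if absent
  }

 private:
  std::unordered_map<std::string, int> vars_;
  mutable std::mutex mutex_;
};

int main() {
  MiniScope scope;
  *scope.Var("x") = 1;
  return 0;
}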
// Generate some debug string about the inheritance structure of scope, quite

@@ -19,52 +19,50 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
template <typename T>
inline bool IsType(const std::type_index& type_index) {
return type_index == std::type_index(typeid(T));
inline bool IsType(const std::type_index& type) {
return type == typeid(T);
}
inline proto::VarType::Type ToVarType(std::type_index type) {
if (IsType<LoDTensor>(type)) {
return proto::VarType_Type_LOD_TENSOR;
} else if (IsType<LoDRankTable>(type)) {
return proto::VarType_Type_LOD_RANK_TABLE;
} else if (IsType<LoDTensorArray>(type)) {
return proto::VarType_Type_LOD_TENSOR_ARRAY;
} else if (IsType<SelectedRows>(type)) {
return proto::VarType_Type_SELECTED_ROWS;
} else if (IsType<ReaderHolder>(type)) {
return proto::VarType_Type_READER;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
inline proto::VarType::Type ToVarType(int type) {
switch (type) {
case proto::VarType::LOD_TENSOR:
case proto::VarType::SELECTED_ROWS:
case proto::VarType::LOD_RANK_TABLE:
case proto::VarType::LOD_TENSOR_ARRAY:
case proto::VarType::READER:
return static_cast<proto::VarType::Type>(type);
default:
PADDLE_THROW("ToVarType:Unsupported type %d", type);
}
}
template <typename Visitor>
inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) {
case proto::VarType_Type_LOD_TENSOR:
switch (var.Type()) {
case proto::VarType::LOD_TENSOR:
visitor(var.Get<LoDTensor>());
return;
case proto::VarType_Type_LOD_RANK_TABLE:
case proto::VarType::LOD_RANK_TABLE:
visitor(var.Get<LoDRankTable>());
return;
case proto::VarType_Type_LOD_TENSOR_ARRAY:
case proto::VarType::LOD_TENSOR_ARRAY:
visitor(var.Get<LoDTensorArray>());
return;
case proto::VarType_Type_SELECTED_ROWS:
case proto::VarType::SELECTED_ROWS:
visitor(var.Get<SelectedRows>());
return;
case proto::VarType_Type_READER:
case proto::VarType::READER:
visitor(var.Get<ReaderHolder>());
return;
default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
}
}
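Note: with Variable::Type() now returning the proto::VarType::Type enum directly, visitation becomes a plain switch instead of a chain of std::type_index comparisons. A hedged standalone sketch of enum-based dispatch, using hypothetical stand-in types:

// Standalone sketch of enum-based dispatch, mirroring the switch above.
#include <iostream>

enum class VarKind { kLoDTensor, kSelectedRows };

struct LoDTensor {};
struct SelectedRows {};

template <typename Visitor>
void Visit(VarKind kind, Visitor visitor) {
  switch (kind) {
    case VarKind::kLoDTensor:
      visitor(LoDTensor{});
      return;
    case VarKind::kSelectedRows:
      visitor(SelectedRows{});
      return;
  }
}

int main() {
  Visit(VarKind::kLoDTensor, [](auto &&) { std::cout << "visited\n"; });
  return 0;
}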

@@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) {
op->InferVarType(prog.MutableBlock(0));
ASSERT_EQ(proto::VarType_Type_LOD_TENSOR,
ASSERT_EQ(proto::VarType::LOD_TENSOR,
prog.MutableBlock(0)->Var("test2_out")->GetType());
}

@@ -0,0 +1,119 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/macros.h"
#ifdef PADDLE_WITH_CUDA
#ifndef _WIN32
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#include <cudnn.h>
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
namespace paddle {
namespace framework {
// Besides registering variable type id, it is helpful to register a
// var_id -> std::type_index map (for example, get type names according to id)
namespace detail {
template <int kStart, int kEnd, bool kStop>
struct VarIdToTypeIndexMapInitializerImpl {
template <typename MapType1, typename MapType2>
static void Init(MapType1 *id_to_type, MapType2 *type_to_id) {
using Type =
typename std::tuple_element<kStart, VarTypeRegistry::ArgTuple>::type;
static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
constexpr int kId = VarTypeTrait<Type>::kId;
auto type = std::type_index(typeid(Type));
PADDLE_ENFORCE(id_to_type->count(kId) == 0,
"Registered duplicate type id %d for type %s", kId,
type.name());
PADDLE_ENFORCE(type_to_id->count(type) == 0,
"Registered duplicate type_index %s for id %d", type.name(),
kId);
id_to_type->emplace(kId, type);
type_to_id->emplace(type, kId);
VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
kStart + 1 == kEnd>::Init(id_to_type,
type_to_id);
}
};
template <int kStart, int kEnd>
struct VarIdToTypeIndexMapInitializerImpl<kStart, kEnd, true> {
template <typename MapType1, typename MapType2>
static void Init(MapType1 *, MapType2 *) {}
};
// VarIdToTypeIndexMapInitializer is designed to initialize var_id ->
// std::type_index map and std::type_index -> var_id map
using VarIdToTypeIndexMapInitializer =
VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum,
VarTypeRegistry::kRegisteredTypeNum ==
0>;
struct VarIdToTypeIndexMapHolder {
DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder);
public:
static const std::type_index &ToTypeIndex(int var_id) {
auto it = Instance().id_to_type_map_.find(var_id);
PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
"VarId %d is not registered.", var_id);
return it->second;
}
static int ToTypeId(const std::type_index &type) {
auto it = Instance().type_to_id_map_.find(type);
PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
"VarType %s is not registered.", type.name());
return it->second;
}
private:
VarIdToTypeIndexMapHolder() {
VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_);
}
static const VarIdToTypeIndexMapHolder &Instance() {
static const VarIdToTypeIndexMapHolder instance;
return instance;
}
std::unordered_map<int, std::type_index> id_to_type_map_;
std::unordered_map<std::type_index, int> type_to_id_map_;
};
} // namespace detail
const std::type_index &ToTypeIndex(int var_id) {
return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id);
}
const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); }
int ToTypeId(const std::type_index &type) {
return detail::VarIdToTypeIndexMapHolder::ToTypeId(type);
}
} // namespace framework
} // namespace paddle
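Note: VarIdToTypeIndexMapInitializerImpl walks the registry's type tuple recursively at compile time and fills both hash maps once, so ToTypeName/ToTypeId are O(1) lookups afterwards. A standalone reduction of the recursive initializer, with a stand-in type list instead of VarTypeRegistry:

// Standalone reduction of the recursive initializer: walk a std::tuple of
// registered types at compile time, recording id -> std::type_index.
#include <iostream>
#include <string>
#include <tuple>
#include <typeindex>
#include <unordered_map>

using Registry = std::tuple<int, double, std::string>;  // stand-in type list

template <std::size_t I, std::size_t N, bool kStop>
struct InitImpl {
  static void Init(std::unordered_map<int, std::type_index> *id_to_type) {
    using T = typename std::tuple_element<I, Registry>::type;
    id_to_type->emplace(static_cast<int>(I), std::type_index(typeid(T)));
    InitImpl<I + 1, N, I + 1 == N>::Init(id_to_type);
  }
};

template <std::size_t I, std::size_t N>
struct InitImpl<I, N, true> {  // recursion terminator
  static void Init(std::unordered_map<int, std::type_index> *) {}
};

int main() {
  std::unordered_map<int, std::type_index> id_to_type;
  InitImpl<0, std::tuple_size<Registry>::value, false>::Init(&id_to_type);
  std::cout << id_to_type.at(1).name() << std::endl;  // double's type name
  return 0;
}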

Some files were not shown because too many files have changed in this diff.
