merge develop

test=develop
7 years ago · db2daefe50
parent 8b9d33fa1e f95ee9c09f
commit db2daefe50
122 changed files with 2083 additions and 361 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -54,7 +54,7 @@ option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
 option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
-option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
@ -254,6 +254,12 @@ elseif()
    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()

+# Profiler builds depend on gperftools. REQUIRED makes the configure step fail
+# when the package cannot be located; on success its header directory is added
+# to the include path and WITH_GPERFTOOLS is defined, both at directory scope.
+if (WITH_PROFILER)
+    find_package(Gperftools REQUIRED)
+    add_definitions(-DWITH_GPERFTOOLS)
+    include_directories("${GPERFTOOLS_INCLUDE_DIR}")
+endif()
+
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
--- a/README.md
+++ b/README.md
@ -2,8 +2,8 @@


 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)

@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.


-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
+### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.1.0.post87
+pip install paddlepaddle-gpu==1.2.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.1.0.post85
+pip install paddlepaddle-gpu==1.2.0.post85

 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85

 ## Installation

-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.

 ## Documentation

-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.

 - [Deep Learning 101](https://github.com/PaddlePaddle/book)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)

  You can run distributed training jobs on MPI clusters.

- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)

   We appreciate your contributions!

--- a/cmake/FindGperftools.cmake
+++ b/cmake/FindGperftools.cmake
@ -0,0 +1,63 @@
+# Tries to find Gperftools.
+#
+# Usage of this module as follows:
+#
+#     find_package(Gperftools)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  Gperftools_ROOT_DIR  Set this variable to the root installation of
+#                       Gperftools if the module has problems finding
+#                       the proper installation path.
+#
+# Variables defined by this module:
+#
+#  GPERFTOOLS_FOUND              System has Gperftools libs/headers
+#  GPERFTOOLS_LIBRARIES          The combined tcmalloc_and_profiler library
+#  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
+#
+# Imported targets defined by this module (each one is created only when its
+# own library was located):
+#
+#  gperftools::tcmalloc          The standalone tcmalloc library
+#  gperftools::profiler          The standalone profiler library
+
+find_library(GPERFTOOLS_TCMALLOC
+  NAMES tcmalloc
+  HINTS "${Gperftools_ROOT_DIR}/lib")
+
+find_library(GPERFTOOLS_PROFILER
+  NAMES profiler
+  HINTS "${Gperftools_ROOT_DIR}/lib")
+
+find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
+  NAMES tcmalloc_and_profiler
+  HINTS "${Gperftools_ROOT_DIR}/lib")
+
+find_path(GPERFTOOLS_INCLUDE_DIR
+  NAMES gperftools/heap-profiler.h
+  HINTS "${Gperftools_ROOT_DIR}/include")
+
+set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
+
+# The package counts as found when the combined library and the headers exist.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+  Gperftools
+  DEFAULT_MSG
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+
+mark_as_advanced(
+  Gperftools_ROOT_DIR
+  GPERFTOOLS_TCMALLOC
+  GPERFTOOLS_PROFILER
+  GPERFTOOLS_TCMALLOC_AND_PROFILER
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+
+# create IMPORTED targets
+#
+# Gperftools_FOUND only vouches for the combined library and the headers, so
+# the standalone tcmalloc / profiler libraries may still be missing. Each
+# target is created only when its own library was located (otherwise its
+# IMPORTED_LOCATION would be a "<var>-NOTFOUND" placeholder) and only when it
+# does not exist yet, so including this module twice stays harmless.
+if (Gperftools_FOUND)
+  if (GPERFTOOLS_TCMALLOC AND NOT TARGET gperftools::tcmalloc)
+    add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
+    set_target_properties(gperftools::tcmalloc PROPERTIES
+      IMPORTED_LOCATION "${GPERFTOOLS_TCMALLOC}"
+      INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  endif()
+  if (GPERFTOOLS_PROFILER AND NOT TARGET gperftools::profiler)
+    add_library(gperftools::profiler UNKNOWN IMPORTED)
+    set_target_properties(gperftools::profiler PROPERTIES
+      IMPORTED_LOCATION "${GPERFTOOLS_PROFILER}"
+      INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  endif()
+endif()
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -86,6 +86,7 @@ endif(NOT WITH_GOLANG)

 if(WITH_GPU)
    add_definitions(-DPADDLE_WITH_CUDA)
+    add_definitions(-DEIGEN_USE_GPU)

    FIND_PACKAGE(CUDA REQUIRED)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME)
  endif()
 endfunction(find_fluid_modules)

+
+# common_link(TARGET_NAME)
+#
+# Hook invoked by the target helper functions in this file (cc_library,
+# cc_binary, cc_test, nv_test, ...) to attach link dependencies that do not
+# come from a target's own DEPS list. Currently it links gperftools::profiler
+# when WITH_PROFILER is enabled and does nothing otherwise.
+function(common_link TARGET_NAME)
+  if (NOT WITH_PROFILER)
+    return()
+  endif()
+  # Keyword-less signature on purpose: the helpers calling this function link
+  # their DEPS with the plain form, and CMake rejects mixing plain and
+  # PUBLIC/PRIVATE/INTERFACE signatures on one target.
+  target_link_libraries(${TARGET_NAME} gperftools::profiler)
+endfunction()
+
+
 # find all third_party modules is used for paddle static library
 # for reduce the dependency when building the inference libs.
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
      endif()
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      common_link(${TARGET_NAME})
    endif()

    # cpplint code style
@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
  if(cc_binary_DEPS)
    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
  endif()
 endfunction(cc_binary)

@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
      target_link_libraries(${TARGET_NAME} ${win32_deps})
    endif(WIN32)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
    if(nv_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
 endfunction(nv_binary)
@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
    if(hip_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
 endfunction(hip_binary)
@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(hip_test)
@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
  endif()
  if(go_library_DEPS)
    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
  endif(go_library_DEPS)

  # The "source file" of the library is `${dummyfile}` which never
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -66,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@ -1,6 +1,7 @@
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
+add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -129,11 +129,13 @@ cc_test(version_test SRCS version_test.cc DEPS version)

 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)

-if(NOT WIN32)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-  shape_inference data_transform lod_tensor profiler)
-endif(NOT WIN32)
+# The nGraph bridge and operator libraries are built only when nGraph support
+# is enabled, and never on Windows.
+if(WITH_NGRAPH AND NOT WIN32)
+  cc_library(ngraph_bridge
+    SRCS ngraph_bridge.cc
+    DEPS operator framework_proto ngraph)
+  cc_library(ngraph_operator
+    SRCS ngraph_operator.cc
+    DEPS ngraph_bridge operator op_info device_context tensor scope glog
+         shape_inference data_transform lod_tensor profiler ngraph)
+endif()

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@ -169,11 +171,15 @@ if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
-  else(NOT WIN32)
+  # The executor links the nGraph libraries only for nGraph-enabled, non-Windows
+  # builds; every other configuration uses the plain dependency list below.
+  if(WITH_NGRAPH AND NOT WIN32)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
+  else()
    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif(NOT WIN32)
+  endif()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
  auto out_format =
      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));

-  void* in_data = GetDataFromTensor(in, in_type);
-
  // output tensor has the same dims as input. Reorder don't change dims
  out->Resize(in.dims());

-  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
-
-  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-  auto out_memory =
-      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+  if (in_format != out_format) {
+    void* in_data = GetDataFromTensor(in, in_type);
+    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());

-  platform::Reorder(in_memory, out_memory);
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto out_memory =
+        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);

+    platform::Reorder(in_memory, out_memory);
+  } else {
+    out->ShareDataWith(in);
+  }
  out->set_layout(out_layout);
  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
  out->set_format(memory::format::format_undef);
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
@ -26,6 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"

+#ifdef PADDLE_WITH_NGRAPH
+#include "paddle/fluid/framework/ngraph_operator.h"
+#endif
+
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
@ -88,11 +91,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
 static void EnableFusedOp(ExecutorPrepareContext* ctx) {
 #ifdef PADDLE_WITH_NGRAPH
  VLOG(3) << "use_ngraph=True";
-  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
  for (auto& interval : intervals) {
-    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
-                                       interval.at(0), interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
+                                     interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
  }
  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
    ctx->ops_.erase(it->at(0) + 1, it->at(1));
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "glog/logging.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {
@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
  return tensor;
 }

+// Looks up the variable named `var_name` through `scope` and returns a mutable
+// reference to the LoDTensor it holds. Enforces that the variable was found
+// and that it holds a LoDTensor; no other variable types are supported.
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
+  Variable* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var, "%s not in scope", var_name);
+  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  return *var->GetMutable<LoDTensor>();
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                            size_t index);

+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
    switch (role_id) {
      case _INT(OpRole::kForward):
        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-          LOG(ERROR)
-              << "Cannot add backward operator before forward operator %s."
-              << op->Type();
+          LOG(ERROR) << "Cannot add backward operator before forward operator "
+                     << op->Type();
        }
        break;
      case _INT(OpRole::kBackward):
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp()) {
      auto* op = n->Op();
-      if (n->RuntimeHasAttr("is_test")) {
+      if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
        op->SetAttr("is_test", true);
      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                 end(op_list)) {
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
      auto* op = node->Op();
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      if (op_name == "conv3") {
-        ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
+        ASSERT_FALSE(op->HasAttr("is_test"));
      } else {
-        ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
+        ASSERT_TRUE(op->HasAttr("is_test"));
        EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
      }
    }
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@ -25,12 +25,15 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
  const auto& op_types_list =
      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
-      if (op_types_list.empty()) {
-        n->Op()->SetAttr("use_mkldnn", true);
-      } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                           n->Name()) != op_types_list.end()) {
-        n->Op()->SetAttr("use_mkldnn", true);
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_mkldnn", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_mkldnn", true);
+        }
      }
    }
  }
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@ -30,28 +30,6 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
  return std::unique_ptr<Node>(new Node(name, type));
 }

-bool Node::RuntimeHasAttr(const std::string &name) const {
-  if (Op()->HasAttr(name)) {
-    return true;
-  } else {
-    auto &op_info = OpInfoMap::Instance();
-    auto op_type = Op()->Type();
-    if (op_info.Has(op_type)) {
-      auto op_info_ptr = op_info.Get(op_type);
-      if (op_info_ptr.HasOpProtoAndChecker()) {
-        const proto::OpProto &proto = op_info_ptr.Proto();
-        for (int i = 0; i != proto.attrs_size(); ++i) {
-          const proto::OpProto::Attr &attr = proto.attrs(i);
-          if (attr.name() == name) {
-            return true;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@ -108,18 +108,6 @@ class Node {
           Name().find(ir::Node::kControlDepVarName) != std::string::npos;
  }

-  // RuntimeHasAttr is different with HasAttr now.
-  // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
-  // thus, if stored program_desc_ are old which don't have an attr, a new
-  // library which adds the attr already will fail on this function.
-  // Details:
-  // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
-  // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
-  // problem.
-  // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
-  // RuntimeHasAttr.
-  bool RuntimeHasAttr(const std::string& name) const;
-
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;

--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
 #include <vector>
@ -27,14 +26,15 @@ namespace paddle {
 namespace framework {

 static std::shared_ptr<ngraph::Node> GetNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    const VariableNameMap& var_map,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  auto& var_names = var_map.at(prm);
+  auto& var_names = var_map.at(name);
  PADDLE_ENFORCE_EQ(var_names.size(), 1,
-                    "op %s prm %s expects one associated var", op->Type(), prm);
+                    "op %s name %s expects one associated var", op->Type(),
+                    name);
  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
    return (*ngb_node_map)[var_names[0]];
  } else {
@ -43,42 +43,42 @@ static std::shared_ptr<ngraph::Node> GetNode(
 }

 static std::shared_ptr<ngraph::Node> GetInputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+  return GetNode(op, name, op->Inputs(), ngb_node_map);
 }

 static std::shared_ptr<ngraph::Node> GetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+  return GetNode(op, name, op->Outputs(), ngb_node_map);
 }

 static void SetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<ngraph::Node> node,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  auto& var_names = op->Outputs().at(prm);
+  auto& var_names = op->Outputs().at(name);
  if (var_names.size() == 1) {
    (*ngb_node_map)[var_names[0]] = node;
  } else if (var_names.size() == 0) {
    (*ngb_node_map)[""] = node;
  } else {
-    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+    PADDLE_THROW("name %s has more than 1 var_names.", name);
  }
 }

 static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
-                      const std::string prm) {
+                      const std::string name) {
  auto& outputs = op->Outputs();
-  if (outputs.find(prm) == outputs.end()) return false;
-  return outputs.at(prm).size() > 0;
+  if (outputs.find(name) == outputs.end()) return false;
+  return outputs.at(name).size() > 0;
 }

 template <typename T>
@ -118,4 +118,3 @@ void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {

 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@ -14,8 +14,6 @@ limitations under the License. */

 #pragma once

-#ifdef PADDLE_WITH_NGRAPH
-
 #include <algorithm>
 #include <map>
 #include <string>
@ -53,4 +51,3 @@ class NgraphBridge {

 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #include <glog/logging.h>

 #include <algorithm>
@ -58,16 +57,16 @@ typedef enum {                /* nGraph support state on ops          */
 } op_state;

 // perform graph build through bridge and execute computation
-class NgraphOperator {
+class NgraphEngine {
 public:
-  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
-                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
-                          const std::unordered_map<
-                              std::string, ngraph::element::Type>& var_type_map,
-                          const std::unordered_set<std::string>& persist,
-                          const std::unordered_set<std::string>& fetches,
-                          const std::unordered_set<std::string>& post_op_inputs,
-                          op_state ng_op_state)
+  explicit NgraphEngine(const Scope& scope, const platform::Place& place,
+                        const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                        const std::unordered_map<
+                            std::string, ngraph::element::Type>& var_type_map,
+                        const std::unordered_set<std::string>& persist,
+                        const std::unordered_set<std::string>& fetches,
+                        const std::unordered_set<std::string>& post_op_inputs,
+                        op_state ng_op_state)
      : scope_(scope),
        place_(place),
        fused_ops_(ops),
@ -132,7 +131,7 @@ class NgraphOperator {
 };

 std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-FusedOperator::FusedOpIntervals(
+NgraphOperator::NgraphOpIntervals(
    std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
      intervals;
@ -185,7 +184,7 @@ FusedOperator::FusedOpIntervals(
  return intervals;
 }

-FusedOperator::FusedOperator(
+NgraphOperator::NgraphOperator(
    const ProgramDesc& prog, size_t block_id,
    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@ -215,7 +214,7 @@ FusedOperator::FusedOperator(
  Process();
 }

-void FusedOperator::Process() {
+void NgraphOperator::Process() {
  auto& bdesc = pdesc_.Block(block_);
  for (auto& var : bdesc.AllVars()) {
    if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
@ -251,8 +250,8 @@ void FusedOperator::Process() {
  }
 }

-void FusedOperator::RunImpl(const Scope& scope,
-                            const platform::Place& place) const {
+void NgraphOperator::RunImpl(const Scope& scope,
+                             const platform::Place& place) const {
  op_state ng_op_state = PARTIAL_TEST;
  auto& bdesc = pdesc_.Block(block_);
  for (auto* op : bdesc.AllOps()) {
@ -266,19 +265,19 @@ void FusedOperator::RunImpl(const Scope& scope,
    ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
  }

-  NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
-                           persistables_, fetches_, post_op_inputs_,
-                           ng_op_state);
-  ngraph_op.Run(scope, place);
+  NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_,
+                             persistables_, fetches_, post_op_inputs_,
+                             ng_op_state);
+  ngraph_engine.Run(scope, place);
 }

 std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-    NgraphOperator::func_cache_ = {};
+    NgraphEngine::func_cache_ = {};

-std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
+std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
    ngraph::runtime::Backend::create("CPU");

-void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
+void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
  op->RuntimeInferShape(scope_, place_);
  for (auto& var_name_item : op->Inputs()) {
    for (auto& var_name : var_name_item.second) {
@ -301,7 +300,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
  }
 }

-void NgraphOperator::BuildNgNodes() {
+void NgraphEngine::BuildNgNodes() {
  for (auto& var_name : var_out_) {
    if (var_node_map_->find(var_name) == var_node_map_->end()) {
      auto* var = scope_.FindVar(var_name);
@ -323,7 +322,7 @@ void NgraphOperator::BuildNgNodes() {
  }
 }

-void NgraphOperator::BuildNgIO() {
+void NgraphEngine::BuildNgIO() {
  std::unordered_set<std::string> inputs;
  std::unordered_set<std::string> outputs;

@ -395,7 +394,7 @@ void NgraphOperator::BuildNgIO() {
  }
 }

-void NgraphOperator::BuildNgFunction() {
+void NgraphEngine::BuildNgFunction() {
  BuildNgNodes();
  ngraph_function_ = nullptr;
  ngraph::NodeVector func_outputs;
@ -416,7 +415,7 @@ void NgraphOperator::BuildNgFunction() {
      std::make_shared<ngraph::Function>(func_outputs, func_inputs);
 }

-std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
+std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
  auto cache_key = std::make_shared<std::string>("");
  *cache_key += std::to_string(fused_ops_.size());
  for (auto& op : fused_ops_) {
@ -444,7 +443,7 @@ std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
  return cache_key;
 }

-void NgraphOperator::GetNgFunction() {
+void NgraphEngine::GetNgFunction() {
  bool cache_on = true;
  if (cache_on) {
    std::string cache_key_val = *GetCacheKey();
@ -459,8 +458,7 @@ void NgraphOperator::GetNgFunction() {
  }
 }

-void NgraphOperator::Run(const Scope& scope,
-                         const platform::Place& place) const {
+void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;

@ -545,7 +543,6 @@ void NgraphOperator::Run(const Scope& scope,
  }

  backend_->call(ngraph_function_, t_out, t_in);
-}  // NgraphOperator::RunImpl
+}  // NgraphEngine::Run
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@ -14,8 +14,6 @@ limitations under the License. */

 #pragma once

-#ifdef PADDLE_WITH_NGRAPH
-
 #include <algorithm>
 #include <string>
 #include <unordered_map>
@ -34,14 +32,14 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class FusedOperator : public OperatorBase {
+class NgraphOperator : public OperatorBase {
 public:
  static std::vector<
      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-  FusedOpIntervals(
+  NgraphOpIntervals(
      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);

-  explicit FusedOperator(
+  explicit NgraphOperator(
      const ProgramDesc& prog, size_t block_id,
      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@ -64,4 +62,3 @@ class FusedOperator : public OperatorBase {
 };
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@ -239,6 +239,23 @@ void OpDesc::SetOutput(const std::string &param_name,
  this->outputs_[param_name] = args;
 }

+// Reports whether the OpProto found through OpInfoMap for this op's type
+// declares an attribute called `name`.
+//
+// Returns false when the type is not present in OpInfoMap, when its entry has
+// no proto/checker, or when no declared attribute matches `name`.
+bool OpDesc::HasProtoAttr(const std::string &name) const {
+  auto &op_info_map = OpInfoMap::Instance();
+  if (!op_info_map.Has(desc_.type())) {
+    return false;
+  }
+  // Bind by const reference: the entry is only inspected here, so there is no
+  // reason to copy whatever Get() hands back on every query.
+  const auto &op_info = op_info_map.Get(desc_.type());
+  if (!op_info.HasOpProtoAndChecker()) {
+    return false;
+  }
+  const proto::OpProto &proto = op_info.Proto();
+  for (int i = 0; i != proto.attrs_size(); ++i) {
+    if (proto.attrs(i).name() == name) {
+      return true;
+    }
+  }
+  return false;
+}
+
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@ -65,6 +65,8 @@ class OpDesc {
    return attrs_.find(name) != attrs_.end();
  }

+  bool HasProtoAttr(const std::string &name) const;
+
  proto::AttrType GetAttrType(const std::string &name) const;

  std::vector<std::string> AttrNames() const;
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -30,13 +30,36 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"

+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+// File name handed to gperftools' ProfilerStart() when a ParallelExecutor is
+// constructed. An empty value (the default) leaves profiling off.
+DEFINE_string(pe_profile_fname, "",
+              "Profiler filename for PE, which is generated by gperftools. "
+              "Only valid when compiled `WITH_PROFILER=ON`. "
+              "Empty if disabled.");
+
 namespace paddle {
 namespace framework {

+static std::once_flag gProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gProfileStarted = false;
+#endif
 class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {}
+      : places_(places) {  // also starts gperftools profiling if requested
+    if (!FLAGS_pe_profile_fname.empty()) {  // empty flag => no profiling
+      std::call_once(gProfileOnce, [] {  // body runs at most once
+#ifdef WITH_GPERFTOOLS
+        ProfilerStart(FLAGS_pe_profile_fname.c_str());
+        gProfileStarted = true;  // read by Run() to decide on ProfilerFlush()
+#else
+        LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                        "FLAGS_pe_profile_fname will be ignored";
+#endif
+      });
+    }
+  }

  ~ParallelExecutorPrivate() {
    if (own_local_scope_) {
@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices(

 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
+#ifdef WITH_GPERFTOOLS
+  if (gProfileStarted) {
+    ProfilerFlush();
+  }
+#endif
+
  platform::RecordBlock b(0);
 #ifdef PADDLE_WITH_CUDA
  if (!gcs_.empty()) {
--- a/Show More
+++ b/Show More