Feature/mem opt pass refactor (#18735)

* first version memory optimize pass, test=develop

* remove move_tensor_sharing_pass, test=develop

* refine code comments, add unittests, test=develop

* turn off memory_optimize by default, test=develop

* follow huihuang's comments, test=develop

* follow chengduoZH's comments, test=develop

* fix grammar error, add const qualifier, fix pass_test exception message, test=develop

* follow chengduoZH's comments 2nd, test=develop
Zeng Jinle authored 6 years ago, committed by GitHub
parent c5f47c2107
commit a802da650b

@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope

@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h"

@ -108,11 +108,6 @@ struct BuildStrategy {
// FLAGS_use_mkldnn=false
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4
// to open them by default, we need to solve the fetch variable issue
// TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs,
// it is not appropriate, because kStaleProgramOpDescs will be removed in the
// near future.
bool memory_optimize_{false};
// Turn on inplace by default.

@ -108,6 +108,8 @@ class OpHandleBase {
ir::Node *Node() { return node_; }
const ir::Node *Node() const { return node_; }
void SetLocalExecScopes(
const std::unordered_map<Scope *, Scope *> &scope_map);

@ -0,0 +1,126 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
// TODO(zjl): support SelectedRows
static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
}
}
static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
}
}
ShareTensorBufferFunctor::ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names)
: scope_(scope),
scope_idx_(scope_idx),
op_type_(op_type),
in_var_infos_(in_var_infos),
out_var_names_(out_var_names) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
}
}
std::unordered_map<std::string, std::string>
ShareTensorBufferFunctor::ReusedVars() const {
std::unordered_map<std::string, std::string> result;
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
result.insert({in_var_infos_[i]->Name(), out_var_names_[i]});
}
return result;
}
void ShareTensorBufferFunctor::AddReuseVarPair(
const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
"in/out cannot have same name: %s", out_var_name);
in_var_infos_.emplace_back(in_var_info);
out_var_names_.emplace_back(out_var_name);
}
void ShareTensorBufferFunctor::CallOnce() {
PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NE(in_var, out_var);
in_out_vars_.emplace_back(in_var, out_var);
}
}
void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
if (!exec_scope_) {
PADDLE_ENFORCE_NOT_NULL(exec_scope);
exec_scope_ = exec_scope;
CallOnce();
} else {
PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
}
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
const auto &in_tensor = GetTensorFromVar(in_out_vars_[i].first);
auto *out_tensor = GetMutableTensorFromVar(in_out_vars_[i].second);
auto *in_var_info = in_var_infos_[i];
if (UNLIKELY(in_var_info->IsSkipped())) {
// If in_var was inplaced in the previous batch and we want to fetch
// in_var in the current batch, we have to reset the memory of out_var
// to avoid a wrong calculation result.
if (in_tensor.Holder() == out_tensor->Holder()) {
VLOG(1) << "Clear " << out_var_names_[i]
<< " because you may want to fetch an inplaced variable "
<< in_var_info->Name()
<< " in previous batch: " << in_var_info->Name() << " -> "
<< out_var_names_[i];
out_tensor->clear();
}
} else {
out_tensor->ShareBufferWith(in_tensor);
VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
<< in_var_info->Name() << " -> " << out_var_names_[i];
}
}
}
} // namespace details
} // namespace framework
} // namespace paddle
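
For readers skimming the diff, the per-variable logic in operator() above boils down to a share-or-clear rule. The following is a minimal standalone sketch of that rule, not Paddle code: MiniTensor, its holder field, and ShareOrClear are invented stand-ins for Tensor::Holder(), Tensor::ShareBufferWith(), and Tensor::clear().

#include <cassert>
#include <memory>

struct MiniTensor {
  std::shared_ptr<int> holder;  // stands in for the underlying allocation
  void ShareBufferWith(const MiniTensor &other) { holder = other.holder; }
  void Clear() { holder.reset(); }
};

// If reuse of `in` is skipped (e.g. the user wants to fetch it) and `out`
// still aliases `in` from a previous batch, drop the alias; otherwise alias.
void ShareOrClear(const MiniTensor &in, MiniTensor *out, bool in_is_skipped) {
  if (in_is_skipped) {
    if (in.holder == out->holder) out->Clear();
  } else {
    out->ShareBufferWith(in);
  }
}

int main() {
  MiniTensor in{std::make_shared<int>(1)};
  MiniTensor out;
  ShareOrClear(in, &out, /*in_is_skipped=*/false);
  assert(out.holder == in.holder);  // buffers are now shared
  ShareOrClear(in, &out, /*in_is_skipped=*/true);
  assert(out.holder == nullptr);    // alias dropped so a fetch sees fresh memory
  return 0;
}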

@ -0,0 +1,73 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
namespace details {
// NOTE(paddle-dev): ShareTensorBufferFunctor is responsible for
// performing memory reuse at run time. ShareTensorBufferOpHandle
// is only a wrapper around ShareTensorBufferFunctor.
// If the run-time memory reuse strategy turns out to be time-consuming
// in scheduling, we may need a pass that moves ShareTensorBufferFunctor
// into each ComputationOpHandle. ShareTensorBufferFunctor is kept
// separate to make this possible move easier.
class ShareTensorBufferFunctor {
public:
ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names);
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name);
void operator()(Scope *exec_scope);
std::unordered_map<std::string, std::string> ReusedVars() const;
size_t GetScopeIdx() const { return scope_idx_; }
Scope *GetScope() { return scope_; }
private:
void CallOnce();
private:
Scope *scope_;
Scope *exec_scope_{nullptr};
size_t scope_idx_;
std::string op_type_;
std::vector<const ir::MemOptVarInfo *> in_var_infos_;
std::vector<std::string> out_var_names_;
std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
};
} // namespace details
} // namespace framework
} // namespace paddle
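
A rough way to read the NOTE above: the reuse logic is a callable that only needs the execution scope, so it can be stored either in a dedicated op handle (the current design) or inside the computation op itself (the possible future move). The sketch below models that wrapping pattern with invented types (FakeScope, ReuseOpHandle, ComputeOpHandle); it is illustration only, not Paddle code.

#include <functional>
#include <iostream>

struct FakeScope { int id; };

// Stand-in for ShareTensorBufferFunctor: a callable over the execution scope.
using ReuseFunctor = std::function<void(FakeScope *)>;

// Stand-in for ShareTensorBufferOpHandle: a thin wrapper that just stores and
// invokes the functor, mirroring RunImpl() { functor_(local_exec_scopes_[0]); }.
struct ReuseOpHandle {
  ReuseFunctor functor;
  void Run(FakeScope *scope) { functor(scope); }
};

// Stand-in for a ComputationOpHandle that runs the functor right before its
// own computation, which is the possible move the NOTE describes.
struct ComputeOpHandle {
  ReuseFunctor pre_run_reuse;  // optional memory-reuse hook
  void Run(FakeScope *scope) {
    if (pre_run_reuse) pre_run_reuse(scope);
    std::cout << "compute in scope " << scope->id << "\n";
  }
};

int main() {
  ReuseFunctor share = [](FakeScope *s) {
    std::cout << "share tensor buffers in scope " << s->id << "\n";
  };
  FakeScope scope{0};

  ReuseOpHandle as_op{share};    // current design: a separate op handle
  as_op.Run(&scope);

  ComputeOpHandle fused{share};  // possible future design: fused into compute op
  fused.Run(&scope);
  return 0;
}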

@ -25,55 +25,42 @@ namespace paddle {
namespace framework {
namespace details {
// TODO(zjl): support SelectedRows
static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
}
}
ComputationOpHandle *GetUniquePendingComputationOpHandle(
ShareTensorBufferOpHandle *share_tensor_op) {
ComputationOpHandle *result_op = nullptr;
for (ir::Node *out_var : share_tensor_op->Node()->outputs) {
for (ir::Node *pending_op : out_var->outputs) {
auto &op = pending_op->Wrapper<OpHandleBase>();
auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
PADDLE_ENFORCE_NOT_NULL(compute_op);
static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
if (result_op == nullptr) {
result_op = compute_op;
} else {
PADDLE_ENFORCE_EQ(result_op, compute_op);
}
}
}
PADDLE_ENFORCE_NOT_NULL(result_op);
return result_op;
}
ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<ir::MemOptVarInfo *> &in_var_infos,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names)
: OpHandleBase(node),
scope_(scope),
scope_idx_(scope_idx),
op_type_(op_type),
in_var_infos_(in_var_infos),
out_var_names_(out_var_names) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
Add(in_var_infos_[i], out_var_names_[i]);
}
}
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
std::unordered_set<std::string> ShareTensorBufferOpHandle::ReusedVarSet()
const {
std::unordered_set<std::string> result;
for (auto &in_var_info : in_var_infos_) {
result.insert(in_var_info->Name());
}
return result;
std::unordered_map<std::string, std::string>
ShareTensorBufferOpHandle::ReusedVars() const {
return functor_.ReusedVars();
}
void ShareTensorBufferOpHandle::Add(ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name) {
PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
"in/out cannot have same name: %s", out_var_name);
in_var_infos_.emplace_back(in_var_info);
out_var_names_.emplace_back(out_var_name);
void ShareTensorBufferOpHandle::AddReuseVarPair(
const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
functor_.AddReuseVarPair(in_var_info, out_var_name);
}
void ShareTensorBufferOpHandle::InitCUDA() {
@ -84,49 +71,7 @@ void ShareTensorBufferOpHandle::InitCUDA() {
#endif
}
void ShareTensorBufferOpHandle::CallOnce() {
PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
Scope *exec_scope = local_exec_scopes_[0];
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
auto *in_var = exec_scope->FindVar(in_var_infos_[i]->Name());
auto *out_var = exec_scope->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NE(in_var, out_var);
in_out_vars_.emplace_back(in_var, out_var);
}
}
void ShareTensorBufferOpHandle::RunImpl() {
if (in_var_infos_.size() != in_out_vars_.size()) {
CallOnce();
}
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
const auto &in_tensor = GetTensorFromVar(in_out_vars_[i].first);
auto *out_tensor = GetMutableTensorFromVar(in_out_vars_[i].second);
auto *in_var_info = in_var_infos_[i];
if (UNLIKELY(in_var_info->IsSkipped())) {
// If in_var is inplaced in the previous batch and we want to fetch
// in_var in the current batch, we have to reset memory of out_var
// to avoid wrong calculation result.
if (in_tensor.Holder() == out_tensor->Holder()) {
VLOG(1) << "Clear " << out_var_names_[i]
<< " because you may want to fetch an inplaced variable "
<< in_var_info->Name()
<< " in previous batch: " << in_var_info->Name() << " -> "
<< out_var_names_[i];
out_tensor->clear();
}
} else {
out_tensor->ShareBufferWith(in_tensor);
VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
<< in_var_info->Name() << " -> " << out_var_names_[i];
}
}
}
void ShareTensorBufferOpHandle::RunImpl() { functor_(local_exec_scopes_[0]); }
} // namespace details
} // namespace framework

@ -14,22 +14,15 @@
#pragma once
#include <string>
#include <unordered_set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
namespace paddle {
namespace framework {
class Variable;
class Scope;
class Tensor;
namespace ir {
class MemOptVarInfo;
} // namespace ir
namespace details {
class ShareTensorBufferOpHandle : public OpHandleBase {
@ -37,16 +30,19 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
ShareTensorBufferOpHandle(
ir::Node *node, Scope *scope, size_t scope_idx,
const std::string &op_type,
const std::vector<ir::MemOptVarInfo *> &in_vars_infos,
const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
const std::vector<std::string> &out_var_names);
std::unordered_set<std::string> ReusedVarSet() const;
std::unordered_map<std::string, std::string> ReusedVars() const;
Priority GetPriority() const override { return Priority::kHighest; }
size_t GetScopeIdx() const { return scope_idx_; }
size_t GetScopeIdx() const { return functor_.GetScopeIdx(); }
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name);
void Add(ir::MemOptVarInfo *in_var_info, const std::string &ou_var_name);
const ShareTensorBufferFunctor &Functor() const { return functor_; }
protected:
std::string Name() const override { return "buffer_share"; }
@ -55,20 +51,17 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
void InitCUDA() override;
std::vector<Scope *> GetLocalScopes() override { return {scope_}; }
std::vector<Scope *> GetLocalScopes() override {
return {functor_.GetScope()};
}
private:
void CallOnce();
Scope *scope_;
size_t scope_idx_;
std::string op_type_;
std::vector<ir::MemOptVarInfo *> in_var_infos_;
std::vector<std::string> out_var_names_;
std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
ShareTensorBufferFunctor functor_;
};
ComputationOpHandle *GetUniquePendingComputationOpHandle(
ShareTensorBufferOpHandle *share_tensor_op);
} // namespace details
} // namespace framework
} // namespace paddle

@ -21,6 +21,7 @@
#include <utility>
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
@ -74,12 +75,16 @@ struct VarHandleBase {
OpHandleBase* GeneratedOp() { return generated_op_; }
const OpHandleBase* GeneratedOp() const { return generated_op_; }
const std::unordered_set<OpHandleBase*>& PendingOps() const {
return pending_ops_;
}
ir::Node* Node() { return node_; }
const ir::Node* Node() const { return node_; }
protected:
// The operator who generate this variable. nullptr if the variable
// is a root node.
@ -96,6 +101,9 @@ struct VarHandleBase {
//
// NOTE: runtime variables have place.
struct VarHandle : public VarHandleBase {
DISABLE_COPY_AND_ASSIGN(VarHandle);
public:
virtual ~VarHandle();
std::string DebugString() const override;

@ -19,6 +19,7 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"

@ -22,3 +22,4 @@ cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pas
cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass)
cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass)

@ -50,11 +50,11 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
for (auto &pair : each_scope_ops) {
// If variable has more than 1 last lived ops, this variable cannot
// be inplaced.
if (pair.second.size() != 1) {
if (pair.second.ops().size() != 1) {
continue;
}
auto *op = *(pair.second.begin());
auto *op = *(pair.second.ops().begin());
const std::string &op_type = op->GetOp()->Type();
const framework::OpDesc *op_desc = op->Node()->Op();
PADDLE_ENFORCE_NOT_NULL(op_desc);
@ -141,7 +141,7 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
<< out_var_handle_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString();
} else {
VLOG(4) << "Inplace failed in op " << op_type << ": "
VLOG(3) << "Inplace failed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name();
}

@ -205,7 +205,7 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
for (auto &var_ops_map : last_live_ops) {
for (auto &var_ops_pair : var_ops_map) {
const std::string &var_name = var_ops_pair.first;
for (auto *op : var_ops_pair.second) {
for (auto *op : var_ops_pair.second.ops()) {
op_vars_map[op].insert(var_name);
}
}

@ -19,6 +19,7 @@
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/op_info.h"

@ -36,8 +36,6 @@ namespace ir {
constexpr char kMemOptSkipVars[] = "@MEM_OPT_SKIP_VARS@";
typedef std::unordered_set<std::string> MemOptSkipVars;
constexpr char kUseCuda[] = "use_cuda";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): A ordered set for node reuse in memory optimize.

@ -81,18 +81,26 @@ class MemoryReusePass : public Pass {
bool TryReuseVar(details::VarHandle *in_var,
details::VarHandle *out_var) const;
std::unordered_set<ir::Node *> FindNodesByName(
const std::string &name, const std::vector<ir::Node *> &nodes) const;
bool IsInVarReusable(const details::VarHandle &in_var) const;
bool IsOutVarReusable(const details::VarHandle &out_var) const;
std::unordered_set<Node *> FindNodesByName(
const std::string &name, const std::vector<Node *> &nodes) const;
size_t ScopeNum() const { return all_vars_->size(); }
int64_t GetMemorySize(const details::VarHandle &var) const;
private:
VarDesc *GetVarDesc(details::VarHandle *var) const;
VarDesc *GetVarDesc(const details::VarHandle &var) const;
bool IsVarsReusable(details::VarHandle *in_var,
details::VarHandle *out_var) const;
bool IsVarPairReusable(const details::VarHandle &in_var,
const details::VarHandle &out_var) const;
bool IsVarAlreadyReused(details::VarHandle *var) const;
bool IsInVarAlreadyReused(const details::VarHandle &in_var) const;
bool IsOutVarAlreadyReused(const details::VarHandle &out_var) const;
details::ShareTensorBufferOpHandle *InsertShareTensorBufferOpHandleToGraph(
details::ComputationOpHandle *op) const;
@ -110,15 +118,19 @@ class MemoryReusePass : public Pass {
private:
mutable Graph *graph_;
mutable bool use_cuda_;
mutable details::GraphVars *all_vars_;
mutable MemOptVarInfoMapList *var_infos_;
mutable std::vector<LastLiveOpsOfVars> *last_live_ops_of_vars_;
mutable std::unordered_map<details::ComputationOpHandle *,
details::ShareTensorBufferOpHandle *>
ops_;
mutable std::vector<std::unordered_set<std::string>> reused_var_names_;
mutable std::vector<std::unordered_set<std::string>> reused_in_var_names_;
mutable std::vector<std::unordered_set<std::string>> reused_out_var_names_;
mutable std::vector<std::unordered_map<std::string, VarDesc *>> var_descs_;
};

@ -66,6 +66,24 @@ const std::unordered_set<details::OpHandleBase *> &OpGraphView::PendingOps(
return pending_ops_.at(op);
}
const std::unordered_set<details::OpHandleBase *> &OpGraphView::PrecedingOps(
details::OpHandleBase *op) const {
EnforceHasOp(op);
return preceding_ops_.at(op);
}
std::unordered_map<details::OpHandleBase *, size_t>
OpGraphView::GetPrecedingDepNum() const {
std::unordered_map<details::OpHandleBase *, size_t> result;
result.reserve(preceding_ops_.size());
for (auto &pair : preceding_ops_) {
result.emplace(pair.first, pair.second.size());
}
return result;
}
size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); }
} // namespace ir
} // namespace framework
} // namespace paddle

@ -33,13 +33,24 @@ class OpGraphView {
const std::unordered_set<details::OpHandleBase *> &PendingOps(
details::OpHandleBase *op) const;
const std::unordered_set<details::OpHandleBase *> &PrecedingOps(
details::OpHandleBase *op) const;
std::unordered_map<details::OpHandleBase *, size_t> GetPrecedingDepNum()
const;
bool HasOp(details::OpHandleBase *op) const;
size_t OpNumber() const;
// Use a visitor to visit all pending ops of op
// Stop when callback returns false
template <typename Callback>
bool VisitAllPendingOps(details::OpHandleBase *op, Callback &&callback) const;
template <typename Callback>
void BreadthFirstVisit(Callback &&callback) const;
private:
void Build(const std::vector<details::OpHandleBase *> &ops);
void EnforceHasOp(details::OpHandleBase *op) const;
@ -75,6 +86,52 @@ bool OpGraphView::VisitAllPendingOps(details::OpHandleBase *op,
return true;
}
template <typename Callback>
void OpGraphView::BreadthFirstVisit(Callback &&callback) const {
auto op_deps = GetPrecedingDepNum();
size_t op_num = op_deps.size();
std::unordered_set<details::OpHandleBase *> visited_ops;
std::queue<details::OpHandleBase *> ready_ops;
size_t num_calls = 0;
for (auto iter = op_deps.begin(); iter != op_deps.end();) {
if (iter->second != 0) {
++iter;
continue;
}
visited_ops.insert(iter->first);
ready_ops.push(iter->first);
callback(iter->first);
++num_calls;
op_deps.erase(iter++);
}
while (!ready_ops.empty()) {
auto *cur_op = ready_ops.front();
ready_ops.pop();
auto &pending_ops = PendingOps(cur_op);
for (auto *pending_op : pending_ops) {
if (visited_ops.count(pending_op) > 0) {
continue;
}
if (--op_deps.at(pending_op) == 0) {
visited_ops.insert(pending_op);
op_deps.erase(pending_op);
ready_ops.push(pending_op);
callback(pending_op);
++num_calls;
}
}
}
PADDLE_ENFORCE_EQ(num_calls, op_num, "There are unvisited ops");
PADDLE_ENFORCE_EQ(visited_ops.size(), op_num, "There are unvisited ops");
PADDLE_ENFORCE(op_deps.empty(), "There are unvisited ops");
}
} // namespace ir
} // namespace framework
} // namespace paddle
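
OpGraphView::BreadthFirstVisit above is essentially Kahn's topological traversal driven by per-op in-degree counts. The self-contained sketch below reproduces the same idea on a toy graph of integer op ids (the node ids and adjacency map are invented for illustration); the final assert mirrors the PADDLE_ENFORCE_EQ cycle check at the end of BreadthFirstVisit.

#include <cassert>
#include <iostream>
#include <queue>
#include <unordered_map>
#include <vector>

int main() {
  // Toy op graph: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3
  std::unordered_map<int, std::vector<int>> pending = {
      {0, {1, 2}}, {1, {3}}, {2, {3}}, {3, {}}};

  // Count preceding dependencies per node, like GetPrecedingDepNum().
  std::unordered_map<int, size_t> dep_num;
  for (auto &kv : pending) dep_num.emplace(kv.first, 0);
  for (auto &kv : pending)
    for (int succ : kv.second) ++dep_num[succ];

  std::queue<int> ready;
  for (auto &kv : dep_num)
    if (kv.second == 0) ready.push(kv.first);

  size_t num_calls = 0;
  while (!ready.empty()) {
    int cur = ready.front();
    ready.pop();
    std::cout << "visit op " << cur << "\n";  // this is where the callback runs
    ++num_calls;
    for (int succ : pending[cur])
      if (--dep_num[succ] == 0) ready.push(succ);
  }

  // A cycle would leave some op with a non-zero in-degree and never visited;
  // BreadthFirstVisit enforces the same invariant with PADDLE_ENFORCE_EQ.
  assert(num_calls == pending.size());
  return 0;
}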

@ -346,6 +346,8 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
// Seldomly, some vars may have no pending or preceding computation ops
// Just break;
if (status == LastLiveOpSearchStatus::kFailure) {
VLOG(1) << "Cannot find last live ops of variable " << var_name
<< " in scope " << (*iter)->scope_idx();
break;
}
@ -362,7 +364,9 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
var_infos[i][var_name].reset(
new MemOptVarInfo(var_name, result.size()));
last_live_ops_of_vars[i].emplace(var_name, std::move(result));
auto &last_live_ops_of_var = last_live_ops_of_vars[i][var_name];
last_live_ops_of_var.set_var(*iter);
*(last_live_ops_of_var.mutable_ops()) = std::move(result);
break;
}

@ -39,10 +39,28 @@ using GarbageCollectorMap =
const char kMemOptVarInfoMapList[] = "mem_opt_var_info_map_list";
const char kGarbageCollector[] = "garbage_collector";
const char kAllPlaces[] = "all_places";
const char kUseCuda[] = "use_cuda";
using LastLiveOpsOfVars =
std::unordered_map<std::string,
std::unordered_set<details::ComputationOpHandle *>>;
class LastLiveOpOfVarInfo {
public:
details::VarHandle *var() { return var_; }
void set_var(details::VarHandle *var) { var_ = var; }
const std::unordered_set<details::ComputationOpHandle *> &ops() const {
return ops_;
}
std::unordered_set<details::ComputationOpHandle *> *mutable_ops() {
return &ops_;
}
private:
details::VarHandle *var_{nullptr};
std::unordered_set<details::ComputationOpHandle *> ops_;
};
using LastLiveOpsOfVars = std::unordered_map<std::string, LastLiveOpOfVarInfo>;
const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
VarDesc *TryGetLatestVarDesc(const std::vector<details::VarHandle *> &vars);
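
The access-pattern change this new record implies is visible in the EagerDeletionPass hunk above (pair.second becomes pair.second.ops()). The sketch below is a self-contained model of that pattern with invented stand-ins (FakeOp, FakeVarHandle, LastLiveInfo) for the Paddle types; it is illustration only.

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

struct FakeOp { std::string name; };         // stands in for ComputationOpHandle
struct FakeVarHandle { std::string name; };  // stands in for details::VarHandle

struct LastLiveInfo {  // models the LastLiveOpOfVarInfo record above
  FakeVarHandle *var;
  std::unordered_set<FakeOp *> ops;
};

int main() {
  FakeOp relu{"relu"};
  FakeVarHandle x{"x"};

  std::unordered_map<std::string, LastLiveInfo> last_live_ops_of_vars;
  last_live_ops_of_vars["x"] = LastLiveInfo{&x, {&relu}};

  for (auto &pair : last_live_ops_of_vars) {
    // Before this commit callers iterated the raw op set (pair.second);
    // now they go through the accessor (pair.second.ops()) and can also
    // reach the owning VarHandle, which the memory reuse passes need.
    for (auto *op : pair.second.ops) {
      std::cout << pair.first << " is last used by " << op->name
                << " (var handle: " << pair.second.var->name << ")\n";
    }
  }
  return 0;
}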

@ -99,7 +99,7 @@ class Node {
// Test if the Node is wrapped by type T.
template <typename T>
bool IsWrappedBy() {
bool IsWrappedBy() const {
return std::type_index(typeid(T)) == wrapper_type_;
}

@ -36,7 +36,8 @@ Graph* Pass::Apply(Graph* graph) const {
ApplyImpl(graph);
// TODO(panyx0718): Add more verifications.
PADDLE_ENFORCE(!HasCircle(*graph),
"Illegal Pass. Generated graph shouldn't has cycle.");
"Illegal Pass %s. Generated graph shouldn't have cycle.",
Type());
PADDLE_ENFORCE(VarDescIsConsistency(*graph),
"The VarDescs of persistable variable are not consistency.");
applied_ = true;

@ -99,7 +99,7 @@ TEST(PassTest, TestPassAttrCheck) {
} catch (paddle::platform::EnforceNotMet e) {
exception = std::string(e.what());
}
ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos);
ASSERT_TRUE(exception.find("shouldn't have cycle") != exception.npos);
}
} // namespace ir

Some files were not shown because too many files have changed in this diff.