add details. test=develop

revert-15661-fix-cpu-broadcast
dzhwinter 6 years ago
parent bec68fa0b3
commit 04e9776aef

@@ -21,12 +21,13 @@ function(CheckCompilerCXX11Flag)
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
endif()
endif()
endif()
endif()
endfunction()
CheckCompilerCXX11Flag()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
# safe_set_flag
#
# Set a compile flag only if the compiler supports it

@@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope

@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);

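The "take ownership" comments above are the crux of this hunk: graph->Set() hands the attribute's lifetime to the graph, while pass->Erase() followed by pass->SetNotOwned() re-registers the same pointer without transferring ownership. A minimal sketch of that owned vs. not-owned attribute pattern, using a hypothetical generic AttrMap rather than Paddle's actual ir::Graph/Pass API:

#include <functional>
#include <map>
#include <string>

class AttrMap {
 public:
  template <typename T>
  void Set(const std::string& name, T* attr) {  // takes ownership
    attrs_[name] = attr;
    deleters_[name] = [attr] { delete attr; };
  }
  template <typename T>
  void SetNotOwned(const std::string& name, T* attr) {  // caller keeps ownership
    attrs_[name] = attr;
  }
  void Erase(const std::string& name) {
    auto it = deleters_.find(name);
    if (it != deleters_.end()) {  // free the attribute only if we own it
      it->second();
      deleters_.erase(it);
    }
    attrs_.erase(name);
  }
  ~AttrMap() {
    for (auto& kv : deleters_) kv.second();  // release remaining owned attrs
  }

 private:
  std::map<std::string, void*> attrs_;
  std::map<std::string, std::function<void()>> deleters_;
};

The Erase()-then-SetNotOwned() sequence in the hunk avoids ending up with two conflicting owners for the same pointer.
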
@@ -77,9 +77,6 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false};
bool memory_early_delete_{false};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_,
// and memory_early_delete_ true by default

@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
}
}
const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const {
const NodeSwapQueue InplacePass::TryInplaceModifyVar(
const std::string& var, const std::string& cache_var, const size_t& idx,
ir::Graph* graph) const {
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var);
SSANodePair swap_nodes;
NodeSwapQueue swap_nodes;
for (size_t i = idx; i < view_.AllOps().size(); ++i) {
auto* op = view_.AllOps()[i];
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
return swap_nodes;
}
void InplacePass::CommitModify(const SSANodePair& swap_nodes,
void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
ir::Graph* graph) const {
for (auto& pair : swap_nodes) {
auto *node = pair.first, *cache_node = pair.second;
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
}
}
void InplacePass::WithdrawModify(const SSANodePair& nodes,
void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
ir::Graph* graph) const {
for (auto& pair : nodes) {
auto *node = pair.first, *cache_node = pair.second;

@@ -56,7 +56,8 @@ class GraphView {
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
};
typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair;
// swap pairs in sequence
typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
class InplacePass : public ir::Pass {
public:
InplacePass();
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
void InitSSAGraphNodes() const;
private:
const SSANodePair TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const;
const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
const std::string& cache_var,
const size_t& idx,
ir::Graph* graph) const;
void CommitModify(const SSANodePair&, ir::Graph* graph) const;
void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;
void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const;
void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;
void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
const size_t& idx) const;

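The SSANodePair to NodeSwapQueue rename across these two hunks matches the pass's transactional flow: TryInplaceModifyVar collects (original, cache) node pairs in program order, and the caller then calls CommitModify to keep the swaps or WithdrawModify to roll them back. A hedged sketch of that protocol with simplified stand-in types, not the real ir::Node machinery:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

struct Node { std::string name; };  // stand-in for ir::Node

// Pairs are queued in the order the swaps were made, so they can be
// replayed (commit) or unwound (withdraw) deterministically.
using NodeSwapQueue = std::vector<std::pair<Node*, Node*>>;

// Speculatively rename each use of a var to its cache var, recording pairs.
NodeSwapQueue TryInplace(const std::vector<Node*>& uses,
                         const std::string& cache_var) {
  NodeSwapQueue swaps;
  for (Node* n : uses) swaps.emplace_back(n, new Node{cache_var});
  return swaps;
}

void Commit(const NodeSwapQueue& swaps, std::vector<Node*>* uses) {
  for (std::size_t i = 0; i < swaps.size(); ++i) {
    (*uses)[i] = swaps[i].second;  // adopt the cache node
    delete swaps[i].first;         // original node is no longer referenced
  }
}

void Withdraw(const NodeSwapQueue& swaps) {
  for (const auto& p : swaps) delete p.second;  // drop speculative nodes
}
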
@@ -1,117 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
std::queue<VarHandleBase*> queue;
queue.push(var_in);
do {
auto* var = queue.front();
queue.pop();
for (auto* op : var->PendingOps()) {
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
return compute_op;
}
for (auto* out_var : op->Outputs()) {
queue.push(out_var);
}
}
} while (!queue.empty());
return nullptr;
}
std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
unlived_vars.reserve(graph_pool.size());
for (auto& pair : graph_pool) {
unlived_vars.insert(std::make_pair(pair.first, pair.second));
}
auto compare_and_insert_early_delete_op = [&](
OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
if (unlived_vars.empty()) return;
// unlived vars can be deleted after the last op that uses them has finished.
auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
for (auto& var : vars) {
auto* var_handle = dynamic_cast<VarHandle*>(var);
auto var_name = var->Node()->Name();
auto& var_place = var_handle->place();
if (unlived_vars.count(var_name) == 0) continue;
if (!unlived_vars[var_name].empty()) {
if (compute_op != nullptr &&
unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
unlived_vars[var_name].erase(compute_op->Node()->Op());
}
continue;
}
if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
var_handle->Node()->IsCtrlVar())
continue;
// shamelessly copied from the reference count pass.
if (compute_op == nullptr) {
// use the next computation op's scope
compute_op = FindNextComputationOpHandle(var_handle);
}
auto* early_delete_node =
graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
auto* early_delete_handle = new EarlyDeleteOpHandle(
early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
if (compute_op->Outputs().empty()) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
compute_op->AddOutput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
}
early_delete_handle->AddInput(compute_op->Outputs().front());
VLOG(5) << "Add early delete op " << var_name << " to Operator "
<< compute_op->Name();
}
};
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
for (auto& op : all_ops) {
compare_and_insert_early_delete_op(op, op->Inputs());
compare_and_insert_early_delete_op(op, op->Outputs());
}
return graph;
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(memory_early_delete_pass,
paddle::framework::details::MemoryEarlyDeletePass)
.RequireGraphAttr(paddle::framework::details::kGraphNodePool)
.RequireGraphAttr(paddle::framework::details::kGarbageCollector);

@@ -1,32 +0,0 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/early_delete_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class MemoryEarlyDeletePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace details
} // namespace framework
} // namespace paddle

File diff suppressed because it is too large

@@ -17,6 +17,8 @@
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>
@@ -27,41 +29,41 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kFetchedVars[] = "fetched_vars";
constexpr char kGraphNodePool[] = "graph_node_pool";
constexpr char kAllOpDescs[] = "all_op_descs";
// NOTE(dzh): a variable and the operators that use it,
// for the early delete pass.
// Because the analysis var pass is built on ir::Node, which may be released
// or modified between passes, we use OpDesc* to mark ops.
using GraphNodePool = std::vector<
std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): by default, nodes are sorted in ascending order (by node byte size).
// in fluid, -1 means the batch_size is determined at runtime.
// nodes whose batch_size equals -1 always rank ahead of those whose does not.
// NOTE(dzh): an ordered set for node reuse in memory optimize.
// the ordered set sorts nodes in ascending order (by node byte size).
// in fluid, -1 means the batch_size, which is determined at runtime.
// So reuse only happens between nodes whose batch_sizes are both -1
// or both not.
//
// sort rules:
// rule 0 : smaller nodes rank in front.
// rule 1 : nodes whose batch_size equals -1 rank ahead of those whose does not.
//
// For example,
// node0[-1, 1], node1[-1, 1, 1], node2[1, 1], node3[1, 1024], ...
// O(1) insert, delete
class OrderedNodeList {
public:
using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
using Iter = typename std::list<NodePair>::iterator;
using ConstIter = typename std::list<NodePair>::const_iterator;
void Insert(ir::Node* var, ir::Node* op);
class OrderedSet {
public:
// nodes with the same name may exist in the pool.
using NodeVector = std::vector<ir::Node*>;
using Iter = typename std::list<NodeVector>::iterator;
using ConstIter = typename std::list<NodeVector>::const_iterator;
void Insert(ir::Node* var);
void Erase(ir::Node* var);
void Erase(const std::string& var);
bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
bool Has(const std::string& var) { return mark_table_.count(var); }
ir::Node* NodeMatch(ir::Node* var) const;
bool Has(ir::Node* var) const;
void Clear() {
mark_table_.clear();
nodes_.clear();
}
// find the best-fit node in the pool that matches var's shape.
ir::Node* FindBestFitNode(ir::Node* var) const;
// the map stores non-const iterators, so constness cannot be promised.
int GetIndex(ir::Node* var);
int GetNodeIndexInPool(ir::Node* var);
// dump every node in the pool to a string
std::string ToString() const;
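
A sketch of the two sort rules described in the note above, assuming shapes are plain std::vector<int64_t> with -1 marking a runtime-determined batch size. Matching the node0..node3 example, rule 1 is applied first (runtime-batch nodes group in front) and rule 0 orders by element count within each group; this is an illustration, not the actual OrderedSet comparator:

#include <cstdint>
#include <vector>

struct ShapeKey {
  std::vector<int64_t> shape;
  // rule 1: a leading -1 means the batch size is decided at runtime.
  bool HasRuntimeBatch() const { return !shape.empty() && shape[0] == -1; }
  // rule 0: compare by element count, treating -1 as 1 for sizing.
  int64_t NumElements() const {
    int64_t n = 1;
    for (int64_t d : shape) n *= (d == -1 ? 1 : d);
    return n;
  }
};

bool OrderedBefore(const ShapeKey& a, const ShapeKey& b) {
  if (a.HasRuntimeBatch() != b.HasRuntimeBatch())
    return a.HasRuntimeBatch();              // runtime-batch nodes first
  return a.NumElements() < b.NumElements();  // then smaller nodes first
}
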
@@ -69,18 +71,54 @@ class OrderedNodeList {
Iter end() { return nodes_.end(); }
ConstIter begin() const { return nodes_.begin(); }
ConstIter end() const { return nodes_.end(); }
size_t size() const { return nodes_.size(); }
void Clear() {
mark_table_.clear();
nodes_.clear();
}
size_t size() const { return nodes_.size(); }
private:
// for searching.
std::unordered_map<std::string, Iter> mark_table_;
// node pairs. var -> the ops that depend on the var
std::list<NodePair> nodes_;
// node pool
std::list<NodeVector> nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// IR Graph
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successor ops that use the output variables.
NodeListMap successors_;
// predecessor ops that generate the input variables.
NodeListMap predecessors_;
// variables live before the current op runs.
VarSetMap live_in_;
// variables live after the current op runs.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
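
ControlFlowGraph::LiveVariableAnalysis is the classic backward liveness dataflow over uses_ and defs_: live_out(op) is the union of live_in over op's successors, and live_in(op) = use(op) ∪ (live_out(op) \ def(op)), iterated until a fixed point. A self-contained sketch with simplified stand-in types:

#include <set>
#include <string>
#include <utility>
#include <vector>

using VarSet = std::set<std::string>;

struct Op {
  VarSet use, def;              // op inputs / op outputs
  std::vector<int> successors;  // indices of successor ops
};

void LivenessAnalysis(const std::vector<Op>& ops, std::vector<VarSet>* live_in,
                      std::vector<VarSet>* live_out) {
  live_in->assign(ops.size(), VarSet{});
  live_out->assign(ops.size(), VarSet{});
  bool changed = true;
  while (changed) {  // iterate to a fixed point
    changed = false;
    for (int i = static_cast<int>(ops.size()) - 1; i >= 0; --i) {
      VarSet out;  // live_out = union of successors' live_in
      for (int s : ops[i].successors)
        out.insert((*live_in)[s].begin(), (*live_in)[s].end());
      VarSet in = ops[i].use;  // live_in = use | (live_out - def)
      for (const auto& v : out)
        if (!ops[i].def.count(v)) in.insert(v);
      if (in != (*live_in)[i] || out != (*live_out)[i]) {
        (*live_in)[i] = std::move(in);
        (*live_out)[i] = std::move(out);
        changed = true;
      }
    }
  }
}

A variable is dead, and its memory reusable, at the first op after which it no longer appears in any live_out set, which is what the memory optimize pass exploits.
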
// validate whether a tensor can be reused or not
@@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node);
bool OpHasSubBlock(OpDesc* desc);
// node memory size in bytes
size_t NodeSizeInBytes(ir::Node* n);
size_t NodeSize(ir::Node* n);
// node memory size in bytes
size_t NodeSizeInBytes(const VarDesc&);
size_t NodeSize(const VarDesc&);
std::string DebugString(ir::Node* var);
// NOTE(dzhwinter):
// after node reuse, the replaced node's shape is
// different from its VarDesc, so we need to find the
// correct VarDesc in the Block.
VarDesc* FindVarDescInBlock(ir::Node* n);
static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
op1->Outputs() == op2->Outputs();
}
template <typename Container, typename Callback>
class FilterVariableImpl {
public:

File diff suppressed because it is too large

@@ -32,20 +32,15 @@
namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
class ControlFlowGraph;
class MemoryOptimizePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
// fill the variable map (var_nodes) by version.
void InitSSAGraphNodes() const;
private:
// update program descs
void RenameVarInGraphDesc(const std::string& var,
const std::string& cache_var, size_t idx) const;
@@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass {
private:
// Reuse Node Pool, Owned.
mutable OrderedNodeList pool_;
mutable OrderedSet pool_;
// control flow graph
mutable std::unique_ptr<ControlFlowGraph> cfg_;
// skip set
@@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
};
class ControlFlowGraph {
public:
ControlFlowGraph() = default;
// For the IR Graph in ParallelExecutor
explicit ControlFlowGraph(const ir::Graph& graph);
void LiveVariableAnalysis();
void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const;
std::vector<ir::Node*>& Ops();
// for ssa-graph nodes
ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
private:
void BuildCFGGraph();
void ConnectNodes();
using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
// successor ops that use the output variables.
NodeListMap successors_;
// predecessor ops that generate the input variables.
NodeListMap predecessors_;
// variables live before the current op runs.
VarSetMap live_in_;
// variables live after the current op runs.
VarSetMap live_out_;
VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs
std::vector<ir::Node*> ops_; // op sequence by topology sort
};
} // namespace details
} // namespace framework
} // namespace paddle

File diff suppressed because it is too large

@@ -17,6 +17,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {

@@ -21,8 +21,6 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
class SequentialExecutionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(

@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) &&
details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in);
details::NodeSize(out) <= details::NodeSize(in);
}
};

@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
graph = eager_deletion_pass->Apply(std::move(graph));
VLOG(10) << "EagerDeletionPass Applied";
if (build_strategy_.memory_early_delete_) {
auto early_delete_pass =
ir::PassRegistry::Instance().Get("memory_early_delete_pass");
early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
graph = early_delete_pass->Apply(std::move(graph));
}
VLOG(10) << "MemoryEarlyDeletePass Applied.";
}
return graph;
@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph));
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts(
@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework
} // namespace paddle
USE_PASS(memory_early_delete_pass);
USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);

@@ -22,11 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DECLARE_bool(benchmark);
DEFINE_bool(
eager_delete_scope, true,

@@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_bool(benchmark);
namespace paddle {
namespace memory {
@@ -198,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
<< string::HumanReadableSize(Used<platform::CUDAPlace>(place));
platform::SetDeviceId(cur_dev);
} else {
if (VLOG_IS_ON(3)) {
if (FLAGS_benchmark) {
allocation::GPUMemMonitor.Add(place.device, size);
}
if (FLAGS_init_allocated_mem) {
@@ -216,7 +217,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator(place.device)->Free(p);
if (VLOG_IS_ON(3)) {
if (FLAGS_benchmark) {
allocation::GPUMemMonitor.Minus(place.device, size);
}
#else

@@ -14,6 +14,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle {
namespace platform {

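Taken together, the scope.cc, malloc.cc, and place.cc hunks apply the standard gflags convention: DEFINE_bool generates the flag's storage in exactly one translation unit (now place.cc), and every other file that reads FLAGS_benchmark uses DECLARE_bool, which is just an extern declaration. A generic two-file sketch of the pattern (file names here are illustrative):

// flag_owner.cc -- the single translation unit that defines the flag
#include <gflags/gflags.h>
DEFINE_bool(benchmark, false, "Enable memory benchmark mode.");

// flag_user.cc -- any other file that only reads the flag
#include <gflags/gflags.h>
DECLARE_bool(benchmark);

void MaybeLogMemory() {
  if (FLAGS_benchmark) {
    // gate extra bookkeeping behind the flag, as malloc.cc now does
  }
}
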
@@ -1092,10 +1092,6 @@ All parameter, weight, gradient are variables in Paddle.
"is_distribution",
[](const BuildStrategy &self) { return self.is_distribution_; },
[](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
.def_property(
"memory_early_delete",
[](const BuildStrategy &self) { return self.memory_early_delete_; },
[](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
.def_property(
"enable_inplace",
[](const BuildStrategy &self) { return self.enable_inplace_; },

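The binding that remains uses pybind11's def_property, pairing a getter and a setter lambda so a C++ member reads like a Python attribute. A minimal standalone sketch with a hypothetical stand-in struct, not Paddle's actual BuildStrategy binding:

#include <pybind11/pybind11.h>
namespace py = pybind11;

struct Strategy {               // stand-in for BuildStrategy
  bool enable_inplace_{false};
};

PYBIND11_MODULE(demo, m) {
  py::class_<Strategy>(m, "BuildStrategy")
      .def(py::init<>())
      .def_property(
          "enable_inplace",
          [](const Strategy &self) { return self.enable_inplace_; },
          [](Strategy &self, bool b) { self.enable_inplace_ = b; });
}

From Python this reads naturally: s = demo.BuildStrategy(); s.enable_inplace = True.
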
@@ -148,7 +148,8 @@ class ParallelExecutor(object):
else framework.default_main_program()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if python memory optimize is turned on, turn off the inplace_pass.
build_strategy.enable_inplace = False if main._is_mem_optimized else True
if build_strategy.enable_inplace is None:
build_strategy.enable_inplace = False if main._is_mem_optimized else True
scope = scope if scope is not None else executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
