Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-ssa-graph-executor

test=develop
Branch: mixed_precision_init
Author: Qiao Longfei (6 years ago)
Commit: cf0511f21e

@@ -44,6 +44,7 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Sand3r- | Michal Gallus |
+| sfraczek | Sylwester Fraczek |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
@@ -54,6 +55,7 @@
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
 | wen-bo-yang | Wen-Bo Yang |
+| wojtuss | Wojciech Uss |
 | wwhu | Wei-Wei Hu |
 | xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |

@@ -3,8 +3,8 @@
 English | [简体中文](./README_cn.md)
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -18,7 +18,7 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -26,9 +26,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.2.0.post87
+pip install paddlepaddle-gpu==1.3.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.2.0.post85
+pip install paddlepaddle-gpu==1.3.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
 ## Installation
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
 ## Documentation
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
   You can run distributed training jobs on MPI clusters.
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
   Our new API enables much shorter programs.
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html)
   We appreciate your contributions!

@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
-### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
@@ -24,9 +24,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.2.0.post87
+pip install paddlepaddle-gpu==1.3.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.2.0.post85
+pip install paddlepaddle-gpu==1.3.0.post85
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```
@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
 ## 安装
-推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html)
 ## 文档
-我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和
-[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档
+我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)和
+[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档
 - [深度学习101](https://github.com/PaddlePaddle/book)
   或许您想从这个在线交互式书籍开始，可以在Jupyter Notebook中运行
-- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html)
   可以在MPI集群上运行分布式训练任务
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html)
   新的API支持代码更少更简洁的程序
-- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html)
   欢迎您的贡献!

@@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start
 paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False))
 paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
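
Note: the `CompiledProgram` constructor parameter is renamed from `program` to `program_or_graph`, so a `CompiledProgram` can now be built from either a `Program` or an `ir::Graph`. A minimal sketch of the user-facing call is shown below; it is illustrative only, and the layer and variable names are made up rather than taken from this commit:

```python
import paddle.fluid as fluid

# Tiny network; names are illustrative.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# CompiledProgram accepts a Program (or an ir::Graph) via `program_or_graph`,
# and with_data_parallel() wraps it for multi-device execution.
compiled = fluid.CompiledProgram(fluid.default_main_program()) \
    .with_data_parallel(loss_name=loss.name)
```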
@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
+paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None))
 paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
 paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
 paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
@@ -121,6 +121,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
 paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
+paddle.fluid.layers.sampled_softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0))
 paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
@@ -303,7 +304,7 @@ paddle.fluid.layers.reciprocal ArgSpec(args=['x', 'name'], varargs=None, keyword
 paddle.fluid.layers.square ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.softplus ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.softsign ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.uniform_random ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0))
 paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,))
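
Note: `uniform_random` now records concrete defaults (`dtype='float32'`, `min=-1.0`, `max=1.0`, `seed=0`) in the spec instead of `None`. A small illustrative call written against the signature above, not taken from this commit:

```python
import paddle.fluid as fluid

# Explicit arguments shown for clarity; with the new defaults this is
# equivalent to fluid.layers.uniform_random(shape=[2, 4]).
x = fluid.layers.uniform_random(shape=[2, 4], dtype='float32',
                                min=-1.0, max=1.0, seed=0)
```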

@@ -163,6 +163,20 @@ std::vector<OpDesc *> BlockDesc::AllOps() const {
   return res;
 }
 
+void BlockDesc::Clear() {
+  // clear all ops
+  ops_.clear();
+
+  // clear all vars which are not persistable
+  for (auto it = vars_.begin(); it != vars_.end();) {
+    if (it->second->Persistable()) {
+      ++it;
+    } else {
+      vars_.erase(it++);
+    }
+  }
+}
+
 void BlockDesc::Flush() {
   for (auto &op_desc : ops_) {
     op_desc->Flush();

@@ -97,6 +97,8 @@ class BlockDesc {
   std::vector<OpDesc *> AllOps() const;
 
+  void Clear();
+
   size_t OpSize() const { return ops_.size(); }
   OpDesc *Op(int idx) const { return ops_.at(idx).get(); }

@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   out_layout =
       out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
 
-  auto& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
-      pool.Get(expected_kernel_type.place_));
-  auto& cpu_engine = dev_ctx->GetEngine();
-
   std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
   std::vector<int> out_tz = in_tz;
@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: %s", in.type());
   memory::data_type out_type = in_type;
 
-  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
-  auto out_format =
-      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
 
-  if (in_format != out_format) {
+  // tempory mem pd fr out , to make reorder
+  auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
+      paddle::framework::vectorize2int(out->dims()),
+      mkldnn::memory::format::blocked, out_type);
+  if (in.get_mkldnn_prim_desc() != out_mem_pd) {
     void* in_data = GetDataFromTensor(in, in_type);
     auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
 
-    auto in_memory =
-        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-    auto out_memory =
-        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+    auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
+    auto out_memory = memory(out_mem_pd, out_data);
 
     platform::Reorder(in_memory, out_memory);
   } else {
     out->ShareDataWith(in);
   }
   out->set_layout(out_layout);
-  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
-  out->set_format(memory::format::format_undef);
 #endif
 }

@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
       // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
       // Just set layout/format. No real transform occur
+      out.ShareDataWith(input_tensor);
+      // TODO(jczaja): Remove that once all mkldnn ops
+      // are modified to work with mkldnn_blocked
+      auto mkldnn_fmt = [&](int rank) {
+        switch (rank) {
+          case 5:
+            return mkldnn::memory::format::ncdhw;
+          case 4:
+            return mkldnn::memory::format::nchw;
+          case 3:
+            return mkldnn::memory::format::ncw;
+          case 2:
+            return mkldnn::memory::format::nc;
+          case 1:
+            return mkldnn::memory::format::x;
+          default:
+            return mkldnn::memory::format::blocked;
+        }
+      };
 
-      auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
-                                                      ToMKLDNNFormat(lin));
+      auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
+          paddle::framework::vectorize2int(out.dims()),
+          mkldnn_fmt(out.dims().size()));
 
-      out.ShareDataWith(input_tensor);
-      out.set_layout(DataLayout::kMKLDNN);
-      out.set_format(out_format);
+      out.set_mkldnn_prim_desc(out_mem_pd);
 #endif
     } else {
       // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel

@@ -50,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
   std::unordered_map<std::string, int> vars;
   // TODO(gongwb): use graph topology sort to find the order of operators.
   //               Note that must assert topology sort is stable
-  auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
   for (auto* op_desc : ops) {
     auto outputs = op_desc->Outputs();
     for (auto& o_it : outputs) {
@@ -120,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
 REGISTER_PASS(all_reduce_deps_pass,
               paddle::framework::details::AllReduceDepsPass)
-    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);

@@ -136,14 +136,17 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     ir::Pass *multi_devices_pass;
     if (strategy_.is_distribution_) {
+      VLOG(3) << "multi device parameter server mode";
       multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
     } else if (strategy_.async_mode_) {
       multi_devices_pass = AppendPass("async_multi_devices_pass").get();
     } else {
       if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+        VLOG(3) << "multi devices collective mode with allreduce";
         multi_devices_pass =
             AppendPass("allreduce_mode_multi_devices_pass").get();
       } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+        VLOG(3) << "multi deivces collective mode with reduce";
         multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
       } else {
         PADDLE_THROW("Unknown reduce strategy.");
@@ -174,7 +177,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
 }
 
 std::unique_ptr<ir::Graph> BuildStrategy::Apply(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> graph,
+    const std::vector<platform::Place> &places,
     const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
     const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
@@ -185,7 +189,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
 
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
@@ -203,41 +206,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
-    } else if (pass->Type() == "memory_optimize_pass") {
-      if (graph->Has(kAllOpDescs)) {
-        graph->Erase(kAllOpDescs);
-      }
-      const std::vector<OpDesc *> *all_op_descs =
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
-      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
-                                              all_op_descs);  // take ownership
-      pass->Erase(kAllOpDescs);
-      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
-      pass->Erase(kAllOpDescs);
-      pass->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "all_reduce_deps_pass") {
       LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
                 << ", num_trainers:" << num_trainers_;
-      pass->Erase(kAllOpDescs);
-      pass->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
-    } else if (pass->Type() == "inplace_pass") {
-      if (graph->Has(kAllOpDescs)) {
-        graph->Erase(kAllOpDescs);
-      }
-      graph->Set<const std::vector<OpDesc *>>(
-          kAllOpDescs,
-          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
       if (!use_cuda) {
         LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "

@@ -115,7 +115,7 @@ struct BuildStrategy {
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
-  std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
+  std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
                                    const std::vector<platform::Place> &places,
                                    const std::string &loss_var_name,
                                    const std::vector<Scope *> &local_scopes,

@@ -24,12 +24,11 @@ namespace details {
 FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> &&graph)
+    const std::vector<platform::Place> &places, ir::Graph *graph)
     : strategy_(strategy),
       local_scopes_(local_scopes),
       places_(places),
-      graph_(std::move(graph)),
+      graph_(graph),
       pool_(strategy.num_threads_),
       prepare_pool_(1),  // add one more thread for generate op_deps
       fetch_ctxs_(places) {
@@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
         }
       }
       if (exception_.IsCaught()) {
-        ClearFetchOp(graph_.get(), &fetch_ops);
+        ClearFetchOp(graph_, &fetch_ops);
         exception_.ReThrow();
       }
     }
     num_complete += num_comp;
   }
   // Wait FetchOps.
-  ClearFetchOp(graph_.get(), &fetch_ops);
+  ClearFetchOp(graph_, &fetch_ops);
   return fetches;
 }

@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                                const std::vector<Scope *> &local_scopes,
                                const std::vector<platform::Place> &places,
-                               std::unique_ptr<ir::Graph> &&graph);
+                               ir::Graph *graph);
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
   const ir::Graph &Graph() const override;
@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-  std::unique_ptr<ir::Graph> graph_;
+  ir::Graph *graph_;
   std::unordered_map<OpHandleBase *, int> op_deps_;
   std::vector<OpHandleBase *> bootstrap_ops_;

@@ -33,10 +33,10 @@ namespace details {
 using paddle::framework::VarDesc;
 
 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
-  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
-                 "Graph has no attribute of kAllOpDescs.");
+  PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs),
+                 "Graph has no attribute of kStaleProgramOpDescs.");
   // 1. get op desc order
-  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
 
   // 2. topology sort order
   auto nodes = graph.Nodes();
@@ -461,11 +461,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
       }
     }
   }
+
+  for (auto* op : ops_) {
+    unlived_vars_[op] = std::set<std::string>();
+    for (auto& var : this->LiveIn(op)) {
+      if (!this->LiveOut(op).count(var)) {
+        unlived_vars_[op].insert(var);
+      }
+    }
+  }
 }
 
 void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
                                            const std::string& new_node,
                                            int begin_idx) {
+  std::vector<bool> need_update(ops_.size(), false);
   // update graph from begin idx to the end
   for (size_t i = begin_idx; i != ops_.size(); ++i) {
     auto* op = ops_[i];
@@ -480,15 +490,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
     if (live_in_[op].find(old_node) != live_in_[op].end()) {
       live_in_[op].erase(old_node);
       live_in_[op].insert(new_node);
+      need_update[i] = true;
     }
     if (live_out_[op].find(old_node) != live_out_[op].end()) {
       live_out_[op].erase(old_node);
       live_out_[op].insert(new_node);
+      need_update[i] = true;
+    }
+  }
+
+  for (size_t i = begin_idx; i < ops_.size(); ++i) {
+    if (!need_update[i]) continue;
+    auto* op = ops_[i];
+    for (auto& var : this->LiveIn(op)) {
+      if (!this->LiveOut(op).count(var)) {
+        unlived_vars_[op].insert(var);
+      }
     }
   }
 }
 
-const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::LiveIn(ir::Node* op) const {
   auto it = live_in_.find(op);
   PADDLE_ENFORCE(
       it != live_in_.end(),
@@ -496,7 +518,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
   return it->second;
 }
 
-const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::LiveOut(ir::Node* op) const {
   auto it = live_out_.find(op);
   PADDLE_ENFORCE(
       it != live_out_.end(),
@@ -504,15 +526,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
   return it->second;
 }
 
-const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+const std::set<std::string>& ControlFlowGraph::Use(ir::Node* op) const {
   auto it = uses_.find(op);
   PADDLE_ENFORCE(
       it != uses_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+      string::Sprintf("Expect %s in use, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string>& ControlFlowGraph::Unlived(ir::Node* op) const {
+  auto it = unlived_vars_.find(op);
+  PADDLE_ENFORCE(
+      it != unlived_vars_.end(),
+      string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name()));
   return it->second;
 }
 
-const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+const std::vector<ir::Node*>& ControlFlowGraph::Ops() const { return ops_; }
 
 std::vector<ir::Node*>& Ops() { return ops_; }

@@ -92,10 +92,11 @@ class ControlFlowGraph {
   void RenameVarInCFGGraph(const std::string& old_node,
                            const std::string& new_node, int begin_idx);
 
-  const std::set<std::string> LiveIn(ir::Node* op) const;
-  const std::set<std::string> LiveOut(ir::Node* op) const;
-  const std::set<std::string> Use(ir::Node* op) const;
-  const std::vector<ir::Node*> Ops() const;
+  const std::set<std::string>& LiveIn(ir::Node* op) const;
+  const std::set<std::string>& LiveOut(ir::Node* op) const;
+  const std::set<std::string>& Use(ir::Node* op) const;
+  const std::set<std::string>& Unlived(ir::Node* op) const;
+  const std::vector<ir::Node*>& Ops() const;
   std::vector<ir::Node*>& Ops();
 
   // for ssa-graph nodes
@@ -117,6 +118,7 @@ class ControlFlowGraph {
   VarSetMap live_out_;
   VarSetMap uses_;  // op inputs
   VarSetMap defs_;  // op outputs
+  std::unordered_map<ir::Node*, std::set<std::string>> unlived_vars_;
   std::vector<ir::Node*> ops_;  // op sequence by topology sort
 };

@@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
   // prepare ir graph
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
 
   ControlFlowGraph cfg(graph);
   cfg.LiveVariableAnalysis();
@@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
 TEST(SortOpLikeDescOrder, NormalTest) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
 
   auto nodes = SortOpLikeDescOrder(graph);
   auto op_descs = prog.Block(0).AllOps();
@@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
 TEST(SortOpLikeDescOrder, RemoveOpDesc) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
   auto nodes = graph.Nodes();
   auto op_descs = prog.Block(0).AllOps();
   ir::Node* found_node = nullptr;
@@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
 // 3. add some op_desc
 TEST(SortOpLikeDescOrder, AddOpDesc) {
   auto prog = FillProgramDesc();
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
   ir::Graph graph(prog);
 
   auto find_node_in_graph = [&](std::string s) {
@@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
   // cached desc different with real one
   // mimic the intermidiete pass modify the programdesc.
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto op_descs = prog.Block(0).AllOps();
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
 
   auto op = prog.MutableBlock(0)->AppendOp();
   prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
@@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
 TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
 
   auto find_node_in_graph = [&](std::string s) {
     ir::Node* ret = nullptr;
@@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
     return ret;
   };
 
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
+
   // remove sum node
-  auto op_descs = prog.Block(0).AllOps();
   ir::Node* found_node = nullptr;
   auto nodes = graph.Nodes();
   for (auto node : nodes) {
@@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
 TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
   auto prog = FillProgramDesc();
   ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
 
   auto find_node_in_graph = [&](std::string s) {
     ir::Node* ret = nullptr;
@@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
     return ret;
   };
 
-  auto op_descs = prog.Block(0).AllOps();
   // add node
   auto op = prog.MutableBlock(0)->AppendOp();
   prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);

@@ -118,8 +118,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       }
     }
     // fill the pool
-    for (auto var : cfg_->LiveIn(op)) {
-      if (cfg_->LiveOut(op).count(var) == 0) {
+    for (auto& var : cfg_->Unlived(op)) {
       ir::Node* var_node = cfg_->GetNodeByName(var, op);
       if (var_node == nullptr || var_node->IsCtrlVar()) continue;
       if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
@@ -127,7 +126,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       }
       }
     }
-    }
   }
   graph->ResolveHazard(var_nodes_);
 
   return graph;
@@ -337,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
 REGISTER_PASS(memory_optimize_pass,
               paddle::framework::details::MemoryOptimizePass)
-    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);

@@ -969,9 +969,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
 }
 
 void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
-  if (need_broadcast_var_ ||
-      (UseGPU() &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) {
+  // broad cast received parameters when training in parameter server mode.
+  if (need_broadcast_var_) {
+    // There are 4 conditions:
+    // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
+    // Need to broadcast received parameters to other GPU.
+    // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
+    // broadcast received parameters to other GPU.
+    // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
+    // broadcast received parameters to other scope.
+    // 4. CPU && Reduce: because all parameters share the same memory, did not
+    // broadcast received parameters.
+    if (!UseGPU() &&
+        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
+      return;
+    }
     if (strategy_.fuse_broadcast_op_) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
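
Note: whether received parameters are broadcast follows the four GPU/CPU and Reduce/AllReduce conditions listed in the comment above; the mode itself is chosen by the user through `BuildStrategy`. A hedged sketch of that user-side selection (the loss name is hypothetical, not part of this commit):

```python
import paddle.fluid as fluid

# Pick the gradient aggregation strategy; kReduce vs. kAllReduce in the C++
# code corresponds to ReduceStrategy.Reduce vs. ReduceStrategy.AllReduce here.
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

compiled = fluid.CompiledProgram(fluid.default_main_program()) \
    .with_data_parallel(loss_name='loss', build_strategy=build_strategy)
```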

@@ -20,8 +20,7 @@ namespace framework {
 namespace details {
 
 std::vector<std::unique_ptr<ir::Graph>>
-ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
-    std::unique_ptr<ir::Graph> &&graph) {
+ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
   std::vector<std::unique_ptr<ir::Graph>> graphs;
   graphs.reserve(places_.size());
   for (size_t i = 0; i < places_.size(); ++i) {
@@ -77,24 +76,18 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
 ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
+    const std::vector<platform::Place> &places, ir::Graph *graph)
     : strategy_(std::move(strategy)),
       local_scopes_(std::move(local_scopes)),
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
       places_(std::move(places)),
-      main_prog_(main_prog),
       // TODO(Yancey1989): Copying graphs is not safely since it deleted the
       // attrs.
-      graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
+      graphs_(SeparateMultiDevicesGraph(graph)) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 
   auto seq_allreduce_pass =
       ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
-  seq_allreduce_pass->Erase(details::kAllOpDescs);
-  seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
-      details::kAllOpDescs,
-      new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
   for (size_t i = 0; i < graphs_.size(); ++i) {
     graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
   }
@@ -107,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
              << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
     executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
-        strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
+        strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
   }
 }

@@ -31,8 +31,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           const framework::ProgramDesc &main_prog,
-                           std::unique_ptr<ir::Graph> &&graph);
+                           ir::Graph *graph);
   ~ParallelSSAGraphExecutor() final = default;
   const ir::Graph &Graph() const override { return *graphs_[0]; }
@@ -41,13 +40,12 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
  private:
   std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
-      std::unique_ptr<ir::Graph> &&graph);
+      ir::Graph *graph);
 
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
   std::vector<platform::Place> places_;
-  framework::ProgramDesc main_prog_;
   std::vector<std::unique_ptr<ir::Graph>> graphs_;
 
   std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;

@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
   static std::unordered_set<std::string> skip_dist_ops{
       "send", "recv", "send_barrier", "fetch_barrier"};
 
-  auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  auto &ops = graph->Get<const std::vector<OpDesc *>>(kStaleProgramOpDescs);
   std::vector<ir::Node *> op_node_list;
   op_node_list.reserve(ops.size());
@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
 REGISTER_PASS(sequential_execution_pass,
               paddle::framework::details::SequentialExecutionPass)
-    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
+    .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);

@@ -23,9 +23,8 @@ namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<ir::Graph> &&graph)
-    : graph_(std::move(graph)),
+    const std::vector<platform::Place> &places, ir::Graph *graph)
+    : graph_(graph),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
       local_scopes_(local_scopes),
@@ -123,7 +122,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
       for (auto &run_op_future : run_op_futures_) {
         run_op_future.wait();
       }
-      ClearFetchOp(graph_.get(), &fetch_ops);
+      ClearFetchOp(graph_, &fetch_ops);
       exception_holder_.ReThrow();
     } else {
       continue;
@@ -148,7 +147,7 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
   }
   PADDLE_ENFORCE(ready_ops.empty());
   // Wait FetchOps.
-  ClearFetchOp(graph_.get(), &fetch_ops);
+  ClearFetchOp(graph_, &fetch_ops);
   return fetch_data;
 }

@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<ir::Graph> &&graph);
+                           ir::Graph *graph);
 
   const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
@@ -56,7 +56,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
                     details::OpHandleBase *op);
 
  private:
-  std::unique_ptr<ir::Graph> graph_;
+  ir::Graph *graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;

@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 if (WITH_MKLDNN)
   cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+  cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
   cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
   cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()

@@ -22,7 +22,8 @@ namespace ir {
 
 class AttentionLSTMFusePass : public FusePassBase {
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir

Some files were not shown because too many files have changed in this diff.