Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/python_doc

8 years ago · 59d75bda74
parent df681fd4e0 50104f18c7
commit 59d75bda74
65 changed files with 1421 additions and 425 deletions
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@ -29,9 +29,11 @@ Currently supported `--model` argument include:
    You can choose to use GPU/CPU training. With GPU training, you can specify
    `--gpus <gpu_num>` to run multi GPU training.
 * Run distributed training with parameter servers:
    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
    * start parameter servers:
        ```bash
        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
        sleep 15
        ```
    * start trainers:
        ```bash
--- a/benchmark/fluid/run_fluid_benchmark.sh
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@ -0,0 +1,9 @@
 #!/bin/bash
 PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
 sleep 15
 CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
 CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto glog lod_rank_table feed_fetch_method)
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@ -7,6 +7,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
@ -28,6 +29,9 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
 cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase {
  void RunImpl() override;
 private:
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
 #endif
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@ -14,6 +14,8 @@
 #pragma once
 #include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@ -29,6 +31,8 @@ struct BuildStrategy {
  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
  std::string debug_graphviz_path_{""};
 };
 }  // namespace details
--- a/paddle/fluid/framework/details/graph_builder_factory.cc
+++ b/paddle/fluid/framework/details/graph_builder_factory.cc
@ -0,0 +1,47 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/graph_builder_factory.h"
 #include <fstream>
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/ssa_graph_printer.h"
 namespace paddle {
 namespace framework {
 namespace details {
 std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
  std::unique_ptr<SSAGraphBuilder> res(
 #ifdef PADDLE_WITH_CUDA
      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
                                  local_scopes_, nccl_ctxs_, strategy_)
 #else
      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
                                  local_scopes_, strategy_)
 #endif
          );  // NOLINT
  if (!strategy_.debug_graphviz_path_.empty()) {
    std::unique_ptr<std::ostream> fout(
        new std::ofstream(strategy_.debug_graphviz_path_));
    PADDLE_ENFORCE(fout->good());
    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
        new GraphvizSSAGraphPrinter());
    res.reset(new SSAGraghBuilderWithPrinter(
        std::move(fout), std::move(graphviz_printer), std::move(res)));
  }
  return res;
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/graph_builder_factory.h
+++ b/paddle/fluid/framework/details/graph_builder_factory.h
@ -0,0 +1,67 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 namespace paddle {
 namespace framework {
 class Scope;
 namespace details {
 class SSAGraphBuilderFactory {
 public:
  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
                         const std::string& loss_var_name,
                         const std::unordered_set<std::string>& param_names,
                         const std::vector<Scope*>& local_scopes,
                         const BuildStrategy& strategy)
      : places_(places),
        loss_var_name_(loss_var_name),
        param_names_(param_names),
        local_scopes_(local_scopes),
        strategy_(strategy) {}
 #ifdef PADDLE_WITH_CUDA
  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
    nccl_ctxs_ = nccl_ctxs;
  }
 #endif
  std::unique_ptr<SSAGraphBuilder> Create();
 private:
  std::vector<platform::Place> places_;
  std::string loss_var_name_;
  std::unordered_set<std::string> param_names_;
  std::vector<Scope*> local_scopes_;
  BuildStrategy strategy_;
 #ifdef PADDLE_WITH_CUDA
  platform::NCCLContextMap* nccl_ctxs_;
 #endif
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@ -30,10 +30,6 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
 DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
              "the ssa graph path only print with GLOG_v=10,"
              "default /tmp/graph.dot");
 namespace paddle {
 namespace framework {
 namespace details {
@ -277,11 +273,6 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   */
  AddOutputToLeafOps(&result);
  if (VLOG_IS_ON(10)) {
    std::ofstream fout(FLAGS_ssa_graph_path);
    PrintGraphviz(*graph, fout);
  }
  return std::unique_ptr<SSAGraph>(graph);
 }
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@ -41,8 +41,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
  void RunImpl() override;
 private:
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
  const platform::NCCLContextMap &nccl_ctxs_;
 };
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@ -32,8 +32,8 @@ namespace framework {
 namespace details {
 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
+  std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
  op_handle->AddOutput(var);
 }
 template <typename Callback>
 void IterAllVar(const SSAGraph &graph, Callback callback) {
  for (auto &each : graph.vars_) {
    for (auto &pair1 : each) {
      for (auto &pair2 : pair1.second) {
        callback(*pair2);
      }
    }
  }
  for (auto &var : graph.dep_vars_) {
    callback(*var);
  }
 }
 void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
  size_t var_id = 0;
  std::unordered_map<const VarHandleBase *, size_t> vars;
  sout << "digraph G {\n";
  IterAllVar(graph, [&](const VarHandleBase &var) {
    auto *var_ptr = &var;
    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
    size_t cur_var_id = var_id++;
    vars[var_ptr] = cur_var_id;
    if (var_handle_ptr) {
      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
           << "\\n"
           << var_handle_ptr->place_ << "\\n"
           << var_handle_ptr->version_ << "\"]" << std::endl;
    } else if (dummy_ptr) {
      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
    }
  });
  size_t op_id = 0;
  for (auto &op : graph.ops_) {
    std::string op_name = "op_" + std::to_string(op_id++);
    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
         << std::endl;
    for (auto in : op->Inputs()) {
      std::string var_name = "var_" + std::to_string(vars[in]);
      sout << var_name << " -> " << op_name << std::endl;
    }
    for (auto out : op->Outputs()) {
      std::string var_name = "var_" + std::to_string(vars[out]);
      sout << op_name << " -> " << var_name << std::endl;
    }
  }
  sout << "}\n";
 }
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
  for (auto &op : graph->ops_) {
    if (!op->Outputs().empty()) {
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@ -55,8 +55,6 @@ class SSAGraphBuilder {
                             const platform::Place &place, size_t place_offset);
  static void AddOutputToLeafOps(SSAGraph *graph);
  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@ -0,0 +1,83 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include <string>
 #include "paddle/fluid/framework/details/ssa_graph.h"
 namespace paddle {
 namespace framework {
 namespace details {
 template <typename Callback>
 static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
  for (auto &each : graph.vars_) {
    for (auto &pair1 : each) {
      for (auto &pair2 : pair1.second) {
        callback(*pair2);
      }
    }
  }
  for (auto &var : graph.dep_vars_) {
    callback(*var);
  }
 }
 void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
                                    std::ostream &sout) const {
  size_t var_id = 0;
  std::unordered_map<const VarHandleBase *, size_t> vars;
  sout << "digraph G {\n";
  IterAllVar(graph, [&](const VarHandleBase &var) {
    auto *var_ptr = &var;
    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
    size_t cur_var_id = var_id++;
    vars[var_ptr] = cur_var_id;
    if (var_handle_ptr) {
      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
           << "\\n"
           << var_handle_ptr->place_ << "\\n"
           << var_handle_ptr->version_ << "\"]" << std::endl;
    } else if (dummy_ptr) {
      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
    }
  });
  size_t op_id = 0;
  for (auto &op : graph.ops_) {
    std::string op_name = "op_" + std::to_string(op_id++);
    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
         << std::endl;
    for (auto in : op->Inputs()) {
      std::string var_name = "var_" + std::to_string(vars[in]);
      sout << var_name << " -> " << op_name << std::endl;
    }
    for (auto out : op->Outputs()) {
      std::string var_name = "var_" + std::to_string(vars[out]);
      sout << op_name << " -> " << var_name << std::endl;
    }
  }
  sout << "}\n";
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@ -0,0 +1,67 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <iosfwd>
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 namespace paddle {
 namespace framework {
 namespace details {
 class SSAGraph;
 class SSAGraphPrinter {
 public:
  virtual ~SSAGraphPrinter() {}
  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
 };
 class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
 public:
  void Print(const SSAGraph& graph, std::ostream& sout) const override;
 };
 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
 public:
  SSAGraghBuilderWithPrinter(std::ostream& sout,
                             std::unique_ptr<SSAGraphPrinter>&& printer,
                             std::unique_ptr<SSAGraphBuilder>&& builder)
      : printer_(std::move(printer)),
        builder_(std::move(builder)),
        stream_ref_(sout) {}
  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
                             std::unique_ptr<SSAGraphPrinter>&& printer,
                             std::unique_ptr<SSAGraphBuilder>&& builder)
      : printer_(std::move(printer)),
        builder_(std::move(builder)),
        stream_ptr_(std::move(sout)),
        stream_ref_(*stream_ptr_) {}
  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
    auto graph = builder_->Build(program);
    printer_->Print(*graph, stream_ref_);
    return graph;
  }
 private:
  std::unique_ptr<SSAGraphPrinter> printer_;
  std::unique_ptr<SSAGraphBuilder> builder_;
  std::unique_ptr<std::ostream> stream_ptr_;
  std::ostream& stream_ref_;
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@ -104,20 +104,17 @@ ParallelExecutor::ParallelExecutor(
  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
  // ncclOp
-#ifdef PADDLE_WITH_CUDA
+
-  details::MultiDevSSAGraphBuilder builder(
+  details::SSAGraphBuilderFactory builder_factory(
      member_->places_, loss_var_name, params, member_->local_scopes_,
      member_->nccl_ctxs_.get(), build_strategy);
 #else
  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
                                           params, member_->local_scopes_,
      build_strategy);
 #ifdef PADDLE_WITH_CUDA
  builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
 #endif
  auto graph = builder.Build(main_program);
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+      exec_strategy, member_->local_scopes_, places,
      builder_factory.Create()->Build(main_program)));
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@ -34,13 +34,7 @@ DEFINE_bool(
 namespace paddle {
 namespace framework {
-Scope::~Scope() {
+Scope::~Scope() { DropKids(); }
  DropKids();
  for (auto& kv : vars_) {
    VLOG(3) << "Destroy variable " << kv.first;
    delete kv.second;
  }
 }
 Scope& Scope::NewScope() const {
  std::unique_lock<std::mutex> lock(mutex_);
@ -49,10 +43,13 @@ Scope& Scope::NewScope() const {
 }
 Variable* Scope::Var(const std::string& name) {
  // acquire the lock when new var under this scope
  std::unique_lock<std::mutex> lock(mutex_);
  auto* v = FindVarLocally(name);
  if (v != nullptr) return v;
  v = new Variable();
-  vars_[name] = v;
+  vars_[name].reset(v);
  VLOG(3) << "Create variable " << name;
  v->name_ = &(vars_.find(name)->first);
  return v;
@ -67,22 +64,29 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
  // acquire the lock when find var
  std::unique_lock<std::mutex> lock(mutex_);
  return FindVarInternal(name);
 }
 Variable* Scope::FindVarInternal(const std::string& name) const {
  auto var = FindVarLocally(name);
  if (var != nullptr) {
    return var;
  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
 }
 const Scope* Scope::FindScope(const Variable* var) const {
  for (auto& kv : vars_) {
-    if (kv.second == var) {
+    if (kv.second.get() == var) {
      return this;
    }
  }
  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
 }
 void Scope::DropKids() {
  std::unique_lock<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }
@ -110,10 +114,10 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
  std::unique_lock<std::mutex> lock(mutex_);
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
      delete it->second;
      it = vars_.erase(it);
    } else {
      ++it;
@ -129,7 +133,7 @@ void Scope::Rename(const std::string& origin_name,
  auto new_it = vars_.find(new_name);
  PADDLE_ENFORCE(new_it == vars_.end(),
                 "The variable with name %s is already in the scope", new_name);
-  vars_[new_name] = origin_it->second;
+  vars_[new_name].reset(origin_it->second.release());
  vars_.erase(origin_it);
 }
@ -141,7 +145,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
 Variable* Scope::FindVarLocally(const std::string& name) const {
  auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  if (it != vars_.end()) return it->second.get();
  return nullptr;
 }
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@ -47,15 +47,18 @@ class Scope {
  Scope& NewScope() const;
  /// Create a variable with given name if it doesn't exist.
  /// Caller doesn't own the returned Variable.
  Variable* Var(const std::string& name);
  /// Create a variable with a scope-unique name.
  /// Caller doesn't own the returned Variable.
  Variable* Var(std::string* name = nullptr);
  void EraseVars(const std::vector<std::string>& var_names);
  /// Find a variable in the scope or any of its ancestors.  Returns
  /// nullptr if cannot find.
  /// Caller doesn't own the returned Variable.
  Variable* FindVar(const std::string& name) const;
  const Scope* parent() const { return parent_; }
@ -78,13 +81,21 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;
  Variable* FindVarLocally(const std::string& name) const;
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}
-  mutable std::unordered_map<std::string, Variable*> vars_;
+  // Called by FindVar recursively.
  // Caller doesn't own the returned Variable.
  Variable* FindVarInternal(const std::string& name) const;
  // Called by FindVarInternal and Var.
  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;
  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 namespace paddle {
-namespace framework {}
+namespace framework {
 extern size_t SizeOfType(std::type_index type);
 void Tensor::check_memory_size() const {
  PADDLE_ENFORCE_NOT_NULL(
      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
  PADDLE_ENFORCE_LE(
      numel() * SizeOfType(type()), memory_size(),
      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
      "first to re-allocate memory.\n"
      "or maybe the required data-type mismatches the data already stored.");
 }
 size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
 void* Tensor::mutable_data(platform::Place place, std::type_index type) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
  PADDLE_ENFORCE_GE(numel(), 0,
                    "When calling this method, the Tensor's numel must be "
                    "equal or larger than zero. "
                    "Please check Tensor::Resize has been called first.");
  int64_t size = numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size, type));
    } else if (platform::is_gpu_place(place) ||
               platform::is_cuda_pinned_place(place)) {
 #ifndef PADDLE_WITH_CUDA
      PADDLE_THROW(
          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
    }
 #else
      if (platform::is_gpu_place(place)) {
        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
            boost::get<platform::CUDAPlace>(place), size, type));
      } else if (platform::is_cuda_pinned_place(place)) {
        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
            boost::get<platform::CUDAPinnedPlace>(place), size, type));
      }
    }
 #endif
    offset_ = 0;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
 }
 void* Tensor::mutable_data(platform::Place place) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
  return mutable_data(place, holder_->type());
 }
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
  src.check_memory_size();
  *this = src;
  return *this;
 }
 Tensor Tensor::Slice(int begin_idx, int end_idx) const {
  check_memory_size();
  PADDLE_ENFORCE_GE(begin_idx, 0,
                    "The start row index must be greater than 0.");
  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
  PADDLE_ENFORCE_LT(
      begin_idx, end_idx,
      "The start row index must be lesser than the end row index.");
  if (dims_[0] == 1) {
    return *this;
  } else {
    size_t base = numel() / dims_[0];
    Tensor dst;
    dst.holder_ = holder_;
    dst.set_layout(layout_);
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
    dst.Resize(dst_dims);
    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
    return dst;
  }
 }
 Tensor& Tensor::Resize(const DDim& dims) {
  dims_ = dims;
  return *this;
 }
 const DDim& Tensor::dims() const { return dims_; }
 int64_t Tensor::numel() const { return product(dims_); }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@ -54,26 +54,24 @@ class Tensor {
  /*! Return a pointer to mutable memory block. */
  template <typename T>
-  inline T* data();
+  T* data();
  /*! Return a pointer to constant memory block. */
  template <typename T>
-  inline const T* data() const;
+  const T* data() const;
-  inline bool IsInitialized() const;
+  bool IsInitialized() const;
  inline void switch_place(platform::Place new_place);
  /**
   * @brief   Return a pointer to mutable memory block.
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place);
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type);
-  inline void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place);
  /**
   * @brief     Return a pointer to mutable memory block.
@ -84,19 +82,19 @@ class Tensor {
   * @note      If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place);
  /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
+  const DDim& dims() const;
  /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;
  /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);
  /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);
  /**
   * @brief  Return a sub-tensor of the given tensor.
@ -106,7 +104,7 @@ class Tensor {
   * @param[in] end_idx     The index of the end row(exclusive) to slice.
   *                        The index number begins from 0.
   */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;
  platform::Place place() const {
    PADDLE_ENFORCE_NOT_NULL(
@ -123,11 +121,11 @@ class Tensor {
  // memory size returns the holding memory size in byte.
  size_t memory_size() const;
-  inline void check_memory_size() const;
+  void check_memory_size() const;
-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }
 private:
  /**
@ -210,15 +208,6 @@ class Tensor {
  size_t offset_;
 };
 inline void Tensor::switch_place(platform::Place new_place) {
  if (holder_->place() == new_place) {
    return;
  }
  // TODO(tonyyang-svail): do memcpy here.
  PADDLE_THROW("Not Implemented");
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@ -20,21 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 extern size_t SizeOfType(std::type_index type);
 inline void Tensor::check_memory_size() const {
  PADDLE_ENFORCE_NOT_NULL(
      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
  PADDLE_ENFORCE_LE(
      numel() * SizeOfType(type()), memory_size(),
      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
      "first to re-allocate memory.\n"
      "or maybe the required data-type mismatches the data already stored.");
 }
 inline size_t Tensor::memory_size() const {
  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }
 template <typename T>
 inline const T* Tensor::data() const {
  check_memory_size();
@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
  PADDLE_ENFORCE_GE(numel(), 0,
                    "When calling this method, the Tensor's numel must be "
                    "equal or larger than zero. "
                    "Please check Tensor::Resize has been called first.");
  int64_t size = numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size, type));
    } else if (platform::is_gpu_place(place) ||
               platform::is_cuda_pinned_place(place)) {
 #ifndef PADDLE_WITH_CUDA
      PADDLE_THROW(
          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
    }
 #else
      if (platform::is_gpu_place(place)) {
        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
            boost::get<platform::CUDAPlace>(place), size, type));
      } else if (platform::is_cuda_pinned_place(place)) {
        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
            boost::get<platform::CUDAPinnedPlace>(place), size, type));
      }
    }
 #endif
    offset_ = 0;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
 }
 inline void* Tensor::mutable_data(platform::Place place) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
  return mutable_data(place, holder_->type());
 }
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
  src.check_memory_size();
  *this = src;
  return *this;
 }
 inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
  check_memory_size();
  PADDLE_ENFORCE_GE(begin_idx, 0,
                    "The start row index must be greater than 0.");
  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
  PADDLE_ENFORCE_LT(
      begin_idx, end_idx,
      "The start row index must be lesser than the end row index.");
  if (dims_[0] == 1) {
    return *this;
  } else {
    size_t base = numel() / dims_[0];
    Tensor dst;
    dst.holder_ = holder_;
    dst.set_layout(layout_);
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
    dst.Resize(dst_dims);
    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
    return dst;
  }
 }
 inline Tensor& Tensor::Resize(const DDim& dims) {
  dims_ = dims;
  return *this;
 }
 inline const DDim& Tensor::dims() const { return dims_; }
 inline int64_t Tensor::numel() const { return product(dims_); }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
  Tensor res;
  res.ShareDataWith(src);
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@ -18,6 +18,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@ -107,6 +109,13 @@ class OrderedRegistry {
  std::vector<std::unique_ptr<T>> data_;
 };
 template <typename T>
 T &GetFromScope(const framework::Scope &scope, const std::string &name) {
  framework::Variable *var = scope.FindVar(name);
  PADDLE_ENFORCE(var != nullptr);
  return *var->GetMutable<T>();
 }
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@ -1,10 +1,16 @@
 # Add TRT tests
 nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} tensorrt_engine)
 # This test is not stable
 # See https://paddleci.ngrok.io/viewLog.html?tab=buildLog&buildTypeId=Paddle_PrCi2&buildId=36834&_focus=8828 
 #nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
 #    DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
 #    SERIAL)
 nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc
  DEPS tensorrt_engine mul_op)
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 namespace paddle {
@ -36,8 +37,8 @@ class ReluOpConverter : public OpConverter {
  }
 };
 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@ -22,14 +22,14 @@ class Conv2dOpConverter : public OpConverter {
 public:
  Conv2dOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope) override {
+                  const framework::Scope& scope, bool test_mode) override {
    LOG(INFO)
        << "convert a fluid conv2d op to tensorrt conv layer without bias";
  }
 };
 REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
 REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
--- a/Show More
+++ b/Show More