Refactor dygraph (#19107)

* refactor dygraph,test=develop

* fix failed unittest,test=develop

* polish code,test=develop

* check windows ci error,test=develop
try to fix windows ci error by np.allclose,test=develop

* polish vlog and profiler, test=develop

* try to fix preceding ops order,test=develop

* test transformer in windows ci, test=develop

* use python c-api to speed up tracer.trace,test=develop

* test=develop, fix Docker problem with Paddle NCCL

* test=develop, add ut for debug string and gradient_accumulator

* test=develop, add tests for layer/gradient_accumulator/prepared_op

* test=develop, fix compile error for test_prepared_op

* test=develop, add more ut for dygraph

* test=develop, create API.spec for dygraph api change

* test=develop, refactor name to make it easier to understand

* test=develop, refactor name to make it easier to understand

* test=develop, fix multi-GPU failure, add Tracer tests, change PADDLE_ENFORCE to PADDLE_ENFORCE_EQ

* test=develop, fix UT failure on parallel se-resnext

* test=develop, change one more PADDLE_ENFORCE
sigmoid_bug
Jiabin Yang authored 6 years ago, committed by XiaoguangHu
parent dca9b6c5b0
commit e9233d1c1e

@@ -820,11 +820,11 @@ paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', '
paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da'))
paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self', 'block'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None
paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'op', 'inputs', 'outputs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None
paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'type', 'inputs', 'outputs', 'attrs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))

@@ -1,10 +1,11 @@
cc_library(imperative_flag SRCS flags.cc DEPS gflags)
if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
cc_library(engine SRCS engine.cc)
cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits)
cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer)
cc_library(tracer SRCS tracer.cc DEPS layer engine)
cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator)
cc_library(imperative_profiler SRCS profiler.cc)
cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
endif()
add_subdirectory(tests)

@@ -16,17 +16,12 @@
// Created by Jiabin on 2019-04-25.
//
#pragma once
#ifndef PADDLE_BACKWARDSTRATEGY_H
#define PADDLE_BACKWARDSTRATEGY_H
#endif // PADDLE_BACKWARDSTRATEGY_H
namespace paddle {
namespace imperative {
namespace detail {
class BackwardStrategy {
public:
struct BackwardStrategy {
/* DyGraph now supports two kinds of backward strategy: one sums gradients
* in sorted order, the other sums each gradient as soon as it is created */
// TODO(jiabin): add more Strategy when we support
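Note: the flag behind the comment above is sorted_sum_gradient_ (set inside this struct in the final header). A minimal usage sketch follows; the engine wiring is illustrative, not part of this hunk:

#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/engine.h"

// Hedged sketch: choosing a backward strategy before running autograd.
void RunBackwardSketch(paddle::imperative::VarBase* loss) {
  paddle::imperative::detail::BackwardStrategy strategy;
  // true: buffer grads and sum them sorted by trace id (deterministic);
  // false: sum each grad eagerly as soon as it is produced.
  strategy.sorted_sum_gradient_ = true;
  paddle::imperative::BasicEngine engine;
  engine.Init(loss, strategy);
  engine.Execute();
}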

@@ -14,40 +14,219 @@
#include "paddle/fluid/imperative/engine.h"
#include <mutex> // NOLINT
#include <algorithm>
#include <memory>
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
static std::once_flag init_engine;
static Engine* engine;
void Engine::RunOp(paddle::imperative::OpBase* op,
const paddle::imperative::NameVarBaseMap& ins,
const paddle::imperative::NameVarBaseMap& outs,
const paddle::platform::Place& place) {
platform::RecordEvent event(op->Type());
op->Run(ins, outs);
}
class DummyEngine : public Engine {
public:
void Enqueue(Runnable* runnable) override {
queued_runnables_.push_back(runnable);
void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) {
backward_strategy_ = strategy;
const std::vector<OpBase*> ops = var->GradVarBase()->GradOps();
var->ClearGradOps();
if (ops.empty()) {
VLOG(3) << "Skip auto grad since there is no grad op for var: "
<< var->Name();
return;
} else {
bool valid = false;
for (const auto& op : ops) {
if (op) {
valid = true;
}
}
if (!valid) {
VLOG(3) << "Skip auto grad since all grad op of start VarBase is nullptr";
return;
}
}
init_ops_ = ops;
platform::RecordEvent record_event("Imperative Backward");
VLOG(3) << "start backward";
PADDLE_ENFORCE_EQ(var->HasGradVar(), true,
"Grad variable not exist for variable %s", var->Name());
size_t Size() const override { return queued_runnables_.size(); }
auto& fwd_var = var->Var().Get<framework::LoDTensor>();
auto* grad_var =
var->GradVarBase()->MutableVar()->GetMutable<framework::LoDTensor>();
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place());
grad_var->Resize(fwd_var.dims());
grad_var->mutable_data(fwd_var.place(), fwd_var.type());
operators::math::set_constant(*dev_ctx, grad_var, 1.0);
}
void Sync() override {
for (Runnable* l : queued_runnables_) {
LOG(INFO) << "running " << reinterpret_cast<void*>(l);
bool BasicEngine::CheckBackwardInputs(OpBase* op) {
for (auto& pair : op->GetInsMap()) {
for (auto& var : pair.second) {
if (var && !var->StopGradient()) {
return true;
}
}
queued_runnables_.clear();
}
return false;
}
void BasicEngine::PrepareGradAccumulators(OpBase* op) {
for (const auto& pair : op->GetOutsMap()) {
for (const auto& var : pair.second) {
if (!var) continue;
auto& accumulator = accumulators_[var.get()];
if (!accumulator) {
if (backward_strategy_.sorted_sum_gradient_) {
accumulator.reset(new SortedGradientAccumulator(var.get()));
} else {
accumulator.reset(new EagerGradientAccumulator(var.get()));
}
}
accumulator->IncreaseRefCnt();
VLOG(3) << "Prepare to acccumulate variable grad " << var->Name()
<< "with reference count " << accumulator->RefCnt();
}
}
}
void BasicEngine::PrepareDeps() {
PADDLE_ENFORCE_EQ(op_deps_.empty(), true, "Op deps must be initialized here");
PADDLE_ENFORCE_EQ(accumulators_.empty(), true,
"Accumulators must be initialized here");
std::queue<OpBase*> q;
std::unordered_set<OpBase*> visited;
for (const auto& init_op : init_ops_) {
q.push(init_op);
visited.insert(init_op);
}
private:
std::vector<Runnable*> queued_runnables_;
};
while (!q.empty()) {
auto* cur_op = q.front();
q.pop();
VLOG(3) << "Checking grads of op " << cur_op->Type();
Engine* GetEngine() {
std::call_once(init_engine, []() { engine = new DummyEngine(); });
return engine;
if (!CheckBackwardInputs(cur_op)) {
// TODO(zjl): clear ops that do not need grad before running autograd
VLOG(3) << "Stop checking preceding ops of " << cur_op->Type()
<< " because all of its backward inputs is stop_gradient=True";
continue;
}
PrepareGradAccumulators(cur_op);
auto& preceding_ops = cur_op->GradPendingOps();
for (auto* preceding_op : preceding_ops) {
PADDLE_ENFORCE_NOT_NULL(preceding_op);
++op_deps_[preceding_op];
if (visited.count(preceding_op) == 0) {
visited.insert(preceding_op);
q.push(preceding_op);
}
}
}
}
void BasicEngine::SumGradient(OpBase* op, std::shared_ptr<VarBase> src,
VarBase* dst) {
auto iter = accumulators_.find(dst);
PADDLE_ENFORCE_EQ(iter != accumulators_.end(), true,
"Cannot find gradient of variable %s", dst->Name());
iter->second->Add(std::move(src), op->id());
}
void BasicEngine::Execute() {
PrepareDeps();
// Start executing the computation graph
std::queue<OpBase*> q;
for (const auto& init_op : init_ops_) {
q.push(init_op);
}
while (!q.empty()) {
OpBase* cur_op = q.front();
q.pop();
// Step 1: Run Backward
auto& bwd_ins = cur_op->GetInsMap();
auto& bwd_outs = cur_op->GetOutsMap();
NameVarBaseMap tmp_outs;
// A var may correspond to several grad vars in one op
std::unordered_map<VarBase*, std::vector<std::shared_ptr<VarBase>>> var_map;
size_t counter = 0;
for (auto& bwd_out : bwd_outs) {
auto& tmp_var_list = tmp_outs[bwd_out.first];
tmp_var_list.reserve(bwd_out.second.size());
for (auto& var : bwd_out.second) {
auto tmp_var = std::make_shared<VarBase>(
false, "Gtmp@" + std::to_string(counter++)); // Do not need grad
tmp_var_list.emplace_back(tmp_var);
if (var) {
var_map[var.get()].emplace_back(std::move(tmp_var));
var->ClearGradOps();
}
}
}
VLOG(3) << "Start to execute grad op " << cur_op->Type();
RunOp(cur_op, bwd_ins, tmp_outs, cur_op->place());
// Step 2: Sum Gradient
{
platform::RecordEvent record_event("merge_grads");
for (auto& var_pair : var_map) {
auto* dst_var = var_pair.first;
if (dst_var == nullptr) continue;
for (auto& src_var : var_pair.second) {
VLOG(3) << "Sum gradient of variable " << dst_var->Name()
<< " after op " << cur_op->Type();
SumGradient(cur_op, std::move(src_var), dst_var);
}
}
}
// Step 3: Collect ready ops
for (auto* preceding_op : cur_op->GradPendingOps()) {
PADDLE_ENFORCE_NOT_NULL(preceding_op);
auto iter = op_deps_.find(preceding_op);
if (iter == op_deps_.end()) {
continue;
}
VLOG(3) << "Found preceding op of " << cur_op->Type();
// An op is ready to run once its dep count reaches zero
if (--(iter->second) == 0) {
q.push(preceding_op);
VLOG(3) << "Push preceding op " << preceding_op->Type()
<< " into queue";
}
}
// Step 4: Delete op to collect unused variables
VLOG(3) << "Remove op after op " << cur_op->Type() << " runs";
RemoveOp(cur_op);
}
VLOG(3) << "Clean properties of BasicEngine";
CleanEngine();
}
} // namespace imperative
} // namespace paddle
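Note on the control flow: PrepareDeps and Execute form a two-pass, reference-counted topological traversal of the grad-op graph. The first pass counts, for every reachable op, how many successors feed it; the second pops ready ops off a queue, runs them, and decrements the counters of their preceding ops, pushing each one exactly when its count reaches zero. A self-contained sketch of the pattern (toy Node type; names are illustrative, not Paddle's):

#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> pending;  // plays the role of OpBase::GradPendingOps()
};

void Traverse(const std::vector<Node*>& init_nodes) {
  // Pass 1 (cf. PrepareDeps): BFS to count incoming edges per node.
  std::unordered_map<Node*, size_t> deps;
  std::unordered_set<Node*> visited(init_nodes.begin(), init_nodes.end());
  std::queue<Node*> q;
  for (auto* n : init_nodes) q.push(n);
  while (!q.empty()) {
    Node* cur = q.front();
    q.pop();
    for (auto* p : cur->pending) {
      ++deps[p];
      if (visited.insert(p).second) q.push(p);
    }
  }
  // Pass 2 (cf. Execute): run a node only after everything that feeds it ran.
  for (auto* n : init_nodes) q.push(n);
  while (!q.empty()) {
    Node* cur = q.front();
    q.pop();
    std::cout << "run " << cur->name << "\n";  // stands in for RunOp(...)
    for (auto* p : cur->pending) {
      if (--deps[p] == 0) q.push(p);
    }
  }
}

Because an op is pushed only once, Execute can safely free each op (RemoveOp) immediately after running it.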

@@ -16,24 +16,80 @@
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/backward_strategy.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
struct Runnable {};
// It seems there is no need for Engine to be a
// singleton; we can have multiple engines to run
// multiple graphs. For future use we may expose an
// interface to Python to support this.
class Engine {
public:
virtual ~Engine() {}
virtual ~Engine() = default;
virtual void Execute() = 0;
virtual void Init(VarBase* var, const detail::BackwardStrategy& strategy) = 0;
virtual void RunOp(imperative::OpBase* op, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, const platform::Place& place);
virtual void Enqueue(Runnable* runnable) = 0;
virtual void RemoveOp(OpBase* op) {
PADDLE_ENFORCE_NOT_NULL(op, "Cannot remove null op");
auto iter = grad_ops_.find(op);
PADDLE_ENFORCE_EQ(iter != grad_ops_.end(), true, "Op is not inside tracer");
grad_ops_.erase(iter);
}
virtual size_t Size() const = 0;
void InsertOp(OpBase* op, std::shared_ptr<OpBase> op_shared) {
grad_ops_[op] = std::move(op_shared);
}
void Clear() { grad_ops_.clear(); }
virtual void Sync() = 0;
private:
std::unordered_map<OpBase*, std::shared_ptr<OpBase>>
grad_ops_; // raw OpBase* -> owning shared_ptr of the grad op, erased by RemoveOp
};
Engine* GetEngine();
class BasicEngine : public Engine {
public:
BasicEngine() = default;
void Init(VarBase* var, const detail::BackwardStrategy& strategy) override;
~BasicEngine() override = default;
void Execute() override;
private:
void PrepareDeps();
bool CheckBackwardInputs(OpBase* op);
void PrepareGradAccumulators(OpBase* op);
void SumGradient(OpBase* op, std::shared_ptr<VarBase> src, VarBase* dst);
// TODO(jiabin): maybe we can optimize the performance of the engine by
// caching the result
void CleanEngine() {
init_ops_.clear();
op_deps_.clear();
accumulators_.clear();
Clear();
}
std::vector<OpBase*> init_ops_;
detail::BackwardStrategy backward_strategy_;
std::unordered_map<OpBase*, size_t> op_deps_;
std::unordered_map<VarBase*, std::unique_ptr<GradientAccumulator>>
accumulators_;
};
} // namespace imperative
} // namespace paddle

@@ -0,0 +1,148 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace imperative {
template <typename T>
class TensorAddFunctor : public boost::static_visitor<> {
public:
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW("Do NOT support gradient merge in place %s", place);
}
private:
int64_t numel_;
const T* x_;
T* y_;
};
void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
auto* dst_tensor = dst->GetMutable<framework::LoDTensor>();
auto& src_tensor = src.Get<framework::LoDTensor>();
auto numel = src_tensor.numel();
// FIXME(minqiyang): loss_grad op will pass a zero grad of label
// ugly fix for it
if (numel == 0) {
return;
}
PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true,
"dst_numel %d vs. src_numel %d", dst_tensor->numel(),
numel);
auto data_type = src_tensor.type();
auto place = src_tensor.place();
#define PADDLE_TENSOR_ADD_MACRO(cpp_type) \
if (data_type == framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
return; \
}
PADDLE_TENSOR_ADD_MACRO(float);
PADDLE_TENSOR_ADD_MACRO(double);
#undef PADDLE_TENSOR_ADD_MACRO
PADDLE_THROW("Not supported data type %s for AddTo",
framework::DataTypeToString(data_type));
}
void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (cur_cnt_ == 0) {
*dst_var = std::move(*(var->MutableVar()));
} else {
TensorAdd(var->Var(), dst_var);
}
++cur_cnt_;
}
void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
size_t trace_id) {
auto* dst_var = var_->MutableVar();
if (ref_cnt_ == 1) {
*dst_var = std::move(*(var->MutableVar()));
} else {
if (tmp_grad_vars_.empty()) {
tmp_grad_vars_.reserve(ref_cnt_);
}
tmp_grad_vars_.emplace_back(std::move(var), trace_id);
if (tmp_grad_vars_.size() != ref_cnt_) {
return;
}
std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(),
[](const std::pair<std::shared_ptr<VarBase>, size_t>& p1,
const std::pair<std::shared_ptr<VarBase>, size_t>& p2) {
return p1.second > p2.second;
});
*dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar()));
for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) {
TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var);
}
tmp_grad_vars_.clear();
}
}
} // namespace imperative
} // namespace paddle
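Two design points worth noting: the first Add into an empty destination moves the incoming tensor instead of summing it, avoiding a zero-fill plus AXPY; and the sorted variant buffers every partial grad until ref_cnt_ of them have arrived, then sums in descending trace-id order so the result is deterministic. A hedged usage sketch (the driver function is illustrative; the accumulator API is the one declared in gradient_accumulator.h):

#include <memory>
#include <utility>
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"

// Hedged sketch: each shared_ptr<VarBase> must already hold an initialized
// LoDTensor of matching shape before Add(), so TensorAdd can read it.
void AccumulateSketch(std::shared_ptr<paddle::imperative::VarBase> g1,
                      std::shared_ptr<paddle::imperative::VarBase> g2,
                      paddle::imperative::VarBase* dst) {
  paddle::imperative::SortedGradientAccumulator acc(dst);
  acc.IncreaseRefCnt();  // one call per grad op that writes into dst
  acc.IncreaseRefCnt();
  acc.Add(std::move(g2), /*trace_id=*/2);  // buffered; nothing summed yet
  acc.Add(std::move(g1), /*trace_id=*/1);  // last expected Add: sort by trace
                                           // id, move the first, add the rest
}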

@@ -0,0 +1,63 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
namespace paddle {
namespace imperative {
class GradientAccumulator {
public:
explicit GradientAccumulator(VarBase* var) : var_(var) {}
virtual void Add(std::shared_ptr<VarBase> var, size_t trace_id) = 0;
virtual ~GradientAccumulator() = default;
inline void IncreaseRefCnt() { ++ref_cnt_; }
inline size_t RefCnt() const { return ref_cnt_; }
protected:
VarBase* var_;
size_t ref_cnt_{0};
};
class EagerGradientAccumulator : public GradientAccumulator {
public:
using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
private:
size_t cur_cnt_{0};
};
class SortedGradientAccumulator : public GradientAccumulator {
public:
using GradientAccumulator::GradientAccumulator;
void Add(std::shared_ptr<VarBase> var, size_t trace_id) override;
private:
std::vector<std::pair<std::shared_ptr<VarBase>, size_t>> tmp_grad_vars_;
};
} // namespace imperative
} // namespace paddle

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,101 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/prepared_operator.h"
#include <sstream>
namespace paddle {
namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
if (var.IsType<framework::LoDTensor>()) {
return &(var.Get<framework::LoDTensor>());
} else if (var.IsType<framework::SelectedRows>()) {
return &(var.Get<framework::SelectedRows>().value());
} else {
return nullptr;
}
}
platform::Place PreparedOp::GetExpectedPlace(const platform::Place& place,
const NameVarBaseMap& ins) {
bool found = false;
for (auto& name_pair : ins) {
for (auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
if (tensor && tensor->IsInitialized()) {
auto tmp_place = tensor->place();
PADDLE_ENFORCE_EQ(!found || tmp_place == place, true,
"Input variable should keep in the same place: %s, "
"but get place: %s of input %s instead",
place, tmp_place, name_pair.first);
}
}
}
return place;
}
PreparedOp::PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs)
: op_(op),
ctx_(ctx),
func_(std::move(func)),
dev_ctx_(dev_ctx),
kernel_configs_(kernel_configs) {}
PreparedOp PreparedOp::Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place) {
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW(
"There are no kernels which are registered in the %s operator.",
op.Type());
}
auto& kernels = kernels_iter->second;
auto expected_kernel_key =
op.GetExpectedKernelType(framework::ExecutionContext(
op, framework::Scope(), *dev_ctx, ctx, nullptr));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", op.Type(),
KernelTypeToString(expected_kernel_key));
}
std::vector<framework::KernelConfig>* kernel_configs =
op.GetKernelConfig(expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs);
}
void PreparedOp::Run() {
// TODO(zjl): remove scope in dygraph
framework::Scope scope;
op_.RuntimeInferShape(scope, dev_ctx_->GetPlace(), ctx_);
func_(framework::ExecutionContext(op_, scope, *dev_ctx_, ctx_,
kernel_configs_));
}
} // namespace imperative
} // namespace paddle
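Note: PreparedOp splits kernel dispatch from execution. Prepare resolves the device context, the expected kernel key, and any tuned kernel configs up front, throwing if no kernel matches; Run then does runtime shape inference and invokes the kernel function. A hedged sketch of the call pattern (the op and ctx construction is assumed; test_prepare_op.cc later in this diff shows one way to build them):

#include "paddle/fluid/imperative/prepared_operator.h"

void RunPreparedSketch(const paddle::framework::OperatorWithKernel& op,
                       const paddle::framework::RuntimeContext& ctx,
                       const paddle::platform::Place& place) {
  auto prepared = paddle::imperative::PreparedOp::Prepare(ctx, op, place);
  prepared.Run();  // RuntimeInferShape + kernel call on the chosen context
}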

@@ -0,0 +1,58 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace paddle {
namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
class PreparedOp {
public:
static PreparedOp Prepare(const framework::RuntimeContext& ctx,
const framework::OperatorWithKernel& op,
const platform::Place& place);
inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx_; }
void Run();
static platform::Place GetExpectedPlace(const platform::Place& place,
const NameVarBaseMap& ins);
private:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
framework::OperatorWithKernel::OpKernelFunc func,
platform::DeviceContext* dev_ctx,
std::vector<framework::KernelConfig>* kernel_configs);
private:
const framework::OperatorBase& op_;
const framework::RuntimeContext& ctx_;
framework::OperatorWithKernel::OpKernelFunc func_;
platform::DeviceContext* dev_ctx_;
std::vector<framework::KernelConfig>* kernel_configs_;
};
} // namespace imperative
} // namespace paddle

@@ -0,0 +1,5 @@
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy)
cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op)
cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split)
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op)

@@ -0,0 +1,121 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/memory/memcpy.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
void TensorAdd(const framework::Variable& src, framework::Variable* dst);
#if defined(PADDLE_WITH_CUDA)
template <typename T>
int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0);
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size(), 0);
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
#endif
template <typename T>
int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) {
framework::Variable var1;
framework::Variable var2;
std::vector<T> src_data(10, t1);
std::vector<T> dst_data(10, t2);
std::vector<T> result;
platform::CPUPlace src_place;
for (unsigned int i = 0; i < 10; i++) {
result.emplace_back(src_data[i] + dst_data[i]);
}
std::vector<int64_t> dims = {2, 5};
auto* src = var1.GetMutable<framework::LoDTensor>();
auto* dst = var2.GetMutable<framework::LoDTensor>();
src->Resize(framework::make_ddim(dims));
dst->Resize(framework::make_ddim(dims));
auto* src_mutable = src->mutable_data<T>(place);
auto* dst_mutable = dst->mutable_data<T>(place);
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size());
imperative::TensorAdd(var1, &var2);
framework::LoDTensor rlt;
platform::CPUPlace rlt_place;
framework::TensorCopySync(*dst, rlt_place, &rlt);
for (unsigned int i = 0; i < rlt.numel(); i++) {
if (rlt.data<T>()[i] != result[i]) return 1;
}
return 0;
}
TEST(test_add_functor, add_functor) {
#if defined(PADDLE_WITH_CUDA)
platform::CUDAPlace gpu_place(0);
#endif
platform::CPUPlace cpu_place;
int cpu_res = 1;
cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0);
EXPECT_EQ(cpu_res, 0);
cpu_res = TensorCPUAddTest(cpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA)
int gpu_res = 1;
gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0);
gpu_res = TensorGPUAddTest(gpu_place, static_cast<double>(1.0),
static_cast<double>(2.0));
EXPECT_EQ(gpu_res, 0);
#endif
}
} // namespace imperative
} // namespace paddle

@@ -0,0 +1,154 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-16.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/layer.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_layer, test_runtime_context) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
var_pair in_pair = var_pair("X", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {in_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap attrs;
auto* ctx = new imperative::RuntimeInferVarTypeContext(ins, &outs, attrs);
ASSERT_TRUE(ctx->HasVar("vin"));
ASSERT_TRUE(ctx->HasInput("X"));
ASSERT_TRUE(ctx->HasOutput("Out"));
ASSERT_ANY_THROW(ctx->GetDataTypes("vin"));
std::vector<framework::proto::VarType::Type> NullType;
ASSERT_ANY_THROW(ctx->SetDataTypes("vin", NullType));
ASSERT_ANY_THROW(ctx->GetShape("vin"));
ASSERT_ANY_THROW(ctx->GetLoDLevel("vin"));
ASSERT_ANY_THROW(ctx->SetLoDLevel("vin", 2));
}
std::string LayerDebugString(const std::string& op_type,
const NameVarBaseMap& ins,
const NameVarBaseMap& outs);
TEST(test_layer, test_debug_string_test_debug_Test) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vin_error(
new imperative::VarBase(false, "vin_error"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
std::shared_ptr<imperative::VarBase> vout_error(
new imperative::VarBase(false, "vout_error"));
vin_error->MutableVar()->GetMutable<framework::LoDTensor>();
vout->MutableVar()->GetMutable<framework::LoDTensor>();
vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
var_pair in_pair = var_pair("X", vb_vector(1, vin));
vb_vector vb_in_error = {vin_error, nullptr};
var_pair vin_error_pair = var_pair("X", vb_in_error);
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
var_pair vout_error_pair = var_pair("Out2", vb_vector(1, vout_error));
imperative::NameVarBaseMap ins = {in_pair};
imperative::NameVarBaseMap ins_error = {vin_error_pair};
imperative::NameVarBaseMap outs = {out_pair};
imperative::NameVarBaseMap outs_error = {vout_error_pair};
ASSERT_NO_FATAL_FAILURE(LayerDebugString("test_op", ins, outs));
std::string res = LayerDebugString("test_op", ins, outs_error);
ASSERT_TRUE(res.find("UNRESOLVED_TYPE") != std::string::npos);
std::string res2 = LayerDebugString("test_op", ins_error, outs_error);
VLOG(3) << res2;
ASSERT_TRUE(res2.find("NOT_INITED") != std::string::npos);
ASSERT_TRUE(res2.find("NULL") != std::string::npos);
}
TEST(test_layer, test_clear_backward_info) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
framework::OpDesc desc;
platform::CPUPlace place;
var_pair x_pair = var_pair("X", vb_vector(1, vin));
var_pair y_pair = var_pair("Y", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap concat_att_map;
concat_att_map["axis"] = 1;
std::shared_ptr<imperative::OpBase> op(
OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
std::shared_ptr<imperative::OpBase> preceding_op(
OpBase::Create(0, "mul", ins, outs, concat_att_map, place));
op->InsertGradPendingOps(preceding_op.get());
*(op->GetMutableInsMap()) = ins;
*(op->GetMutableOutsMap()) = outs;
ASSERT_GT(op->GetInsMap().size(), 0);
ASSERT_GT(op->GetOutsMap().size(), 0);
ASSERT_GT(op->GradPendingOps().size(), 0);
op->ClearBackwardTrace();
ASSERT_EQ(op->GetInsMap().size(), 0);
ASSERT_EQ(op->GetOutsMap().size(), 0);
ASSERT_EQ(op->GradPendingOps().size(), 0);
}
TEST(test_layer, test_varbase_basic) {
platform::CPUPlace place;
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
place);
std::shared_ptr<imperative::VarBase> vout(vin->NewVarBase(place, false));
ASSERT_EQ(vout->Name(), "Itmp0");
std::shared_ptr<imperative::VarBase> vin_with_grad(
new imperative::VarBase(true, "vin"));
ASSERT_ANY_THROW(vin->MutableGradVar());
ASSERT_NO_THROW(ASSERT_TRUE(dynamic_cast<framework::Variable*>(
vin_with_grad->MutableGradVar()) != 0));
ASSERT_TRUE(
dynamic_cast<framework::Variable*>(vin_with_grad->MutableGradVar()) != 0);
vin_with_grad->SetStopGradient(true);
ASSERT_TRUE(vin_with_grad->StopGradient());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true));
ASSERT_TRUE(vin_with_grad->StopGradient());
ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name"));
ASSERT_EQ(vin_with_grad->Name(), "new_name");
}
// TODO(jiabin): Add more ut here for layer
} // namespace imperative
} // namespace paddle
USE_OP(mul);

@@ -0,0 +1,130 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-19.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/imperative/type_defs.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
static framework::RuntimeContext PrepareRuntimeContext(
const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
framework::VariableValueMap inputs, outputs;
for (auto& in_pair : ins) {
auto& in_ctx = inputs[in_pair.first];
in_ctx.reserve(in_pair.second.size());
for (auto& in_var : in_pair.second) {
in_ctx.emplace_back(in_var->MutableVar());
}
}
for (auto& out_pair : outs) {
auto& out_ctx = outputs[out_pair.first];
out_ctx.reserve(out_pair.second.size());
for (auto& out_var : out_pair.second) {
out_ctx.emplace_back(out_var->MutableVar());
}
}
return framework::RuntimeContext(std::move(inputs), std::move(outputs));
}
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
if (op_info.proto_ == nullptr) {
return {};
}
framework::VariableNameMap result;
for (auto& var :
is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
auto it = varbase_map.find(var.name());
if (it == varbase_map.end()) {
PADDLE_ENFORCE_EQ(
var.dispensable(), true,
"Var: %s not dispensable and there are no such var in inputs",
var.name());
result[var.name()] = {};
} else {
auto& var_vector = it->second;
std::vector<std::string> args;
args.reserve(var_vector.size());
for (auto& var_base : var_vector) {
args.emplace_back(var_base->Name());
}
result[var.name()] = std::move(args);
}
}
return result;
}
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_prepare_op, test_prepare_op) {
std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
framework::OpDesc desc;
platform::CPUPlace place;
vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
place);
var_pair x_pair = var_pair("X", vb_vector(1, vin));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap split_attr_map;
const auto& info = framework::OpInfoMap::Instance().Get("split");
framework::VariableNameMap var_in_map =
CreateVarNameMap(info, "split", ins, true);
framework::VariableNameMap var_out_map =
CreateVarNameMap(info, "split", outs, false);
framework::OperatorWithKernel op("split", var_in_map, var_out_map,
split_attr_map);
framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp =
PreparedOp::Prepare(ctx, op, place));
}
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
TEST(test_prepare_op, test_get_tensor_from_var) {
std::shared_ptr<imperative::VarBase> vout_error(
new imperative::VarBase(false, "vout_error"));
vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
auto* ts = GetTensorFromVar(*vout_error->MutableVar());
ASSERT_TRUE(ts != nullptr);
}
} // namespace imperative
} // namespace paddle
USE_OP(split);

@@ -0,0 +1,148 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Created by Jiabin on 2019-08-16.
//
#include <paddle/fluid/framework/op_registry.h>
#include <memory>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/imperative/tracer.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
TEST(test_tracer, test_trace_op) {
// Doing a mul
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(true, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true);
const auto& out_tensor = vout->Var().Get<framework::LoDTensor>();
for (size_t i = 0; i < vout->Var().Get<framework::LoDTensor>().numel(); i++) {
ASSERT_EQ(out_tensor.data<float>()[i], 20.0);
}
}
TEST(test_tracer, test_track_backward_output) {
// Doing a mul
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(false, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(true, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
}
TEST(test_tracer, test_track_backward_input) {
// Doing a mul
imperative::Tracer tracer;
std::shared_ptr<imperative::VarBase> x_in(
new imperative::VarBase(true, "x_in"));
std::shared_ptr<imperative::VarBase> y_in(
new imperative::VarBase(true, "y_in"));
std::shared_ptr<imperative::VarBase> vout(
new imperative::VarBase(false, "vout"));
platform::CPUPlace place;
std::vector<float> src_data(10, 2.0);
std::vector<int64_t> dims1 = {2, 5};
std::vector<int64_t> dims2 = {5, 2};
auto* x_in_tensor = x_in->MutableVar()->GetMutable<framework::LoDTensor>();
auto* y_in_tensor = y_in->MutableVar()->GetMutable<framework::LoDTensor>();
x_in_tensor->Resize(framework::make_ddim(dims1));
auto* mutable_x = x_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_x, place, src_data.data(),
sizeof(float) * src_data.size());
y_in_tensor->Resize(framework::make_ddim(dims2));
auto* mutable_y = y_in_tensor->mutable_data<float>(place);
paddle::memory::Copy(place, mutable_y, place, src_data.data(),
sizeof(float) * src_data.size());
var_pair x_pair = var_pair("X", vb_vector(1, x_in));
var_pair y_pair = var_pair("Y", vb_vector(1, y_in));
var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair, y_pair};
imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true));
}
} // namespace imperative
} // namespace paddle
USE_OP(mul);

File diff suppressed because it is too large

@@ -1,4 +1,4 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,46 +14,48 @@
#pragma once
#include <map>
#include <set>
#include <atomic>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "ThreadPool.h"
#include "paddle/fluid/imperative/engine.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::unordered_map<std::string, std::string>* grad_to_var);
platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs);
class Tracer {
DISABLE_COPY_AND_ASSIGN(Tracer);
public:
explicit Tracer(framework::BlockDesc* root_block);
Tracer() : engine_(new BasicEngine()) {}
virtual ~Tracer() {}
~Tracer() = default;
void Trace(OpBase* op, const VarBasePtrMap& inputs,
VarBasePtrMap* outputs, // NOLINT
framework::AttributeMap attrs_map,
const platform::Place expected_place,
const bool stop_gradient = false);
void TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward);
bool ComputeRequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs,
bool trace_backward);
void TraceBackward(const std::shared_ptr<OpBase>& fwd_op,
const framework::OpDesc& fwd_op_desc,
const NameVarBaseMap& ins, const NameVarBaseMap& outs);
Engine* GetDefaultEngine() const { return engine_.get(); }
private:
platform::Place GetPlace(const VarBasePtrMap& inputs);
static size_t GenerateUniqueId() {
static std::atomic<size_t> id{0};
return id.fetch_add(1);
}
framework::BlockDesc* root_block_;
private:
std::unique_ptr<Engine> engine_;
};
} // namespace imperative

@@ -17,8 +17,6 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle {
@@ -26,18 +24,10 @@ namespace imperative {
class VarBase;
class OpBase;
class Tracer;
typedef std::map<std::string, std::vector<std::shared_ptr<VarBase>>>
VarBasePtrMap;
typedef std::vector<std::weak_ptr<VarBase>> VarBaseWeakPtrList;
typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
typedef std::unordered_map<
const VarBase*,
std::pair<platform::Place,
std::vector<std::pair<int, std::shared_ptr<VarBase>>>>>
BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}}
typedef std::unordered_map<const VarBase*, std::pair<int, bool>> GradientRef;
// var_grad -> {ref_times, is_first_to_be_accumulate}
using NameVarBaseMap =
std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
} // namespace imperative
} // namespace paddle

@@ -1,6 +1,6 @@
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor imperative_profiler nccl_context)
feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
analysis_predictor imperative_profiler nccl_context imperative_flag)
if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)

File diff suppressed because it is too large

@@ -14,10 +14,6 @@ limitations under the License. */
#pragma once
#include <Python.h>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/nccl_context.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"

@@ -18,6 +18,7 @@ from paddle.fluid import core
from paddle.fluid import framework
from .tracer import Tracer
import logging
import objgraph
__all__ = [
'no_grad',
@@ -123,7 +124,7 @@ def guard(place=None):
"""
train = framework.Program()
startup = framework.Program()
tracer = Tracer(train.current_block().desc)
tracer = Tracer()
if place is None:
if core.is_compiled_with_cuda():
@@ -138,19 +139,22 @@
yield
def _print_debug_msg():
def _print_debug_msg(limit=5, is_test=False):
if not core._is_dygraph_debug_enabled():
logging.warn(
'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
)
return
unique_name_size = len(framework.unique_name.generator.ids)
tracer_var_size = len(framework._dygraph_tracer()._vars)
alive_cpp_var_size = len(core.VarBase._alive_vars())
logging.warn(
'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
.format(unique_name_size, tracer_var_size, alive_cpp_var_size))
if not is_test:
logging.warn(
'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
.format(unique_name_size, tracer_var_size, alive_cpp_var_size))
objgraph.show_growth(limit=limit)
else:
return unique_name_size, tracer_var_size, alive_cpp_var_size
def to_variable(value, block=None, name=None):

@@ -20,7 +20,7 @@ from . import layers
from . import parallel_helper
from .. import framework
from ..layers import collective
from . import to_variable
from . import to_variable, no_grad
__all__ = ["prepare_context"]
@@ -197,6 +197,7 @@ class DataParallel(layers.Layer):
for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
nn.reshape(x=g_var, shape=g_shape, inplace=True)
@no_grad
def apply_collective_grads(self):
"""
AllReduce the Parameters' gradient.

Some files were not shown because too many files have changed in this diff
