Feature/operator run place (#6783)

* "change operator interface" * "move devicepool to device_context" * "fix operator test" * "fix op_registry Run interface" * "net op passed. Need to fix nccl multi-Context" * "add nccl group function" * "add nccl group function" * "fix gpu count exceed 32 error" * "fix recurrent op, nccl op" * "change the other operators interface with Place" * "fix typo" * "fix pybind" * "fix device in python side" * "fix pybind failed" * "add init for test" * "fix CI"
8 years ago · 735eba2976
parent b8de140199
commit 735eba2976
68 changed files with 468 additions and 293 deletions
--- a/doc/design/block.md
+++ b/doc/design/block.md
@ -291,10 +291,10 @@ public:
  }

  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& place) const override {
    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
    for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, dev_ctx);
+      op->Run(scope, place);
    }
  }

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
@ -59,5 +59,5 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

-cc_library(init SRCS init.cc DEPS gflags executor place stringpiece)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
 cc_test(init_test SRCS init_test.cc DEPS init)
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@ -33,13 +33,7 @@ namespace framework {
 const std::string kFeedOpType = "feed";
 const std::string kFetchOpType = "fetch";

-DeviceContextPool* DeviceContextPool::pool = nullptr;
-
-Executor::Executor(const std::vector<platform::Place>& places) {
-  DeviceContextPool& pool = DeviceContextPool::Get();
-  auto borrowed_contexts = pool.Borrow(places);
-  device_contexts_.swap(borrowed_contexts);
-}
+Executor::Executor(const platform::Place& place) : place_(place) {}

 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
  if (var_type == proto::VarDesc::LOD_TENSOR) {
@ -71,7 +65,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  //    - will change to use multiple blocks for RNN op and Cond Op
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
  auto& block = pdesc.Block(block_id);
-  auto& device = device_contexts_[0];

  Scope* local_scope = scope;
  if (create_vars) {
@ -107,7 +100,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(3) << op->DebugString();
-    op->Run(*local_scope, *device);
+    op->Run(*local_scope, place_);
  }
  if (create_local_scope) {
    scope->DeleteScope(local_scope);
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@ -14,9 +14,6 @@ limitations under the License. */

 #pragma once

-#include <map>
-#include <unordered_map>
-
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/scope.h"
@ -26,96 +23,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class DeviceContextPool {
- public:
-  static DeviceContextPool& Get() {
-    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
-    return *pool;
-  }
-
-  static DeviceContextPool& Create(const std::vector<platform::Place>& places) {
-    if (pool == nullptr) {
-      pool = new DeviceContextPool(places);
-    }
-    return *pool;
-  }
-
-  const platform::DeviceContext* Borrow(const platform::Place& place) {
-    auto range = device_contexts_.equal_range(place);
-    if (range.first == range.second) {
-      PADDLE_THROW(
-          "'Place' is not supported, Please re-compile with WITH_GPU "
-          "option");
-    }
-    return range.first->second;
-  }
-
-  std::vector<const platform::DeviceContext*> Borrow(
-      const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
-    std::vector<const platform::DeviceContext*> borrowed_contexts;
-    for (auto& place : places) {
-      auto range = device_contexts_.equal_range(place);
-      if (range.first == range.second) {
-        PADDLE_THROW(
-            "'Place' is not supported, Please re-compile with WITH_GPU "
-            "option");
-      }
-      // TODO(dzhwinter) : assign the first found device. Will enhanced later.
-      // device load balancer maybe useful here.
-      borrowed_contexts.emplace_back(range.first->second);
-    }
-    return borrowed_contexts;
-  }
-
-  explicit DeviceContextPool(const std::vector<platform::Place>& places) {
-    PADDLE_ENFORCE_GT(places.size(), 0);
-    for (size_t i = 0; i < places.size(); i++) {
-      if (platform::is_cpu_place(places[i])) {
-        device_contexts_.emplace(
-            places[i], new platform::CPUDeviceContext(
-                           boost::get<platform::CPUPlace>(places[i])));
-      } else if (platform::is_gpu_place(places[i])) {
-#ifdef PADDLE_WITH_CUDA
-        device_contexts_.emplace(
-            places[i], new platform::CUDADeviceContext(
-                           boost::get<platform::GPUPlace>(places[i])));
-#else
-        PADDLE_THROW(
-            "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
-            "option");
-#endif
-      }
-    }
-  }
-
-  ~DeviceContextPool() {}
-
- private:
-  static DeviceContextPool* pool;
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      return hash_(place.which());
-    }
-  };
-  std::unordered_multimap<const platform::Place, const platform::DeviceContext*,
-                          Hash>
-      device_contexts_;
-  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
-};
-
 class Executor {
 public:
  // TODO(dzhwinter) : Do not rely on this function, it will be removed
  explicit Executor(const platform::DeviceContext& device)
-      : Executor(std::vector<platform::Place>({device.GetPlace()})) {}
-
-  explicit Executor(const platform::Place& place)
-      : Executor(std::vector<platform::Place>({place})) {}
+      : Executor(device.GetPlace()) {}

-  explicit Executor(const std::vector<platform::Place>& places);
+  explicit Executor(const platform::Place& place);

  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
@ -128,7 +42,7 @@ class Executor {
           bool create_vars = true);

 private:
-  std::vector<const platform::DeviceContext*> device_contexts_;
+  const platform::Place place_;
 };

 }  // namespace framework
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@ -14,8 +14,8 @@
 #include <algorithm>
 #include <string>

-#include "paddle/framework/executor.h"
 #include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
 #include "paddle/string/piece.h"

@ -48,7 +48,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
  std::vector<platform::Place> places;
  for (auto &device : devices) {
    auto p = string::Piece(device);
-    if (string::Find(p, ':', 0) == string::Piece::npos) {
+    if (string::HasPrefix(p, "CPU")) {
      places.emplace_back(platform::CPUPlace());
    } else if (string::HasPrefix(p, "GPU")) {
 #ifdef PADDLE_WITH_CUDA
@ -69,10 +69,9 @@ bool InitDevices(const std::vector<std::string> &devices) {
                     return platform::is_cpu_place(place);
                   }) == places.end()) {
    places.emplace_back(platform::CPUPlace());
-    LOG(WARNING) << "Not specified any device, use CPU by Default.";
+    LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
  }
-  DeviceContextPool::Create(places);
-  return true;
+  platform::DeviceContextPool::Create(places);
  return true;
 }

--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@ -23,5 +23,9 @@ TEST(Init, InitDevices) {
 #ifdef PADDLE_WITH_CUDA
  std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
  ASSERT_EQ(InitDevices(ds2), true);
+
+  // test re-init
+  std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
+  ASSERT_EQ(InitDevices(ds3), true);
 #endif
 }
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@ -8,8 +8,7 @@ namespace framework {
 class CosineOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
 };

 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
 };

 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) {

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
-  op->Run(scope, dev_ctx);
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
  float scale_get = op->Attr<float>("scale");
  ASSERT_EQ(scale_get, scale);
 }
@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) {

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::framework::Scope scope;
-  paddle::platform::CPUDeviceContext dev_ctx;
-  op->Run(scope, dev_ctx);
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
  ASSERT_EQ(op->Attr<float>("scale"), 1.0);
 }

@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_type(paddle::framework::proto::AttrType::INT);
  attr->set_i(4);
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;
-  op->Run(scope, dev_ctx);
+  op->Run(scope, cpu_place);
  int test_attr = op->Attr<int>("test_attr");
  ASSERT_EQ(test_attr, 4);
 }
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/framework/operator.h"
 #include <algorithm>
 #include <atomic>
+
+#include "paddle/framework/executor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"

@ -388,11 +390,11 @@ class RuntimeInferShapeContext : public InferShapeContext {
 };

 void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::DeviceContext& dev_ctx) const {
+                             const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
-
-  ExecutionContext ctx(*this, scope, dev_ctx);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+  auto dev_ctx = pool.Borrow(place);

  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
@ -404,6 +406,8 @@ void OperatorWithKernel::Run(const Scope& scope,

  // check if op[type] have kernel for kernel_key
  OpKernelMap& kernels = kernels_iter->second;
+
+  ExecutionContext ctx(*this, scope, *dev_ctx);
  auto kernel_key = GetKernelType(ctx);
  auto kernel_iter = kernels.find(kernel_key);

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -83,8 +83,7 @@ class OperatorBase {
  virtual std::string DebugString() const;

  /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const = 0;
+  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;

  virtual bool IsNetOp() const { return false; }

@ -159,8 +158,7 @@ class OperatorBase {
 class NOP : public OperatorBase {
 public:
  using OperatorBase::OperatorBase;
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const platform::Place& place) const override {}
  std::unique_ptr<OperatorBase> Clone() const override {
    return std::unique_ptr<OperatorBase>(new NOP(*this));
  }
@ -383,8 +381,7 @@ class OperatorWithKernel : public OperatorBase {
                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const final;
+  void Run(const Scope& scope, const platform::Place& place) const final;

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
  AllOpKernels() {
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "paddle/framework/operator.h"
 #include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
 #include "paddle/framework/op_info.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"

 namespace paddle {
 namespace framework {
@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase {
  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope, const platform::Place& place) const override {
    ++op_run_num;
    ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
    ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase {
  int x{0};
 };

-class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
 public:
-  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
-                                           OpAttrChecker* op_checker)
+  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name,
  }
 }

-REGISTER_OP_WITHOUT_GRADIENT(
-    test_operator, paddle::framework::OpWithoutKernelTest,
-    paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
+REGISTER_OP_WITHOUT_GRADIENT(test_operator,
+                             paddle::framework::OpWithoutKernelTest,
+                             paddle::framework::OpWithoutKernelCheckerMaker);

 TEST(OperatorBase, all) {
+  paddle::framework::InitDevices({"CPU"});
  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("test_operator");
  BuildVar("input", {"IN1"}, op_desc.add_inputs());
@ -80,13 +80,13 @@ TEST(OperatorBase, all) {
  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

-  paddle::platform::CPUDeviceContext device_context;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  scope.Var("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
-  op->Run(scope, device_context);
+  op->Run(scope, cpu_place);
  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }

@ -123,7 +123,6 @@ template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel<float> {
 public:
  void Compute(const ExecutionContext& ctx) const {
-    std::cout << "this is cpu kernel" << std::endl;
    std::cout << ctx.op().DebugString() << std::endl;
    cpu_kernel_run_num++;
    ASSERT_EQ(ctx.op().Input("x"), "IN1");
@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,

 // test with single input
 TEST(OpKernel, all) {
+  paddle::framework::InitDevices({"CPU"});
  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
  BuildVar("x", {"IN1"}, op_desc.add_inputs());
@ -205,12 +205,12 @@ TEST(OpKernel, all) {
  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }

@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
  using namespace paddle::framework;

+  paddle::framework::InitDevices({"CPU"});
  proto::OpDesc op_desc;
+
  op_desc.set_type("op_multi_inputs_with_kernel");
  BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
  BuildVar("k", {"k0"}, op_desc.add_inputs());
@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) {
  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

-  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::platform::CPUPlace cpu_place;
  paddle::framework::Scope scope;
  scope.Var("x0")->GetMutable<LoDTensor>();
  scope.Var("x1")->GetMutable<LoDTensor>();
@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) {
  scope.Var("y1")->GetMutable<LoDTensor>();

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  op->Run(scope, cpu_device_context);
+  op->Run(scope, cpu_place);
 }

 class OperatorClone : public paddle::framework::OperatorBase {
@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase {
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {}
+           const paddle::platform::Place& place) const override {}
 };

 TEST(Operator, Clone) {
+  paddle::framework::InitDevices({"CPU"});
  OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                  paddle::framework::VariableNameMap{},
                  paddle::framework::AttributeMap{});
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@ -15,6 +15,7 @@
 #pragma once
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase {

 protected:
  size_t GetOffset(const framework::Scope &scope,
-                   const platform::DeviceContext &dev_ctx) const {
+                   const platform::Place &place) const {
    auto *i = scope.FindVar(Input("I"));
    PADDLE_ENFORCE(i != nullptr, "I must be set");
    auto &i_tensor = i->Get<framework::LoDTensor>();
    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
    size_t offset;
    if (platform::is_gpu_place(i_tensor.place())) {
      // FIXME: Avoid copy from GPU to CPU
--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@ -12,10 +12,12 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
 #include <numeric>
+
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                     const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
    auto &rank_table =
        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
          continue;
        }
        auto slice = out->Slice(out_offset, out_offset + len);
+
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+        auto &dev_ctx = *pool.Borrow(place);
+
        framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
                            dev_ctx, &slice);
        out_offset += len;
--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@ -15,6 +15,7 @@
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/var_type.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto *x = scope.FindVar(Input("X"));
    if (x == nullptr) {
      return;
@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase {
    PADDLE_ENFORCE(
        out != nullptr,
        "The Output(Out) should not be null if the Input(X) is set.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
  }
 };
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/operators/beam_search_decode_op.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                     const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+    auto& dev_ctx = *pool.Borrow(dev_place);
+
    framework::ExecutionContext ctx(*this, scope, dev_ctx);

    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase {
  }

  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+           const platform::Place& dev_place) const override {
    LOG(INFO) << "run beam search op";
    auto ids_var = scope.FindVar(Input("ids"));
    auto scores_var = scope.FindVar(Input("scores"));
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/operators/cond_op.h"
-
 #include "paddle/operators/gather.h"
 #include "paddle/operators/scatter.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
  }
 }

-void CondOp::Run(const Scope& scope,
-                 const platform::DeviceContext& dev_ctx) const {
+void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
+  auto& dev_ctx = *pool.Borrow(place);
+
  PrepareDataForSubnet(scope, dev_ctx);
  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+    sub_net_op_[i]->Run(*sub_scopes[i], place);
  }
  MergeDataFromSubnet(scope, dev_ctx);
 }
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase {
  }

  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override;
+           const platform::Place& place) const override;

 private:
  const int TRUE_BRANCH = 0;
--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp {
                     const framework::AttributeMap &attrs)
      : ConditionalOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto xs = InputTensors(scope);
    bool need_run = std::all_of(
        xs.begin(), xs.end(),
@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp {
      scopes->front() = &scope.NewScope();
      auto &cur_scope = *scopes->front();

+      framework::Executor exec(dev_place);
      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      framework::Executor exec(dev_ctx);
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
    }
  }
@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
                         const framework::AttributeMap &attrs)
      : ConditionalOp(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto xs = this->InputTensors(scope);
    bool need_run = std::all_of(
        xs.begin(), xs.end(),
@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp {
      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
      framework::Scope &cur_scope = *scopes[0];

+      framework::Executor exec(dev_place);
      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      framework::Executor exec(dev_ctx);
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);

-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
                                  Outputs(framework::GradVarName("Params")));

-      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
                                  Outputs(framework::GradVarName("X")));
    }
  }

 private:
  void AssignLocalGradientToGlobal(
-      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const platform::Place &place, const framework::Scope &cur_scope,
      const std::vector<std::string> &p_names,
      const std::vector<std::string> &pg_names) const {
    for (size_t i = 0; i < p_names.size(); ++i) {
@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
      auto assign = framework::OpRegistry::CreateOp(
          "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
          framework::AttributeMap{});
-      assign->Run(cur_scope, dev_ctx);
+      assign->Run(cur_scope, place);
      cur_scope.Rename(new_in_grad_name, in_grad_name);
    }
  }
--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto feed_var_name = Input("X");
    auto *feed_var = scope.FindVar(feed_var_name);

@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase {
    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
    auto &feed_item = feed_list.at(static_cast<size_t>(col));
    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
-    framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
+    framework::CopyFrom(feed_item, place, dev_ctx, out_item);
    out_item->set_lod(feed_item.lod());
  }
 };
--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@ -14,6 +14,7 @@

 #include "paddle/framework/feed_fetch_type.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}

  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto fetch_var_name = Input("X");
    auto *fetch_var = scope.FindVar(fetch_var_name);
    PADDLE_ENFORCE(fetch_var != nullptr,
@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase {

    // FIXME(yuyang18): Should we assume the fetch operator always generate
    // CPU outputs?
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
    CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
    dev_ctx.Wait();
    dst_item.set_lod(src_item.lod());
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &dev_place) const override {
    auto data_type =
        static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    auto value = Attr<float>("value");
@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase {
      auto cpu = platform::CPUPlace();
      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
    } else {
-      out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
    }
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(dev_place);
    math::set_constant(dev_ctx, &out, value);
  }
 };
--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@ -15,6 +15,7 @@
 #include "paddle/framework/data_type.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &out =
        detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                "Cannot find variable %s", Output("Out"))
@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase {
    auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    platform::CPUPlace cpu;
    auto force_cpu = Attr<bool>("force_cpu");
-    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
-                     framework::ToTypeIndex(dtype));
+    out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));

    framework::LoDTensor tensor;

-    if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) {
+    if (force_cpu || platform::is_cpu_place(place)) {
      tensor.ShareDataWith(out);
    } else {
      // Always make tensor in CPU memory.
@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase {
    framework::VisitDataType(
        dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));

-    if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) {
+    if (!force_cpu && platform::is_gpu_place(place)) {
      // Copy tensor to out
-      framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out);
+      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+      auto &dev_ctx = *pool.Borrow(place);
+      framework::CopyFrom(tensor, place, dev_ctx, &out);
    }
  }
 };
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}

  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto &out =
        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}

  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    // get input
    auto *var = scope.FindVar(Input(kInput));
    PADDLE_ENFORCE_NOT_NULL(var);
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@ -11,10 +11,10 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
+#include <fstream>

 #include "paddle/framework/op_registry.h"
-
-#include <fstream>
+#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {
+           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    std::ifstream fin(filename);
    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase {
    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
    framework::DeserializeFromStream(fin, tensor);

-    auto place = dev_ctx.GetPlace();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
+    auto &dev_ctx = *pool.Borrow(place);
+
    if (platform::is_gpu_place(place)) {
      // copy CPU to GPU
      framework::LoDTensor cpu_tensor;
--- a/Show More
+++ b/Show More