Use system allocator in OpTest (#21335)
* use system allocator in unittests, test=develop
* fix op bugs, test=develop
* fix tensor copy bug when src and dst are the same, test=develop
parent 007c997572
commit 09696d5df8
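The commit message mentions a tensor-copy bug when source and destination are the same tensor; the new test below exercises exactly that path with the system allocator forced on. As a rough standalone illustration of the kind of aliasing guard such a fix typically needs (a hypothetical sketch only, not Paddle's actual TensorCopy change):

// Standalone illustration only: an aliasing guard of the kind the commit
// message describes. SimpleCopy is a hypothetical helper, not Paddle code.
#include <cstring>
#include <vector>

void SimpleCopy(const std::vector<float> &src, std::vector<float> *dst) {
  if (&src == dst) {
    // Source and destination are the same object; resizing could reallocate
    // and free the buffer being read, so return without copying.
    return;
  }
  dst->resize(src.size());
  std::memcpy(dst->data(), src.data(), src.size() * sizeof(float));
}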
@@ -0,0 +1,96 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstring>
#include <random>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"

DECLARE_bool(use_system_allocator);

namespace paddle {
namespace framework {

static std::vector<platform::Place> CreatePlaceList() {
  std::vector<platform::Place> places;
  places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  places.emplace_back(platform::CUDAPlace(0));
#endif
  return places;
}

template <typename T>
static bool CopySameTensorTestMain(const DDim &dims,
                                   const platform::Place &src_place,
                                   const platform::Place &dst_place,
                                   bool sync_copy) {
  // Force the test to go through the system allocator.
  FLAGS_use_system_allocator = true;

  // Step 1: create a CPU tensor and initialize it with random values.
  Tensor src_cpu_tensor;
  {
    src_cpu_tensor.Resize(dims);
    auto *src_ptr_cpu = src_cpu_tensor.mutable_data<T>(platform::CPUPlace());
    int64_t num = src_cpu_tensor.numel();
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<T> dist(-1000, 1000);
    for (int64_t i = 0; i < num; ++i) {
      src_ptr_cpu[i] = dist(gen);
    }
  }

  // Step 2: copy the source tensor to the destination place.
  Tensor dst_cpu_tensor;
  {
    Tensor src_tensor;
    TensorCopySync(src_cpu_tensor, src_place, &src_tensor);

    // The source tensor and the destination tensor are the same object.
    if (sync_copy) {
      TensorCopySync(src_tensor, dst_place, &src_tensor);
    } else {
      TensorCopy(src_tensor, dst_place, &src_tensor);
      // Wait until the asynchronous copy has finished before reading back.
      platform::DeviceContextPool::Instance().Get(src_place)->Wait();
      platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
    }

    // Copy the result back to a CPU tensor for comparison.
    TensorCopySync(src_tensor, platform::CPUPlace(), &dst_cpu_tensor);
  }

  // Step 3: the self-copy must leave the data bit-for-bit identical.
  const void *ground_truth_ptr = src_cpu_tensor.data<void>();
  const void *result_ptr = dst_cpu_tensor.data<void>();
  size_t byte_num = product(dims) * sizeof(T);
  return std::memcmp(ground_truth_ptr, result_ptr, byte_num) == 0;
}

TEST(test_tensor_copy, test_copy_same_tensor) {
  using DataType = float;
  auto dims = make_ddim({3, 4, 5});

  auto places = CreatePlaceList();
  for (auto &src_p : places) {
    for (auto &dst_p : places) {
      ASSERT_TRUE(CopySameTensorTestMain<DataType>(dims, src_p, dst_p, true));
      ASSERT_TRUE(CopySameTensorTestMain<DataType>(dims, src_p, dst_p, false));
    }
  }
}

}  // namespace framework
}  // namespace paddle