fix cuda dev_ctx allocator cmake deps, test=develop (#19953)

5 years ago · 37f76407b0
parent ebff68fa74
commit 37f76407b0
12 changed files with 159 additions and 308 deletions
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@ -17,8 +17,6 @@ cc_library(memory
        memcpy)
 if (WITH_GPU)
    add_dependencies(malloc cuda_device_context_allocator_pool)
    target_link_libraries(malloc cuda_device_context_allocator_pool)
    nv_test(malloc_test
            SRCS malloc_test.cu
            DEPS device_context malloc)
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@ -14,12 +14,6 @@ endif()
 if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
  nv_library(cuda_device_context_allocation SRCS cuda_device_context_allocation.cc
      DEPS allocator enforce place ${MKLDNN_CTX_DEPS})
  nv_library(cuda_device_context_allocator SRCS cuda_device_context_allocator.cc 
      DEPS allocator enforce place cuda_device_context_allocation ${MKLDNN_CTX_DEPS})
  nv_library(cuda_device_context_allocator_pool SRCS cuda_device_context_allocator_pool.cc
      DEPS allocator enforce place cuda_device_context_allocation cuda_device_context_allocator ${MKLDNN_CTX_DEPS})
 endif()
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocation.cc
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocation.cc
@ -1,47 +0,0 @@
 // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
 #include <utility>
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 CUDADeviceContextAllocation::CUDADeviceContextAllocation(
    AllocationPtr allocation)
    : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
      underlying_allocation_(std::move(allocation)) {}
 CUDADeviceContextAllocation::~CUDADeviceContextAllocation() {
  PADDLE_ENFORCE_NOT_NULL(
      dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
  auto *p_allocation = underlying_allocation_.release();
  VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
          << p_allocation;
  dev_ctx_->AddStreamCallback([p_allocation] {
    VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
    AllocationDeleter()(p_allocation);
  });
 }
 void CUDADeviceContextAllocation::SetCUDADeviceContext(
    const platform::CUDADeviceContext *dev_ctx) {
  dev_ctx_ = dev_ctx;
 }
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocation.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocation.h
@ -1,42 +0,0 @@
 // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 /**
 * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation.
 * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath
 * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream
 * which deletes allocation in the callback.
 */
 class CUDADeviceContextAllocation : public Allocation {
 public:
  explicit CUDADeviceContextAllocation(AllocationPtr allocation);
  ~CUDADeviceContextAllocation();
  void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx);
 private:
  AllocationPtr underlying_allocation_;
  const platform::CUDADeviceContext *dev_ctx_{nullptr};
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.cc
@ -1,66 +0,0 @@
 // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 CUDADeviceContextAllocator::CUDADeviceContextAllocator(
    const platform::CUDAPlace place, cudaStream_t default_stream)
    : place_(place), default_stream_(default_stream) {
  platform::CUDADeviceGuard guard(place_.device);
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaEventCreate(&event_, cudaEventDisableTiming),
      "Create event failed in CUDADeviceContextAllocator");
 }
 CUDADeviceContextAllocator::~CUDADeviceContextAllocator() {
  if (event_) {
    platform::CUDADeviceGuard guard(place_.device);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventDestroy(event_),
        "Destory event failed in CUDADeviceContextAllocator destroctor");
  }
 }
 Allocation *CUDADeviceContextAllocator::AllocateImpl(size_t size) {
  PADDLE_ENFORCE_NOT_NULL(
      default_stream_,
      "Didn't set default stream for CUDADeviceContextAllocator");
  platform::CUDADeviceGuard guard(place_.device);
  auto allocation =
      new CUDADeviceContextAllocation(memory::Alloc(place_, size));
  // Wait for the event on stream
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaEventRecord(event_, default_stream_),
      "Failed to record event in CUDADeviceContextAllocator");
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaStreamWaitEvent(default_stream_, event_, 0),
      "Failed to wait event in CUDADeviceContextAllocator");
  return allocation;
 }
 void CUDADeviceContextAllocator::FreeImpl(Allocation *allocation) {
  delete allocation;
 }
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@ -15,15 +15,58 @@
 #pragma once
 #include <cuda_runtime.h>
 #include <map>
 #include <memory>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace platform {
 class CUDADeviceContext;
 }  // namespace platform
 namespace memory {
 namespace allocation {
 /**
 * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation.
 * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath
 * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream
 * which deletes allocation in the callback.
 */
 class CUDADeviceContextAllocation : public Allocation {
 public:
  explicit CUDADeviceContextAllocation(AllocationPtr allocation)
      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
        underlying_allocation_(std::move(allocation)) {}
  ~CUDADeviceContextAllocation() {
    PADDLE_ENFORCE_NOT_NULL(
        dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
    auto *p_allocation = underlying_allocation_.release();
    VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
            << p_allocation;
    dev_ctx_->AddStreamCallback([p_allocation] {
      VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
      AllocationDeleter()(p_allocation);
    });
  }
  void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) {
    dev_ctx_ = dev_ctx;
  }
 private:
  AllocationPtr underlying_allocation_;
  const platform::CUDADeviceContext *dev_ctx_{nullptr};
 };
 /**
 * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation
 * after waiting for a self-created event on the default stream. It does so to
@ -33,12 +76,42 @@ namespace allocation {
 class CUDADeviceContextAllocator : public Allocator {
 public:
  explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
-                                      cudaStream_t default_stream);
+                                      cudaStream_t default_stream)
-  ~CUDADeviceContextAllocator();
+      : place_(place), default_stream_(default_stream) {
    platform::CUDADeviceGuard guard(place_.device);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreate(&event_, cudaEventDisableTiming),
        "Create event failed in CUDADeviceContextAllocator");
  }
  ~CUDADeviceContextAllocator() {
    if (event_) {
      platform::CUDADeviceGuard guard(place_.device);
      PADDLE_ENFORCE_CUDA_SUCCESS(
          cudaEventDestroy(event_),
          "Destory event failed in CUDADeviceContextAllocator destroctor");
    }
  }
 protected:
-  Allocation *AllocateImpl(size_t size) override;
+  Allocation *AllocateImpl(size_t size) override {
-  void FreeImpl(Allocation *allocation) override;
+    PADDLE_ENFORCE_NOT_NULL(
        default_stream_,
        "Didn't set default stream for CUDADeviceContextAllocator");
    platform::CUDADeviceGuard guard(place_.device);
    auto allocation =
        new CUDADeviceContextAllocation(memory::Alloc(place_, size));
    // Wait for the event on stream
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventRecord(event_, default_stream_),
        "Failed to record event in CUDADeviceContextAllocator");
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaStreamWaitEvent(default_stream_, event_, 0),
        "Failed to wait event in CUDADeviceContextAllocator");
    return allocation;
  }
  void FreeImpl(Allocation *allocation) override { delete allocation; }
 private:
  platform::CUDAPlace place_;
@ -46,6 +119,49 @@ class CUDADeviceContextAllocator : public Allocator {
  cudaStream_t default_stream_{nullptr};
 };
 /**
 * CUDADeviceContextAllocatorPool is a singletion stores mapping from
 * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
 * CUDADeviceContext's compute stream isn't default stream, it can call this
 * class to allocate GPU memory which will be released by a callback after
 * stream execution.
 */
 class CUDADeviceContextAllocatorPool {
 public:
  static CUDADeviceContextAllocatorPool &Instance() {
    static CUDADeviceContextAllocatorPool pool;
    return pool;
  }
  AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
    auto iter =
        allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
    PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
                      "CUDADeviceContextAllocatorPool initialization error");
    auto &allocator = iter->second;
    AllocationPtr allocation = allocator->Allocate(size);
    static_cast<CUDADeviceContextAllocation *>(allocation.get())
        ->SetCUDADeviceContext(&dev_ctx);
    return allocation;
  }
 private:
  CUDADeviceContextAllocatorPool() {
    std::vector<int> devices = platform::GetSelectedDevices();
    for (int i : devices) {
      auto place = platform::CUDAPlace(i);
      auto compute_stream =
          platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
      auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
          new CUDADeviceContextAllocator(place, compute_stream));
      allocators_.insert(make_pair(place, allocator));
    }
  }
  std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
      allocators_;
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.cc
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.cc
@ -1,59 +0,0 @@
 // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h"
 #include <utility>
 #include <vector>
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 CUDADeviceContextAllocatorPool &CUDADeviceContextAllocatorPool::Instance() {
  static CUDADeviceContextAllocatorPool pool;
  return pool;
 }
 AllocationPtr CUDADeviceContextAllocatorPool::Alloc(
    const platform::CUDADeviceContext &dev_ctx, size_t size) {
  auto iter =
      allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
  PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
                    "CUDADeviceContextAllocatorPool initialization error");
  auto &allocator = iter->second;
  AllocationPtr allocation = allocator->Allocate(size);
  static_cast<CUDADeviceContextAllocation *>(allocation.get())
      ->SetCUDADeviceContext(&dev_ctx);
  return allocation;
 }
 CUDADeviceContextAllocatorPool::CUDADeviceContextAllocatorPool() {
  std::vector<int> devices = platform::GetSelectedDevices();
  for (int i : devices) {
    auto place = platform::CUDAPlace(i);
    auto compute_stream =
        platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
    auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
        new CUDADeviceContextAllocator(place, compute_stream));
    allocators_.insert(make_pair(place, allocator));
  }
 }
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h
@ -1,48 +0,0 @@
 // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
 #include <map>
 #include <memory>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
 /**
 * CUDADeviceContextAllocatorPool is a singletion stores mapping from
 * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
 * CUDADeviceContext's compute stream isn't default stream, it can call this
 * class to allocate GPU memory which will be released by a callback after
 * stream execution.
 */
 class CUDADeviceContextAllocatorPool {
 public:
  static CUDADeviceContextAllocatorPool &Instance();
  AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size);
 private:
  CUDADeviceContextAllocatorPool();
  std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
      allocators_;
 };
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@ -17,10 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h"
 #endif
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
@ -35,26 +31,5 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) {
  return allocation::AllocatorFacade::Instance().Alloc(place, size);
 }
 AllocationPtr Alloc(const platform::DeviceContext &dev_ctx, size_t size) {
  auto place = dev_ctx.GetPlace();
 #ifdef PADDLE_WITH_CUDA
  if (size == 0 || !platform::is_gpu_place(place)) {
    return Alloc(place, size);
  }
  auto *default_dev_ctx = static_cast<platform::CUDADeviceContext *>(
      platform::DeviceContextPool::Instance().Get(place));
  auto &desired_dev_ctx =
      static_cast<const platform::CUDADeviceContext &>(dev_ctx);
  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
    return Alloc(place, size);
  } else {
    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
        desired_dev_ctx, size);
  }
 #else
  return Alloc(place, size);
 #endif
 }
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -70,7 +70,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
    ${dgc_deps})
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@ -18,11 +18,39 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "glog/logging.h"
 namespace paddle {
 namespace memory {
 AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
  auto place = dev_ctx.GetPlace();
 #ifdef PADDLE_WITH_CUDA
  if (size == 0 || !platform::is_gpu_place(place)) {
    return Alloc(place, size);
  }
  auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  auto& desired_dev_ctx =
      static_cast<const platform::CUDADeviceContext&>(dev_ctx);
  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
    return Alloc(place, size);
  } else {
    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
        desired_dev_ctx, size);
  }
 #else
  return Alloc(place, size);
 #endif
 }
 }  // namespace memory
 }  // namespace paddle
 namespace paddle {
 namespace platform {
@ -174,6 +202,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
  mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
 };
 void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  if (required_workspace_bytes <= WorkspaceSize()) {
    return;
  }
  // reset allocation first before re-allocate to save memory
  allocation_.reset();
  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  CUDADeviceGuard guard(place_.device);
  compute_capability_ = GetCUDAComputeCapability(place_.device);
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@ -220,14 +220,7 @@ class CudnnWorkspaceHandle {
    ResetWorkspace();
  }
-  inline void ReallocWorkspace(size_t required_workspace_bytes) {
+  void ReallocWorkspace(size_t required_workspace_bytes);
    if (required_workspace_bytes <= WorkspaceSize()) {
      return;
    }
    // reset allocation first before re-allocate to save memory
    allocation_.reset();
    allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
  }
  inline void ResetWorkspace() { allocation_ = nullptr; }