supports thread-binding stream, test=develop (#23177)

5 years ago · 75ebb48a91
parent 708ded584e
commit 75ebb48a91
6 changed files with 344 additions and 90 deletions
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@ -44,6 +44,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

 add_subdirectory(dynload)
+add_subdirectory(stream)

 cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
 cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
@ -54,7 +55,7 @@ IF(WITH_DGC)
 ENDIF()

 IF(WITH_GPU)
-    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
+    set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
 ENDIF()

 IF(WITH_MKLDNN)
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@ -211,6 +211,34 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }

+thread_local std::unordered_map<const CUDADeviceContext*,
+                                std::unique_ptr<CUDAContext>>
+    CUDADeviceContext::thread_ctx_;
+thread_local std::mutex CUDADeviceContext::ctx_mtx_;
+
+void CUDAContext::InitEigenContext(const stream::CUDAStream& stream) {
+  eigen_stream_.reset(new EigenCudaStreamDevice());
+  eigen_stream_->Reinitialize(&stream.stream(), place_);
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+}
+
+CUDAContext::CUDAContext(const CUDAPlace& place,
+                         const enum stream::Priority& priority) {
+  place_ = place;
+  CUDADeviceGuard guard(place_.device);
+  stream_.Init(place, priority);
+  InitEigenContext(stream_);
+  InitCuBlasContext(stream_);
+  InitCuDNNContext(stream_);
+  InitCallbackManager(stream_);
+}
+
+CUDAContext::~CUDAContext() {
+  CUDADeviceGuard guard(place_.device);
+  DestoryCuDNNContext();
+  DestoryCuBlasContext();
+}
+
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  CUDADeviceGuard guard(place_.device);
  compute_capability_ = GetCUDAComputeCapability(place_.device);
@ -218,18 +246,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
  max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
  max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device);
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
-  eigen_stream_.reset(new EigenCudaStreamDevice());
-  eigen_stream_->Reinitialize(&stream_, place);
-  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
-  cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));
-
-  if (TensorCoreAvailable()) {
-#if CUDA_VERSION >= 9000
-    cublas_tensor_core_handle_.reset(
-        new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
-#endif
-  }

  driver_version_ = GetCUDADriverVersion(place_.device);
  runtime_version_ = GetCUDARuntimeVersion(place_.device);
@ -263,48 +279,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
          << "Please recompile or reinstall Paddle with compatible CUDA "
             "version.";
    }
-
-    if (dynload::HasCUDNN()) {
-      auto local_cudnn_version = cudnn_dso_ver / 100;
-      auto compile_cudnn_version = CUDNN_VERSION / 100;
-      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
-        LOG_FIRST_N(WARNING, 1)
-            << "WARNING: device: " << place_.device
-            << ". The installed Paddle is compiled with CUDNN "
-            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
-            << ", but CUDNN version in your machine is "
-            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
-            << ", which may cause serious incompatible bug. "
-            << "Please recompile or reinstall Paddle with compatible CUDNN "
-               "version.";
-      }
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnCreate(&cudnn_handle_),
-          "Failed to create Cudnn handle in DeviceContext");
-      PADDLE_ENFORCE_CUDA_SUCCESS(
-          dynload::cudnnSetStream(cudnn_handle_, stream_),
-          "Failed to set stream for Cudnn handle in DeviceContext");
-    } else {
-      cudnn_handle_ = nullptr;
-    }
  }
-
-  callback_manager_.reset(new StreamCallbackManager(stream_));
+  default_ctx_.reset(new CUDAContext(place_));
 }

 CUDADeviceContext::~CUDADeviceContext() {
  SetDeviceId(place_.device);
  Wait();
  WaitStreamCallback();
-  cublas_handle_.reset();
-  cublas_tensor_core_handle_.reset();
-  eigen_stream_.reset();
-  eigen_device_.reset();
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
-  if (cudnn_handle_) {
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
-                                "Failed to destory Cudnn handle");
-  }
 #if defined(PADDLE_WITH_NCCL)
  if (nccl_comm_) {
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
@ -314,22 +296,7 @@ CUDADeviceContext::~CUDADeviceContext() {

 Place CUDADeviceContext::GetPlace() const { return place_; }

-void CUDADeviceContext::Wait() const {
-  cudaError_t e_sync = cudaSuccess;
-#if !defined(_WIN32)
-  e_sync = cudaStreamSynchronize(stream_);
-#else
-  while (e_sync = cudaStreamQuery(stream_)) {
-    if (e_sync == cudaErrorNotReady) continue;
-    break;
-  }
-#endif
-
-  PADDLE_ENFORCE_CUDA_SUCCESS(
-      e_sync, platform::errors::Fatal(
-                  "cudaStreamSynchronize raises error: %s, errono: %d",
-                  cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
-}
+void CUDADeviceContext::Wait() const { context()->Wait(); }

 int CUDADeviceContext::GetComputeCapability() const {
  return compute_capability_;
@ -346,24 +313,26 @@ int CUDADeviceContext::GetMaxThreadsPerBlock() const {
 }

 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
-  return eigen_device_.get();
+  return context()->EigenDevice().get();
 }

 bool CUDADeviceContext::tensor_core_available() const {
-  return cublas_tensor_core_handle_ != nullptr;
+  return context()->CublasTensorCoreHandle() != nullptr;
 }

 dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
  return max_grid_dim_size_;
 }

-cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
+  return context()->CudnnHandle();
+}

 CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
  return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
 }

-cudaStream_t CUDADeviceContext::stream() const { return stream_; }
+cudaStream_t CUDADeviceContext::stream() const { return context()->Stream(); }

 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/stream/CMakeLists.txt
+++ b/paddle/fluid/platform/stream/CMakeLists.txt
@ -0,0 +1,3 @@
+IF(WITH_GPU)
+cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce)
+ENDIF()
--- a/paddle/fluid/platform/stream/cuda_stream.cc
+++ b/paddle/fluid/platform/stream/cuda_stream.cc
@ -0,0 +1,60 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/stream/cuda_stream.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace stream {
+
+constexpr int64_t kHighPriority = -1;
+constexpr int64_t kNormalPriority = 0;
+constexpr unsigned int kDefaultFlag = cudaStreamDefault;
+
+bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
+  PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                    platform::errors::InvalidArgument(
+                        "Cuda stream must be created using cuda place."));
+  place_ = place;
+  CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
+  if (priority == Priority::HIGH) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kHighPriority),
+        platform::errors::Fatal("High priority cuda stream creation failed."));
+  } else if (priority == Priority::NORMAL) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kNormalPriority),
+        platform::errors::Fatal(
+            "Normal priority cuda stream creation failed."));
+  }
+  VLOG(3) << "CUDAStream Init stream: " << stream_
+          << ", priority: " << static_cast<int>(priority);
+  return true;
+}
+
+void CUDAStream::Destroy() {
+  CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
+  if (stream_) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaStreamDestroy(stream_),
+        platform::errors::Fatal("Cuda stream destruction failed."));
+  }
+  stream_ = nullptr;
+}
+
+}  // namespace stream
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/stream/cuda_stream.h
+++ b/paddle/fluid/platform/stream/cuda_stream.h
@ -0,0 +1,61 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace platform {
+namespace stream {
+
+#ifdef PADDLE_WITH_CUDA
+
+enum class Priority : uint8_t {
+  NIL = 0x0,
+  HIGH = 0x1,
+  NORMAL = 0x2,
+};
+
+class CUDAStream final {
+ public:
+  CUDAStream() = default;
+  CUDAStream(const Place& place,
+             const enum Priority& priority = Priority::NORMAL) {
+    Init(place, priority);
+  }
+  virtual ~CUDAStream() { Destroy(); }
+
+  bool Init(const Place& place,
+            const enum Priority& priority = Priority::NORMAL);
+
+  const cudaStream_t& stream() const { return stream_; }
+  void Destroy();
+
+ private:
+  Place place_;
+  cudaStream_t stream_{nullptr};
+  Priority priority_{Priority::NORMAL};
+
+  DISABLE_COPY_AND_ASSIGN(CUDAStream);
+};
+
+#endif
+
+}  // namespace stream
+}  // namespace platform
+}  // namespace paddle