supports thread-binding stream, test=develop (#23177)

revert-23830-2.0-beta
石晓伟 5 years ago committed by GitHub
parent 708ded584e
commit 75ebb48a91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -44,6 +44,7 @@ cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload)
add_subdirectory(stream)
cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
@ -54,7 +55,7 @@ IF(WITH_DGC)
ENDIF()
IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()
IF(WITH_MKLDNN)

@ -211,6 +211,34 @@ void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
thread_local std::unordered_map<const CUDADeviceContext*,
std::unique_ptr<CUDAContext>>
CUDADeviceContext::thread_ctx_;
thread_local std::mutex CUDADeviceContext::ctx_mtx_;
void CUDAContext::InitEigenContext(const stream::CUDAStream& stream) {
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream.stream(), place_);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
}
CUDAContext::CUDAContext(const CUDAPlace& place,
const enum stream::Priority& priority) {
place_ = place;
CUDADeviceGuard guard(place_.device);
stream_.Init(place, priority);
InitEigenContext(stream_);
InitCuBlasContext(stream_);
InitCuDNNContext(stream_);
InitCallbackManager(stream_);
}
CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetCUDAComputeCapability(place_.device);
@ -218,18 +246,6 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device);
max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH));
if (TensorCoreAvailable()) {
#if CUDA_VERSION >= 9000
cublas_tensor_core_handle_.reset(
new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH));
#endif
}
driver_version_ = GetCUDADriverVersion(place_.device);
runtime_version_ = GetCUDARuntimeVersion(place_.device);
@ -263,48 +279,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< "Please recompile or reinstall Paddle with compatible CUDA "
"version.";
}
if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
<< compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
<< ", but CUDNN version in your machine is "
<< local_cudnn_version / 10 << "." << local_cudnn_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible CUDNN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreate(&cudnn_handle_),
"Failed to create Cudnn handle in DeviceContext");
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, stream_),
"Failed to set stream for Cudnn handle in DeviceContext");
} else {
cudnn_handle_ = nullptr;
}
}
callback_manager_.reset(new StreamCallbackManager(stream_));
default_ctx_.reset(new CUDAContext(place_));
}
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
Wait();
WaitStreamCallback();
cublas_handle_.reset();
cublas_tensor_core_handle_.reset();
eigen_stream_.reset();
eigen_device_.reset();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
if (cudnn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
"Failed to destory Cudnn handle");
}
#if defined(PADDLE_WITH_NCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
@ -314,22 +296,7 @@ CUDADeviceContext::~CUDADeviceContext() {
Place CUDADeviceContext::GetPlace() const { return place_; }
void CUDADeviceContext::Wait() const {
cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_);
#else
while (e_sync = cudaStreamQuery(stream_)) {
if (e_sync == cudaErrorNotReady) continue;
break;
}
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(
e_sync, platform::errors::Fatal(
"cudaStreamSynchronize raises error: %s, errono: %d",
cudaGetErrorString(e_sync), static_cast<int>(e_sync)));
}
void CUDADeviceContext::Wait() const { context()->Wait(); }
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability_;
@ -346,24 +313,26 @@ int CUDADeviceContext::GetMaxThreadsPerBlock() const {
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
return context()->EigenDevice().get();
}
bool CUDADeviceContext::tensor_core_available() const {
return cublas_tensor_core_handle_ != nullptr;
return context()->CublasTensorCoreHandle() != nullptr;
}
dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
}
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return context()->CudnnHandle();
}
CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
cudaStream_t CUDADeviceContext::stream() const { return stream_; }
cudaStream_t CUDADeviceContext::stream() const { return context()->Stream(); }
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());

File diff suppressed because it is too large Load Diff

@ -0,0 +1,3 @@
IF(WITH_GPU)
cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce)
ENDIF()

@ -0,0 +1,60 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace stream {
constexpr int64_t kHighPriority = -1;
constexpr int64_t kNormalPriority = 0;
constexpr unsigned int kDefaultFlag = cudaStreamDefault;
bool CUDAStream::Init(const Place& place, const enum Priority& priority) {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::InvalidArgument(
"Cuda stream must be created using cuda place."));
place_ = place;
CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
if (priority == Priority::HIGH) {
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kHighPriority),
platform::errors::Fatal("High priority cuda stream creation failed."));
} else if (priority == Priority::NORMAL) {
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, kNormalPriority),
platform::errors::Fatal(
"Normal priority cuda stream creation failed."));
}
VLOG(3) << "CUDAStream Init stream: " << stream_
<< ", priority: " << static_cast<int>(priority);
return true;
}
void CUDAStream::Destroy() {
CUDADeviceGuard guard(boost::get<CUDAPlace>(place_).device);
if (stream_) {
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamDestroy(stream_),
platform::errors::Fatal("Cuda stream destruction failed."));
}
stream_ = nullptr;
}
} // namespace stream
} // namespace platform
} // namespace paddle

@ -0,0 +1,61 @@
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
namespace stream {
#ifdef PADDLE_WITH_CUDA
enum class Priority : uint8_t {
NIL = 0x0,
HIGH = 0x1,
NORMAL = 0x2,
};
class CUDAStream final {
public:
CUDAStream() = default;
CUDAStream(const Place& place,
const enum Priority& priority = Priority::NORMAL) {
Init(place, priority);
}
virtual ~CUDAStream() { Destroy(); }
bool Init(const Place& place,
const enum Priority& priority = Priority::NORMAL);
const cudaStream_t& stream() const { return stream_; }
void Destroy();
private:
Place place_;
cudaStream_t stream_{nullptr};
Priority priority_{Priority::NORMAL};
DISABLE_COPY_AND_ASSIGN(CUDAStream);
};
#endif
} // namespace stream
} // namespace platform
} // namespace paddle
Loading…
Cancel
Save