diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu
index 019d71d740..d65f319ab7 100644
--- a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu
+++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cu
@@ -17,31 +17,59 @@
 #include
 #include "dropout_impl.cuh"
 #include "include/cuda_runtime.h"
-
-__global__ void DropoutForwardKernel(const float *input, float *mask, float *output, size_t num_count,
+template <typename T>
+__global__ void DropoutForwardKernel(const T *input, T *mask, T *output, float *mask_f, size_t num_count,
                                      float keep_prob) {
   float scale = 1.f / keep_prob;
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_count; i += blockDim.x * gridDim.x) {
-    mask[i] = mask[i] <= keep_prob;
-    output[i] = scale * input[i] * mask[i];
+    mask_f[i] = mask_f[i] <= keep_prob;
+    output[i] = scale * input[i] * mask_f[i];
+    mask[i] = mask_f[i];
   }
 }
-
-void DropoutForward(const float *input, float *mask, float *output, size_t num_count, float drop_prob,
+template <>
+__global__ void DropoutForwardKernel(const half *input, half *mask, half *output, float *mask_f,
+                                     size_t num_count, float keep_prob) {
+  half scale = __float2half(1.f / keep_prob);
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_count; i += blockDim.x * gridDim.x) {
+    mask_f[i] = mask_f[i] <= keep_prob;
+    output[i] = scale * input[i] * __float2half(mask_f[i]);
+    mask[i] = __float2half(mask_f[i]);
+  }
+}
+template <typename T>
+void DropoutForward(const T *input, T *mask, T *output, float *mask_f, size_t num_count, float drop_prob,
                     cudaStream_t cuda_stream) {
-  DropoutForwardKernel<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>(input, mask, output, num_count,
-                                                                               drop_prob);
+  DropoutForwardKernel<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>(input, mask, output, mask_f,
+                                                                               num_count, drop_prob);
 }
-
-__global__ void DropoutBackwardKernel(const float *dy, const float *mask, float *dx, size_t num_count,
+template <typename T>
+__global__ void DropoutBackwardKernel(const T *dy, const T *mask, T *dx, size_t num_count,
                                       float keep_prob) {
   float scale = 1.f / keep_prob;
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_count; i += blockDim.x * gridDim.x) {
     dx[i] = scale * dy[i] * mask[i];
   }
 }
-
-void DropoutBackward(const float *dy, const float *mask, float *dx, size_t num_count, float drop_prob,
+template <>
+__global__ void DropoutBackwardKernel(const half *dy, const half *mask, half *dx, size_t num_count,
+                                      float keep_prob) {
+  half scale = __float2half(1.f / keep_prob);
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_count; i += blockDim.x * gridDim.x) {
+    dx[i] = scale * dy[i] * mask[i];
+  }
+}
+template <typename T>
+void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float drop_prob,
                      cudaStream_t cuda_stream) {
   DropoutBackwardKernel<<<GET_BLOCKS(num_count), GET_THREADS, 0, cuda_stream>>>(dy, mask, dx, num_count, drop_prob);
 }
+
+template void DropoutForward<float>(const float *input, float *mask, float *output, float *mask_f,
+                                    size_t num_count, float drop_prob, cudaStream_t cuda_stream);
+template void DropoutForward<half>(const half *input, half *mask, half *output, float *mask_f,
+                                   size_t num_count, float drop_prob, cudaStream_t cuda_stream);
+template void DropoutBackward<float>(const float *dy, const float *mask, float *dx, size_t num_count,
+                                     float drop_prob, cudaStream_t cuda_stream);
+template void DropoutBackward<half>(const half *dy, const half *mask, half *dx, size_t num_count,
+                                    float drop_prob, cudaStream_t cuda_stream);
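
A quick illustration of the new calling contract (not part of the patch): for any T, the caller now owns a float workspace mask_f of num_count elements, pre-filled with uniform randoms, because curand cannot generate half directly. Note also that the launcher parameter is named drop_prob in the .cu but is consumed as the keep probability (the kernels compute mask_f[i] <= keep_prob and scale by 1/keep_prob). A minimal host-side sketch under those assumptions, with invented buffer names and error checks omitted:

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <curand.h>
// Assumes the DropoutForward declaration from dropout_impl.cuh above.

void RunDropoutHalf(const half *d_input, half *d_output, half *d_mask,
                    size_t num_count, float keep_prob, cudaStream_t stream) {
  // curand can only generate float/double, so the uniform mask is drawn
  // into a float scratch buffer; the kernel then casts it to half.
  float *d_mask_f = nullptr;
  cudaMalloc(&d_mask_f, num_count * sizeof(float));

  curandGenerator_t gen;
  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
  curandSetStream(gen, stream);
  curandGenerateUniform(gen, d_mask_f, num_count);

  // output[i] = input[i] * (mask_f[i] <= keep_prob) / keep_prob  (inverted dropout)
  DropoutForward<half>(d_input, d_mask, d_output, d_mask_f, num_count, keep_prob, stream);

  curandDestroyGenerator(gen);
  cudaFree(d_mask_f);
}
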
diff --git a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh
index bd3de6524d..f89d42ce49 100644
--- a/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh
+++ b/mindspore/ccsrc/kernel/gpu/cuda_impl/dropout_impl.cuh
@@ -18,9 +18,10 @@
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_
 
 #include "device/gpu/cuda_common.h"
 
-void DropoutForward(const float *input, float *mask, float *output, size_t num_count, float keep_prob,
+template <typename T>
+void DropoutForward(const T *input, T *mask, T *output, float *mask_f, size_t num_count, float keep_prob,
                     cudaStream_t cuda_stream);
-void DropoutBackward(const float *dy, const float *mask, float *dx, size_t num_count, float keep_prob,
-                     cudaStream_t cuda_stream);
+template <typename T>
+void DropoutBackward(const T *dy, const T *mask, T *dx, size_t num_count, float keep_prob, cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_DROPOUT_H_
diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc
index b84dc628e0..459010e9e9 100644
--- a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc
+++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.cc
@@ -15,84 +15,16 @@
  */
 
 #include "kernel/gpu/nn/dropout_gpu_kernel.h"
-#include "kernel/gpu/cuda_impl/dropout_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
-DropoutGpuFwdKernel::DropoutGpuFwdKernel()
-    : cudnn_handle_(nullptr),
-      is_null_input_(false),
-      num_count_(0),
-      keep_prob_(0.0),
-      states_init_(false),
-      mask_generator_(nullptr) {}
-
-DropoutGpuFwdKernel::~DropoutGpuFwdKernel() { DestroyResource(); }
-
-const std::vector<size_t> &DropoutGpuFwdKernel::GetInputSizeList() const { return input_size_list_; }
-
-const std::vector<size_t> &DropoutGpuFwdKernel::GetOutputSizeList() const { return output_size_list_; }
-
-const std::vector<size_t> &DropoutGpuFwdKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
-
-bool DropoutGpuFwdKernel::Init(const CNodePtr &kernel_node) {
-  InitResource();
-
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  if (input_num != 1) {
-    MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but DropoutGpuFwdKernel needs 1.";
-  }
-
-  auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-  is_null_input_ = CHECK_NULL_INPUT(input_shape);
-  if (is_null_input_) {
-    InitSizeLists();
-    return true;
-  }
-
-  num_count_ = 1;
-  for (size_t x : input_shape) {
-    num_count_ *= x;
-  }
-  keep_prob_ = GetValue<float>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("keep_prob"));
-
-  InitSizeLists();
-  return true;
-}
-
-void DropoutGpuFwdKernel::InitResource() {
-  cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-}
-
-void DropoutGpuFwdKernel::DestroyResource() noexcept {}
-
-void DropoutGpuFwdKernel::InitSizeLists() {
-  size_t input_size = num_count_ * sizeof(float);
-  input_size_list_.push_back(input_size);
-  output_size_list_.push_back(input_size);  // output size: the same with input size
-  output_size_list_.push_back(input_size);  // mask size: the same with input size
-}
-
-bool DropoutGpuFwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
-                                 const std::vector<AddressPtr> &outputs, void *stream_ptr) {
-  if (is_null_input_) {
-    return true;
-  }
-
-  auto *input = reinterpret_cast<float *>(inputs[0]->addr);
-  auto *output = reinterpret_cast<float *>(outputs[0]->addr);
-  auto *mask = reinterpret_cast<float *>(outputs[1]->addr);
-
-  if (!states_init_) {
-    curandCreateGenerator(&mask_generator_, CURAND_RNG_PSEUDO_DEFAULT);
-    curandSetPseudoRandomGeneratorSeed(mask_generator_, time(NULL));
-    states_init_ = true;
-  }
-
-  curandGenerateUniform(mask_generator_, mask, num_count_);
-  DropoutForward(input, mask, output, num_count_, keep_prob_, reinterpret_cast<cudaStream_t>(stream_ptr));
-
-  return true;
-}
+MS_REG_GPU_KERNEL_ONE(
+  Dropout,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  DropoutGpuFwdKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  Dropout,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  DropoutGpuFwdKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h
index 81eb78c880..4dfacb7ca1 100644
--- a/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_gpu_kernel.h
@@ -20,35 +20,88 @@
 #include <vector>
 #include "kernel/gpu/gpu_kernel.h"
 #include "kernel/gpu/gpu_kernel_factory.h"
+#include "kernel/gpu/cuda_impl/dropout_impl.cuh"
 #include "include/curand.h"
 
 namespace mindspore {
 namespace kernel {
+template <typename T>
 class DropoutGpuFwdKernel : public GpuKernel {
  public:
-  DropoutGpuFwdKernel();
+  DropoutGpuFwdKernel()
+      : cudnn_handle_(nullptr),
+        is_null_input_(false),
+        num_count_(0),
+        keep_prob_(0.0),
+        states_init_(false),
+        mask_generator_(nullptr) {}
 
-  ~DropoutGpuFwdKernel() override;
+  ~DropoutGpuFwdKernel() override = default;
 
-  const std::vector<size_t> &GetInputSizeList() const override;
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 
-  const std::vector<size_t> &GetOutputSizeList() const override;
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    if (is_null_input_) {
+      return true;
+    }
 
-  const std::vector<size_t> &GetWorkspaceSizeList() const override;
+    T *input = GetDeviceAddress<T>(inputs, 0);
+    T *output = GetDeviceAddress<T>(outputs, 0);
+    T *mask = GetDeviceAddress<T>(outputs, 1);
+    float *mask_f = GetDeviceAddress<float>(workspace, 0);
 
-  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
-              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
+    if (!states_init_) {
+      curandCreateGenerator(&mask_generator_, CURAND_RNG_PSEUDO_DEFAULT);
+      curandSetPseudoRandomGeneratorSeed(mask_generator_, time(NULL));
+      states_init_ = true;
+    }
+    // curandGen only supports float or double for the mask.
+    curandGenerateUniform(mask_generator_, mask_f, num_count_);
+    DropoutForward(input, mask, output, mask_f, num_count_, keep_prob_, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+    return true;
+  }
+
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 1) {
+      MS_LOG(EXCEPTION) << "Argument number is " << input_num << ", but DropoutGpuFwdKernel needs 1.";
+    }
 
-  bool Init(const CNodePtr &kernel_node) override;
+    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    is_null_input_ = CHECK_NULL_INPUT(input_shape);
+    if (is_null_input_) {
+      InitSizeLists();
+      return true;
+    }
+
+    num_count_ = 1;
+    for (size_t x : input_shape) {
+      num_count_ *= x;
+    }
+    keep_prob_ = GetAttr<float>(kernel_node, "keep_prob");
+
+    InitSizeLists();
+    return true;
+  }
 
 protected:
-  void InitResource() override;
+  void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); }
 
-  void InitSizeLists() override;
+  void InitSizeLists() override {
+    size_t input_size = num_count_ * sizeof(T);
+    input_size_list_.push_back(input_size);
+    output_size_list_.push_back(input_size);  // output size: the same as input size
+    output_size_list_.push_back(input_size);  // mask size: the same as input size
+    workspace_size_list_.push_back(num_count_ * sizeof(float));  // temp mask_f for curandGen
+  }
 
  private:
-  void DestroyResource() noexcept;
-
   cudnnHandle_t cudnn_handle_;
   bool is_null_input_;
   size_t num_count_;
@@ -59,8 +112,6 @@ class DropoutGpuFwdKernel : public GpuKernel {
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
 };
-
-MS_REG_GPU_KERNEL(Dropout, DropoutGpuFwdKernel)
 }  // namespace kernel
 }  // namespace mindspore
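
For concreteness, the byte sizes InitSizeLists() above reports for a hypothetical [32, 128] float16 input (the shape is invented for illustration; input, output, and mask all share input_size, while the mask_f workspace is always fp32):

#include <cstddef>
#include <cstdio>
#include <cuda_fp16.h>

int main() {
  // num_count_ = 32 * 128 = 4096 elements, with T = half.
  size_t num_count = 32 * 128;
  size_t input_size = num_count * sizeof(half);    // 4096 * 2 = 8192 bytes
  size_t mask_f_size = num_count * sizeof(float);  // 4096 * 4 = 16384 bytes
  printf("input/output/mask: %zu bytes each; mask_f workspace: %zu bytes\n", input_size, mask_f_size);
  return 0;
}
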
diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc
index 2194805e92..2fd21c96ee 100644
--- a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc
+++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.cc
@@ -15,76 +15,16 @@
  */
 
 #include "kernel/gpu/nn/dropout_grad_kernel.h"
-#include "kernel/gpu/cuda_impl/dropout_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
-DropoutGradGpuFwdKernel::DropoutGradGpuFwdKernel()
-    : cudnn_handle_(nullptr), is_null_input_(false), num_count_(0), keep_prob_(0.0) {}
-
-DropoutGradGpuFwdKernel::~DropoutGradGpuFwdKernel() { DestroyResource(); }
-
-const std::vector<size_t> &DropoutGradGpuFwdKernel::GetInputSizeList() const { return input_size_list_; }
-
-const std::vector<size_t> &DropoutGradGpuFwdKernel::GetOutputSizeList() const { return output_size_list_; }
-
-const std::vector<size_t> &DropoutGradGpuFwdKernel::GetWorkspaceSizeList() const { return workspace_size_list_; }
-
-bool DropoutGradGpuFwdKernel::Init(const CNodePtr &kernel_node) {
-  InitResource();
-
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  if (input_num != 2) {
-    MS_LOG(ERROR) << "Argument number is " << input_num << ", but DropoutGradGpuFwdKernel needs 2.";
-    return false;
-  }
-
-  auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
-  is_null_input_ = CHECK_NULL_INPUT(input_shape);
-  if (is_null_input_) {
-    InitSizeLists();
-    return true;
-  }
-
-  num_count_ = 1;
-  for (size_t x : input_shape) {
-    num_count_ *= x;
-  }
-  keep_prob_ = GetValue<float>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("keep_prob"));
-
-  InitSizeLists();
-  return true;
-}
-
-void DropoutGradGpuFwdKernel::InitResource() {
-  cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
-}
-
-void DropoutGradGpuFwdKernel::DestroyResource() noexcept {}
-
-void DropoutGradGpuFwdKernel::InitSizeLists() {
-  size_t dy_size = num_count_ * sizeof(float);
-  size_t mask_size = dy_size;
-  size_t dx_size = dy_size;
-
-  input_size_list_.push_back(dy_size);
-  input_size_list_.push_back(mask_size);
-  output_size_list_.push_back(dx_size);
-}
-
-bool DropoutGradGpuFwdKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
-                                     const std::vector<AddressPtr> &outputs, void *stream_ptr) {
-  if (is_null_input_) {
-    return true;
-  }
-
-  auto *dy = reinterpret_cast<float *>(inputs[0]->addr);
-  auto *mask = reinterpret_cast<float *>(inputs[1]->addr);
-  auto *dx = reinterpret_cast<float *>(outputs[0]->addr);
-
-  DropoutBackward(dy, mask, dx, num_count_, keep_prob_, reinterpret_cast<cudaStream_t>(stream_ptr));
-
-  return true;
-}
+MS_REG_GPU_KERNEL_ONE(
+  DropoutGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  DropoutGradGpuBwdKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  DropoutGrad,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  DropoutGradGpuBwdKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h
index 4991b9dad5..e6683e15dd 100644
--- a/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/nn/dropout_grad_kernel.h
@@ -20,28 +20,72 @@
 #include <vector>
 #include "kernel/gpu/gpu_kernel.h"
 #include "kernel/gpu/gpu_kernel_factory.h"
+#include "kernel/gpu/cuda_impl/dropout_impl.cuh"
 
 namespace mindspore {
 namespace kernel {
-class DropoutGradGpuFwdKernel : public GpuKernel {
+template <typename T>
+class DropoutGradGpuBwdKernel : public GpuKernel {
  public:
-  DropoutGradGpuFwdKernel();
-  ~DropoutGradGpuFwdKernel() override;
+  DropoutGradGpuBwdKernel() : cudnn_handle_(nullptr), is_null_input_(false), num_count_(0), keep_prob_(0.0) {}
+  ~DropoutGradGpuBwdKernel() override = default;
 
-  const std::vector<size_t> &GetInputSizeList() const override;
-  const std::vector<size_t> &GetOutputSizeList() const override;
-  const std::vector<size_t> &GetWorkspaceSizeList() const override;
+  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
+  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
+  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
 
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
-              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;
-  bool Init(const CNodePtr &kernel_node) override;
+              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    if (is_null_input_) {
+      return true;
+    }
+
+    T *dy = GetDeviceAddress<T>(inputs, 0);
+    T *mask = GetDeviceAddress<T>(inputs, 1);
+    T *dx = GetDeviceAddress<T>(outputs, 0);
+
+    DropoutBackward(dy, mask, dx, num_count_, keep_prob_, reinterpret_cast<cudaStream_t>(stream_ptr));
+
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 2) {
+      MS_LOG(ERROR) << "Argument number is " << input_num << ", but DropoutGradGpuBwdKernel needs 2.";
+      return false;
+    }
+
+    auto input_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    is_null_input_ = CHECK_NULL_INPUT(input_shape);
+    if (is_null_input_) {
+      InitSizeLists();
+      return true;
+    }
+
+    num_count_ = 1;
+    for (size_t x : input_shape) {
+      num_count_ *= x;
+    }
+    keep_prob_ = GetAttr<float>(kernel_node, "keep_prob");
+
+    InitSizeLists();
+    return true;
+  }

 protected:
-  void InitResource() override;
-  void InitSizeLists() override;
+  void InitResource() override { cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle(); }
+  void InitSizeLists() override {
+    size_t dy_size = num_count_ * sizeof(T);
+    size_t mask_size = dy_size;
+    size_t dx_size = dy_size;
 
- private:
-  void DestroyResource() noexcept;
+    input_size_list_.push_back(dy_size);
+    input_size_list_.push_back(mask_size);
+    output_size_list_.push_back(dx_size);
+  }
 
+ private:
   cudnnHandle_t cudnn_handle_;
   bool is_null_input_;
   size_t num_count_;
@@ -50,8 +94,6 @@ class DropoutGradGpuFwdKernel : public GpuKernel {
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
 };
-
-MS_REG_GPU_KERNEL(DropoutGrad, DropoutGradGpuFwdKernel)
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ops/operations/nn_ops.py b/mindspore/ops/operations/nn_ops.py
index 24b081bdad..10c61285d1 100644
--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -4117,6 +4117,7 @@ class Dropout(PrimitiveWithInfer):
 
     def infer_dtype(self, x_dtype):
         valid_types = (mstype.float16, mstype.float32)
+        validator.check_subclass("x", x_dtype, mstype.tensor, self.name)
         validator.check_tensor_type_same({"x_dtype": x_dtype}, valid_types, self.name)
         return x_dtype, x_dtype
 
@@ -4151,6 +4152,8 @@ class DropoutGrad(PrimitiveWithInfer):
 
     def infer_dtype(self, dy_dtype, mask_dtype):
         valid_types = (mstype.float16, mstype.float32)
+        validator.check_subclass("dy", dy_dtype, mstype.tensor, self.name)
+        validator.check_subclass("mask", mask_dtype, mstype.tensor, self.name)
         validator.check_tensor_type_same({"dy_dtype": dy_dtype}, valid_types, self.name)
         return dy_dtype
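
Finally, a CPU reference for the relation DropoutBackwardKernel computes, dx[i] = dy[i] * mask[i] / keep_prob; a float-only sketch that may be useful as a test oracle for the new fp16 path (names and values are illustrative):

#include <cstddef>
#include <cstdio>

// Mirrors DropoutBackwardKernel above: dx[i] = scale * dy[i] * mask[i],
// where scale = 1 / keep_prob (inverted dropout keeps E[dx] unbiased).
static void DropoutBackwardRef(const float *dy, const float *mask, float *dx,
                               size_t num_count, float keep_prob) {
  float scale = 1.0f / keep_prob;
  for (size_t i = 0; i < num_count; ++i) {
    dx[i] = scale * dy[i] * mask[i];
  }
}

int main() {
  const float dy[4] = {1.f, 2.f, 3.f, 4.f};
  const float mask[4] = {1.f, 0.f, 1.f, 0.f};  // kept, dropped, kept, dropped
  float dx[4];
  DropoutBackwardRef(dy, mask, dx, 4, 0.5f);   // keep_prob = 0.5 -> scale = 2
  printf("%g %g %g %g\n", dx[0], dx[1], dx[2], dx[3]);  // prints: 2 0 6 0
  return 0;
}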