@@ -37,9 +37,9 @@ class PoolingGradGpuKernel : public GpuKernel {
        dy_descriptor_(nullptr),
        x_descriptor_(nullptr),
        dx_descriptor_(nullptr),
        padded_descriptor_(nullptr),
        pooling_mode_(CUDNN_POOLING_MAX),
        cudnn_data_type_(CUDNN_DATA_FLOAT),
        compute_format_(CUDNN_TENSOR_NCHW),
        old_height_(0),
        old_width_(0),
        pad_height_(0),
@@ -52,9 +52,7 @@ class PoolingGradGpuKernel : public GpuKernel {
        is_null_input_(false),
        input_size_(0),
        output_size_(0),
        padded_size_(0),
        workspace_size_(0),
        use_pad_(true) {}
        workspace_size_(0) {}
  ~PoolingGradGpuKernel() override { DestroyResource(); }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -72,27 +70,10 @@ class PoolingGradGpuKernel : public GpuKernel {

    const float alpha = 1;
    const float beta = 0;
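    // SAME padding with asymmetric pads cannot be expressed through the cuDNN pooling
    // descriptor, so the input is explicitly padded first (CalPad), pooling backward runs
    // on the padded tensors, and the gradient is cropped back afterwards (CalPadGrad).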
    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_) {
      T *padded = GetDeviceAddress<T>(workspace, 0);
      T *padded_dx = GetDeviceAddress<T>(workspace, 1);

      CalPad(padded_size_ / sizeof(T), x_data, n_, c_, old_height_, old_width_, old_height_ + pad_height_,
             old_width_ + pad_width_, pad_top_, pad_left_, pad_value_, padded,
             reinterpret_cast<cudaStream_t>(stream_ptr));

      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy,
                             padded_descriptor_, padded, &beta, padded_descriptor_, padded_dx),
        "cudnnPoolingBackward failed");

      CalPadGrad(output_size_ / sizeof(T), padded_dx, n_, c_, old_height_, old_width_, old_height_ + pad_height_,
                 old_width_ + pad_width_, pad_top_, pad_left_, dx, reinterpret_cast<cudaStream_t>(stream_ptr));
    } else {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy,
                             x_descriptor_, x_data, &beta, dx_descriptor_, dx),
        "cudnnPoolingBackward failed");
    }
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnPoolingBackward(cudnn_handle_, pooling_descriptor_, &alpha, y_descriptor_, y, dy_descriptor_, dy,
                           x_descriptor_, x_data, &beta, dx_descriptor_, dx),
      "cudnnPoolingBackward failed");
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
@@ -104,46 +85,73 @@ class PoolingGradGpuKernel : public GpuKernel {
    int window_height = window[2];
    int window_width = window[3];
    SetPoolingMode(kernel_node);
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto input_mask = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    auto input_mask = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
    auto dout_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
    auto output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
    data_format_ = AnfAlgo::GetInputFormat(kernel_node, 0);
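    // Shapes and layout are queried from the device (GetInputDeviceShape/GetInputFormat),
    // so that NHWC tensors can be described in their actual device layout instead of
    // being assumed to be NCHW.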
    is_null_input_ = CHECK_NULL_INPUT(input_shape) || CHECK_NULL_INPUT(input_mask);
    if (is_null_input_) {
      MS_LOG(WARNING) << "PoolingGradGpuKernel input is null.";
      InitSizeLists();
      return true;
    }
    SetNCHW(input_shape, &n_, &c_, &old_height_, &old_width_, data_format_);
    int windowDimA[2] = {window_height, window_width};
    int paddingA[2] = {0, 0};
    int strideA[2] = {stride_[2], stride_[3]};
    int nbDims = 4;
    int dimA[4];
    int strideAin[4];
    int dimAy[4];
    int strideAiny[4];
    int dimAdy[4];
    int strideAdy[4];
    int dimAout[4];
    int strideAout[4];
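    // The dimA*/strideA* arrays hold each tensor's dimensions and element strides in the
    // order implied by data_format_; SetDimA/SetStrideA fill them below before they are
    // handed to cudnnSetTensorNdDescriptor.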
    SetDimA(input_shape, dimA, data_format_);
    SetStrideA(input_shape, strideAin, data_format_);
    SetDimA(input_mask, dimAy, data_format_);
    SetStrideA(input_mask, strideAiny, data_format_);
    SetDimA(dout_shape, dimAdy, data_format_);
    SetStrideA(dout_shape, strideAdy, data_format_);
    SetDimA(output_shape, dimAout, data_format_);
    SetStrideA(output_shape, strideAout, data_format_);
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny),
                                "cudnnSetTensorNdDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
                                "cudnnSetTensorNdDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetTensor4dDescriptor(y_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(input_mask[0]),
                                 SizeToInt(input_mask[1]), SizeToInt(input_mask[2]), SizeToInt(input_mask[3])),
      "cudnnSetTensor4dDescriptor");

    auto dout_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetTensor4dDescriptor(dy_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(dout_shape[0]),
                                 SizeToInt(dout_shape[1]), SizeToInt(dout_shape[2]), SizeToInt(dout_shape[3])),
      "cudnnSetTensor4dDescriptor");

    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetTensor4dDescriptor(dx_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(output_shape[0]),
                                 SizeToInt(output_shape[1]), SizeToInt(output_shape[2]), SizeToInt(output_shape[3])),
      "cudnnSetTensor4dDescriptor");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetTensorNdDescriptor(dx_descriptor_, cudnn_data_type_, nbDims, dimAout, strideAout),
      "cudnnSetTensorNdDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(x_descriptor_, cudnn_data_type_, nbDims, dimA, strideAin),
                                "cudnnSetTensorNdDescriptor failed");
    if (kSamePadModeUpperCase == pad_mode_ || kSamePadModeLowerCase == pad_mode_) {
      SetPad(input_shape, window_height, window_width);
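      // SAME padding: total pad per axis is max(0, (ceil(in / stride) - 1) * stride + window - in),
      // computed below with integer arithmetic and split evenly between the two sides.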
      pad_height_ =
        std::max<int>(0, (((old_height_ / stride_[2]) * stride_[2] == old_height_ ? (old_height_ / stride_[2])
                                                                                  : (old_height_ / stride_[2]) + 1) -
                          1) *
                           stride_[2] +
                         window_height - old_height_);
      pad_width_ =
        std::max<int>(0, (((old_width_ / stride_[3]) * stride_[3] == old_width_ ? (old_width_ / stride_[3])
                                                                                : (old_width_ / stride_[3]) + 1) -
                          1) *
                           stride_[3] +
                         window_width - old_width_);
      pad_top_ = pad_height_ / 2;
      pad_left_ = pad_width_ / 2;
      paddingA[0] = pad_top_;
      paddingA[1] = pad_left_;
    } else {
      if (pad_mode_ == kValidPadModeUpperCase || pad_mode_ == kValidPadModeLowerCase) {
        pad_height_ = 0;
        pad_width_ = 0;
      }
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetPooling2dDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN, window_height,
                                    window_width, pad_height_, pad_width_, stride_[2], stride_[3]),
        "cudnnSetPooling2dDescriptor failed");
      CHECK_CUDNN_RET_WITH_EXCEPT(
        cudnnSetTensor4dDescriptor(x_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(input_shape[0]),
                                   SizeToInt(input_shape[1]), SizeToInt(input_shape[2]), SizeToInt(input_shape[3])),
        "cudnnSetTensor4dDescriptor");
    }
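    // The Nd pooling descriptor is set for both branches: windowDimA/strideA give the 2-D
    // window and strides, and paddingA is non-zero only when the SAME branch above ran.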
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetPoolingNdDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
                                                            2, windowDimA, paddingA, strideA),
                                "cudnnSetPoolingNdDescriptor failed");
    InitSizeLists();
    return true;
  }
@@ -155,7 +163,6 @@ class PoolingGradGpuKernel : public GpuKernel {
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dy_descriptor_), "cudnnCreateTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&x_descriptor_), "cudnnCreateTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&dx_descriptor_), "cudnnCreateTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateTensorDescriptor(&padded_descriptor_), "cudnnCreateTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreatePoolingDescriptor(&pooling_descriptor_),
                                "cudnnCreatePoolingDescriptor failed");
  }
@@ -179,16 +186,6 @@ class PoolingGradGpuKernel : public GpuKernel {
                                "cudnnGetTensorSizeInBytes failed");
    }
    input_size_list_.push_back(input_size_);

    if ((pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) && use_pad_ && !is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(cudnnGetTensorSizeInBytes(padded_descriptor_, &padded_size_),
                                  "cudnnGetTensorSizeInBytes failed");
      if (padded_size_ == 0) {
        MS_LOG(EXCEPTION) << "Padded size is 0.";
      }
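      // Two equally sized buffers: workspace 0 holds the padded input and workspace 1 the
      // padded dx; Launch fetches them via GetDeviceAddress<T>(workspace, 0) and (workspace, 1).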
      workspace_size_list_.push_back(padded_size_);
      workspace_size_list_.push_back(padded_size_);
    }
    return;
  }
@@ -206,35 +203,6 @@ class PoolingGradGpuKernel : public GpuKernel {
    c_ = SizeToInt(input_shape[1]);
    old_height_ = SizeToInt(input_shape[2]);
    old_width_ = SizeToInt(input_shape[3]);
    pad_height_ =
      std::max<int>(0, (((old_height_ / stride_[2]) * stride_[2] == old_height_ ? (old_height_ / stride_[2])
                                                                                : (old_height_ / stride_[2]) + 1) -
                        1) *
                         stride_[2] +
                       window_height - old_height_);
    pad_width_ =
      std::max<int>(0, (((old_width_ / stride_[3]) * stride_[3] == old_width_ ? (old_width_ / stride_[3])
                                                                              : (old_width_ / stride_[3]) + 1) -
                        1) *
                         stride_[3] +
                       window_width - old_width_);
    pad_top_ = pad_height_ / 2;
    pad_left_ = pad_width_ / 2;
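    // When both total pads are even the padding is symmetric, so cuDNN's own pad fields can
    // express it and the explicit pad/crop path is skipped (use_pad_ = false).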
    if (pad_height_ % 2 == 0 && pad_width_ % 2 == 0) {
      use_pad_ = false;
    }
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensor4dDescriptor(padded_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, n_,
                                                           c_, old_height_ + pad_height_, old_width_ + pad_width_),
                                "cudnnSetTensor4dDescriptor failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      cudnnSetTensor4dDescriptor(x_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_, SizeToInt(input_shape[0]),
                                 SizeToInt(input_shape[1]), SizeToInt(input_shape[2]) + (use_pad_ ? pad_height_ : 0),
                                 SizeToInt(input_shape[3]) + (use_pad_ ? pad_width_ : 0)),
      "cudnnSetTensor4dDescriptor");
    CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetPooling2dDescriptor(pooling_descriptor_, pooling_mode_, CUDNN_NOT_PROPAGATE_NAN,
                                                            window_height, window_width, use_pad_ ? 0 : pad_top_,
                                                            use_pad_ ? 0 : pad_left_, stride_[2], stride_[3]),
                                "cudnnSetPooling2dDescriptor failed");
  }
  void SetPoolingMode(const CNodePtr &kernel_node) {
    pad_mode_ = GetAttr<std::string>(kernel_node, "padding");
@@ -252,7 +220,6 @@ class PoolingGradGpuKernel : public GpuKernel {
  void DestroyResource() noexcept {
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
                               "cudnnDestroyPoolingDescriptor failed");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(padded_descriptor_), "cudnnDestroyTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dx_descriptor_), "cudnnDestroyTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(x_descriptor_), "cudnnDestroyTensorDescriptor failed");
    CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyTensorDescriptor(dy_descriptor_), "cudnnDestroyTensorDescriptor failed");
@@ -265,7 +232,6 @@ class PoolingGradGpuKernel : public GpuKernel {
  cudnnTensorDescriptor_t dy_descriptor_;
  cudnnTensorDescriptor_t x_descriptor_;
  cudnnTensorDescriptor_t dx_descriptor_;
  cudnnTensorDescriptor_t padded_descriptor_;
  cudnnPoolingMode_t pooling_mode_ = CUDNN_POOLING_MAX;
  std::vector<int> stride_;
  std::vector<size_t> input_size_list_;
@@ -273,7 +239,9 @@ class PoolingGradGpuKernel : public GpuKernel {
  std::vector<size_t> workspace_size_list_;
  std::string mode_;
  std::string pad_mode_;
  std::string data_format_ = "NCHW";
  cudnnDataType_t cudnn_data_type_;
  cudnnTensorFormat_t compute_format_;
  int old_height_;
  int old_width_;
  int pad_height_;
@@ -286,9 +254,7 @@ class PoolingGradGpuKernel : public GpuKernel {
  bool is_null_input_;
  size_t input_size_;
  size_t output_size_;
  size_t padded_size_;
  size_t workspace_size_;
  bool use_pad_;
};
}  // namespace kernel
}  // namespace mindspore
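For reference, a minimal sketch of the cuDNN call sequence this kernel wraps, assuming the tensor and pooling descriptors are already configured; RunPoolingBackward is a hypothetical helper, not part of the file above:

#include <cudnn.h>

// Computes dx = alpha * pooling_grad(y, dy, x) + beta * dx for float tensors.
inline cudnnStatus_t RunPoolingBackward(cudnnHandle_t handle, cudnnPoolingDescriptor_t pool_desc,
                                        cudnnTensorDescriptor_t y_desc, const void *y,
                                        cudnnTensorDescriptor_t dy_desc, const void *dy,
                                        cudnnTensorDescriptor_t x_desc, const void *x,
                                        cudnnTensorDescriptor_t dx_desc, void *dx) {
  const float alpha = 1.0f;  // scale for the freshly computed gradient
  const float beta = 0.0f;   // 0 discards any prior contents of dx
  return cudnnPoolingBackward(handle, pool_desc, &alpha, y_desc, y, dy_desc, dy, x_desc, x,
                              &beta, dx_desc, dx);
}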