Reworked mirrorPad

Lint fixes: convert long -> int64; correct int64 -> int64_t.
4 years ago · 0f69be06b1
parent 70bb0a842a
commit 0f69be06b1
9 changed files with 342 additions and 126 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/mirror_pad_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/mirror_pad_impl.cu
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/mirror_pad_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/mirror_pad_impl.cuh
@ -19,13 +19,28 @@
 #include <cuda_runtime.h>
 #include "runtime/device/gpu/cuda_common.h"

+// preset size of paddings (presumably MAX_PADDINGS dims x PADDING_SIZE entries each; confirm against the kernels)
+#define MAX_PADDINGS 4
+#define PADDING_SIZE 2
+
+// kernel indexing constants, parenthesized so they expand safely inside any expression: offset = dim * PADDING_SIZE
+#define BATCH (0 * PADDING_SIZE)
+#define CHANNEL (1 * PADDING_SIZE)
+#define HEIGHT (2 * PADDING_SIZE)
+#define WIDTH (3 * PADDING_SIZE)
+#define TOP 0
+#define BOTTOM 1
+#define LEFT 0
+#define RIGHT 1
+
 template <typename T>
-void CalMirrorPad(const size_t size, const T *input, const int num, const int channels, const int old_height,
+void CalMirrorPad(const size_t size, const T *input, const int old_batch, const int old_channel, const int old_height,
                  const int old_width, const int padded_height, const int padded_width, int padd_num,
-                  const int *paddings, int mode, T *output, cudaStream_t cuda_stream);
+                  const int64_t *paddings, int mode, T *output, cudaStream_t cuda_stream);
 template <typename T>
-void CalMirrorPadGrad(const size_t size, const T *dy, const int num, const int channels, const int padded_height,
-                      const int padded_width, const int old_height, const int old_width, const int padd_dim,
-                      const int *paddings, int mode, T *dx, cudaStream_t cuda_stream);
+void CalMirrorPadGrad(const size_t dx_size, const size_t dy_size, T *dy, T *interim, const int output_batch,
+                      const int output_channel, const int output_height, const int output_width, const int input_height,
+                      const int input_width, const int padd_dim, const int64_t *paddings, int mode, T *dx,
+                      cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MIRROR_PAD_IMPL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/pad_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/pad_impl.cu
@ -206,3 +206,8 @@ template void CalPadGeneral<half>(const size_t size, const half *input, const in
                                  const int old_width, const int padded_height, const int padded_width,
                                  const int pad_top, const int pad_left, float pad_value, half *output,
                                  cudaStream_t cuda_stream);
+template void CalPadGeneral<int>(const size_t size, const int *input, const int num, const int channels_orig,
+                                  const int pad_channel_before, const int pad_channel_after, const int old_height,
+                                  const int old_width, const int padded_height, const int padded_width,
+                                  const int pad_top, const int pad_left, float pad_value, int *output,
+                                  cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_gpu_kernel.cc
@ -26,5 +26,8 @@ MS_REG_GPU_KERNEL_ONE(
  MirrorPad,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
  MirrorPadGpuFwdKernel, half)
+MS_REG_GPU_KERNEL_ONE(
+  MirrorPad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
+  MirrorPadGpuFwdKernel, int)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_gpu_kernel.h
@ -40,7 +40,7 @@ class MirrorPadGpuFwdKernel : public GpuKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *input = GetDeviceAddress<T>(inputs, 0);
-    int *paddings = GetDeviceAddress<int>(inputs, 1);
+    int64_t *paddings = GetDeviceAddress<int64_t>(inputs, 1);
    T *output = GetDeviceAddress<T>(outputs, 0);

    size_t size = output_size_ / sizeof(T);
@ -58,13 +58,11 @@ class MirrorPadGpuFwdKernel : public GpuKernel {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but MirrorPad needs 2 input.";
      return false;
    }
-    // check number of output -> should be 1
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but Pad needs 1 output.";
      return false;
    }
-
    string mode = GetValue<string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("mode"));
    if (mode == "REFLECT") {
      mode_ = 0;  // reflected mirroring
@ -89,10 +87,9 @@ class MirrorPadGpuFwdKernel : public GpuKernel {
    }
    num_input_ = input_size_;
    input_size_ *= sizeof(T);
-
    auto padding_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    num_paddings_ = padding_shape[0];
-    input_size_ += 2 * num_paddings_ * sizeof(int);
+    input_size_ += 2 * num_paddings_ * sizeof(int64_t);

    output_size_ = sizeof(T);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
@ -103,7 +100,6 @@ class MirrorPadGpuFwdKernel : public GpuKernel {

    int max_width = input_shape_[3];
    int max_height = input_shape_[2];
-
    // basic error check for padding value
    if (mode_ == 1) {  // symmetric
      max_width = max_width + (2 * max_width);
@ -112,13 +108,11 @@ class MirrorPadGpuFwdKernel : public GpuKernel {
      max_width = max_width + (2 * (max_width - 1));
      max_height = max_height + (2 * (max_height - 1));
    }
-
    // The last two output dims are (height, width); compare each against its own limit.
    if (output_shape_[(output_shape_.size() - 2) + 0] > max_height ||
        output_shape_[(output_shape_.size() - 2) + 1] > max_width) {
      MS_LOG(ERROR) << "ERROR: Padding value too high for input Tensor on 1 or more dims";
      return false;
    }
-
    InitSizeLists();
    return true;
  }
@ -126,7 +120,7 @@ class MirrorPadGpuFwdKernel : public GpuKernel {
 protected:
  // Registers the byte sizes of the kernel's buffers: two inputs (data, paddings) and one output.
  void InitSizeLists() override {
    input_size_list_.push_back(num_input_ * sizeof(T));  // data tensor (num_input_ is presumably an element count)
-    input_size_list_.push_back(2 * num_paddings_ * sizeof(int));
+    input_size_list_.push_back(2 * num_paddings_ * sizeof(int64_t));  // paddings: 2 int64 values per paddings row
    output_size_list_.push_back(output_size_);  // output_size_ is already expressed in bytes
  }

--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_grad_gpu_kernel.cc
@ -26,5 +26,9 @@ MS_REG_GPU_KERNEL_ONE(
  MirrorPadGrad,
  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
  MirrorPadGpuBackKernel, half)
+MS_REG_GPU_KERNEL_ONE(
+  MirrorPadGrad,
+  KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
+  MirrorPadGpuBackKernel, int)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/mirror_pad_grad_gpu_kernel.h
@ -40,15 +40,15 @@ class MirrorPadGpuBackKernel : public GpuKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *input = GetDeviceAddress<T>(inputs, 0);
-    int *paddings = GetDeviceAddress<int>(inputs, 1);
+    int64_t *paddings = GetDeviceAddress<int64_t>(inputs, 1);
+    T *interim = GetDeviceAddress<T>(workspace, 0);
    T *output = GetDeviceAddress<T>(outputs, 0);

-    size_t size = output_size_ / sizeof(T);
-    int dim_offset = output_shape_.size() - 2;
-
-    CalMirrorPadGrad(size, input, input_shape_[0], input_shape_[1], input_shape_[2], input_shape_[3],
-                     output_shape_[dim_offset + 0], output_shape_[dim_offset + 1], num_paddings_, paddings, mode_,
-                     output, reinterpret_cast<cudaStream_t>(stream_ptr));
+    size_t dx_size = output_size_ / sizeof(T);
+    size_t interim_dy_size = workspace_size_ / sizeof(T);
+    CalMirrorPadGrad(dx_size, interim_dy_size, input, interim, output_shape_[0], output_shape_[1], output_shape_[2],
+                     output_shape_[3], input_shape_[2], input_shape_[3], num_paddings_, paddings, mode_, output,
+                     reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

@ -58,13 +58,11 @@ class MirrorPadGpuBackKernel : public GpuKernel {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but MirrorPadGrad needs 2 input.";
      return false;
    }
-    // check number of output -> should be 1
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but MirrorPadGrad needs 1 output.";
      return false;
    }
-
    string mode = GetValue<string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("mode"));
    if (mode == "REFLECT") {
      mode_ = 0;  // reflected mirroring
@ -82,28 +80,43 @@ class MirrorPadGpuBackKernel : public GpuKernel {
      auto it = input_shape.begin();
      input_shape.insert(it, 2, 1);  // channel padding
    }
-
+    input_size_ = sizeof(T);
    for (auto in_shape : input_shape) {
      input_size_ *= in_shape;
      input_shape_.push_back(in_shape);
    }
    num_input_ = input_size_;
-    input_size_ *= sizeof(T);

+    // account for paddings in input size -> passed as int64_ts
    auto padding_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    num_paddings_ = padding_shape[0];
-    input_size_ += +(2 * num_paddings_ * sizeof(int));
+    input_size_ += (2 * num_paddings_ * sizeof(int64_t));

-    output_size_ = sizeof(T);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
+    if (output_shape.size() == 4) {
+    } else if (output_shape.size() == 3) {
+      auto it = output_shape.begin();
+      output_shape.insert(it, 1);  // batch padding
+    } else if (output_shape.size() == 2) {
+      auto it = output_shape.begin();
+      output_shape.insert(it, 2, 1);  // channel padding
+    }
+    output_size_ = sizeof(T);
    for (auto x : output_shape) {
      output_size_ *= x;
      output_shape_.push_back(x);
    }

+    // calc workspace size
+    // store dy values with accumulation across batch and channel only
+    workspace_size_ = sizeof(T);
+    for (int i = 0; i < 2; i++) {
+      workspace_size_ *= output_shape[i];     // BATCH, CHANNEL -> Output size
+      workspace_size_ *= input_shape[i + 2];  // WIDTH, HEIGHT -> Input Size
+    }
+
    int max_width = input_shape_[3];
    int max_height = input_shape_[2];
-
    // basic error check for padding value
    if (mode_ == 1) {  // symmetric
      max_width = max_width + (2 * max_width);
@ -112,13 +125,11 @@ class MirrorPadGpuBackKernel : public GpuKernel {
      max_width = max_width + (2 * (max_width - 1));
      max_height = max_height + (2 * (max_height - 1));
    }
-
    // The last two output dims are (height, width); compare each against its own limit.
    // NOTE(review): in this grad kernel input_shape_ appears to describe dy (the padded tensor) and output_shape_ dx,
    // so a bound derived from input_shape_ looks like it can never trip - confirm whether the roles should be swapped.
    if (output_shape_[(output_shape_.size() - 2) + 0] > max_height ||
        output_shape_[(output_shape_.size() - 2) + 1] > max_width) {
      MS_LOG(ERROR) << "ERROR: Padding value too high for input Tensor on 1 or more DIMS";
      return false;
    }
-
    InitSizeLists();
    return true;
  }
@ -126,7 +137,8 @@ class MirrorPadGpuBackKernel : public GpuKernel {
 protected:
  // Registers the byte sizes of the kernel's buffers: two inputs (dy, paddings), one workspace and one output.
  void InitSizeLists() override {
    // The initialisation code seeds input_size_ with sizeof(T) before multiplying in the dims and then copies it
    // into num_input_, so num_input_ is already a byte count and must not be scaled by sizeof(T) a second time.
    input_size_list_.push_back(num_input_);
-    input_size_list_.push_back(2 * num_paddings_ * sizeof(int));
+    input_size_list_.push_back(2 * num_paddings_ * sizeof(int64_t));  // for 64 bit int defined in API
+    workspace_size_list_.push_back(workspace_size_);
    output_size_list_.push_back(output_size_);
  }

@ -134,9 +146,8 @@ class MirrorPadGpuBackKernel : public GpuKernel {
  size_t num_input_;
  int num_paddings_;
  int mode_;
-  std::vector<int> input_shape_;   // dims of the input data
-  std::vector<int> output_shape_;  // dims of the output data
-  // default
+  std::vector<int> input_shape_;
+  std::vector<int> output_shape_;
  size_t input_size_;
  size_t output_size_;
  size_t workspace_size_;
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/pad_gpu_kernel.cc
@ -22,5 +22,7 @@ MS_REG_GPU_KERNEL_ONE(Pad, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutp
                      PadGpuFwdKernel, float)
 MS_REG_GPU_KERNEL_ONE(Pad, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
                      PadGpuFwdKernel, half)
+MS_REG_GPU_KERNEL_ONE(Pad, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), PadGpuFwdKernel,
+                      int)
 }  // namespace kernel
 }  // namespace mindspore
--- a/tests/st/ops/gpu/test_mirror_pad.py
+++ b/tests/st/ops/gpu/test_mirror_pad.py
@ -64,9 +64,9 @@ class Grad(nn.Cell):
        return self.grad(self.network)(input_, output_grad)

 class Net(nn.Cell):
-    def __init__(self):
+    def __init__(self, pads, mode_):
        super(Net, self).__init__()
-        self.pad = nn.Pad(mode="REFLECT", paddings=((0, 0), (0, 0), (1, 0), (0, 2)))
+        self.pad = nn.Pad(mode=mode_, paddings=pads)
    def construct(self, x):
        return self.pad(x)

@ -82,7 +82,88 @@ def test_mirror_pad_backprop():
    expected_dx = np.array([[[[0.2, 0.2, 0.1],
                              [0.4, 0.4, 0.2],
                              [0.2, 0.2, 0.1]]]])
-    net = Grad(Net())
+    net = Grad(Net(((0, 0), (0, 0), (1, 0), (0, 2)), "REFLECT"))
    dx = net(test_arr_in, Tensor(dy))
    dx = dx[0].asnumpy()
    np.testing.assert_array_almost_equal(dx, expected_dx)
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mirror_pad_fwd_back_4d_int32_reflect():
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    # set constants
+    shape = (2, 3, 3, 5)
+    pads = ((1, 0), (2, 0), (1, 2), (3, 4))
+    total_val = np.prod(shape)
+    test_arr_np = np.arange(total_val).reshape(shape) + 1
+    test_arr_ms = Tensor(test_arr_np, dtype=mindspore.int32)
+    # fwd_pass_check
+    op = nn.Pad(mode="REFLECT", paddings=pads)
+    expected_np_result = np.pad(test_arr_np, pads, 'reflect')
+    obtained_ms_res = op(test_arr_ms).asnumpy()
+    np.testing.assert_array_equal(expected_np_result, obtained_ms_res)
+    # backwards pass check
+    GradNet = Grad(Net(pads, "REFLECT"))
+    dy_value = Tensor(np.ones(obtained_ms_res.shape), dtype=mindspore.int32)
+    dx_value_obtained = GradNet(test_arr_ms, dy_value)[0].asnumpy()
+    dx_value_expected = np.array([[[[4, 6, 6, 6, 2],
+                                    [6, 9, 9, 9, 3],
+                                    [2, 3, 3, 3, 1]],
+                                   [[8, 12, 12, 12, 4],
+                                    [12, 18, 18, 18, 6],
+                                    [4, 6, 6, 6, 2]],
+                                   [[8, 12, 12, 12, 4],
+                                    [12, 18, 18, 18, 6],
+                                    [4, 6, 6, 6, 2]]],
+                                  [[[8, 12, 12, 12, 4],
+                                    [12, 18, 18, 18, 6],
+                                    [4, 6, 6, 6, 2]],
+                                   [[16, 24, 24, 24, 8],
+                                    [24, 36, 36, 36, 12],
+                                    [8, 12, 12, 12, 4]],
+                                   [[16, 24, 24, 24, 8],
+                                    [24, 36, 36, 36, 12],
+                                    [8, 12, 12, 12, 4]]]], dtype=np.int32)
+    np.testing.assert_array_equal(dx_value_expected, dx_value_obtained)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_mirror_pad_fwd_back_4d_int32_symm():
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    # set constants
+    shape = (2, 3, 3, 5)
+    pads = ((1, 0), (2, 0), (1, 2), (3, 4))
+    total_val = np.prod(shape)
+    test_arr_np = np.arange(total_val).reshape(shape) + 1
+    test_arr_ms = Tensor(test_arr_np, dtype=mindspore.int32)
+    # fwd_pass_check
+    op = nn.Pad(mode="SYMMETRIC", paddings=pads)
+    expected_np_result = np.pad(test_arr_np, pads, 'symmetric')
+    obtained_ms_res = op(test_arr_ms).asnumpy()
+    np.testing.assert_array_equal(expected_np_result, obtained_ms_res)
+    # backwards pass check
+    GradNet = Grad(Net(pads, "SYMMETRIC"))
+    dy_value = Tensor(np.ones(obtained_ms_res.shape), dtype=mindspore.int32)
+    dx_value_obtained = GradNet(test_arr_ms, dy_value)[0].asnumpy()
+    dx_value_expected = np.array([[[[16, 24, 24, 16, 16],
+                                    [16, 24, 24, 16, 16],
+                                    [16, 24, 24, 16, 16]],
+                                   [[16, 24, 24, 16, 16],
+                                    [16, 24, 24, 16, 16],
+                                    [16, 24, 24, 16, 16]],
+                                   [[8, 12, 12, 8, 8],
+                                    [8, 12, 12, 8, 8],
+                                    [8, 12, 12, 8, 8]]],
+                                  [[[8, 12, 12, 8, 8],
+                                    [8, 12, 12, 8, 8],
+                                    [8, 12, 12, 8, 8]],
+                                   [[8, 12, 12, 8, 8],
+                                    [8, 12, 12, 8, 8],
+                                    [8, 12, 12, 8, 8]],
+                                   [[4, 6, 6, 4, 4],
+                                    [4, 6, 6, 4, 4],
+                                    [4, 6, 6, 4, 4]]]], dtype=np.int32)
+    np.testing.assert_array_equal(dx_value_expected, dx_value_obtained)