!12605 Fix GPU ReduceMax/Sum float64 type failure

From: @TFbunny Reviewed-by: @tom__chen Signed-off-by:
4 years ago · d8433e9f5c
parent f1c7aeaa6b ec644a5291
commit d8433e9f5c
3 changed files with 41 additions and 13 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
@ -50,8 +50,8 @@ class ArrayReduceGpuKernel : public GpuKernel {
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    T *workspace_addr = GetDeviceAddress<T>(workspace, 0);
-    const float alpha = 1;
+    T alpha = static_cast<T>(1.0f);
-    const float beta = 0;
+    T beta = static_cast<T>(0.0f);
    if (all_match_) {
      MS_LOG(DEBUG)
        << "The corresponding dimensions of the input and output tensors all match. No need to call cuDNN kernel.";
@ -60,11 +60,21 @@ class ArrayReduceGpuKernel : public GpuKernel {
                                                 reinterpret_cast<cudaStream_t>(stream_ptr)),
                                 "cudaMemcpyAsync failed in ArrayReduceGpuKernel::Launch.");
    } else {
-      CHECK_CUDNN_RET_WITH_EXCEPT(
+      if (data_type_ == CUDNN_DATA_DOUBLE) {
-        kernel_node_,
+        CHECK_CUDNN_RET_WITH_EXCEPT(
-        cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha,
+          kernel_node_,
-                          inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr),
+          cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_,
-        "cudnnReduceTensor failed.");
+                            &alpha, inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr),
          "cudnnReduceTensor failed.");
      } else {
        const float alphaf = static_cast<float>(alpha);
        const float betaf = static_cast<float>(beta);
        CHECK_CUDNN_RET_WITH_EXCEPT(
          kernel_node_,
          cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_,
                            &alphaf, inputA_descriptor_, input_addr, &betaf, outputC_descriptor_, output_addr),
          "cudnnReduceTensor failed.");
      }
    }
    return true;
  }
@ -194,12 +204,12 @@ class ArrayReduceGpuKernel : public GpuKernel {
      MS_LOG(EXCEPTION) << "Array reduce kernel type " << kernel_name << " is not supported.";
    }
    reduce_tensor_op_ = iter->second;
-
+    // Select the cuDNN reduction compute type: double for float64 data, float for every other data type.
-    CHECK_CUDNN_RET_WITH_EXCEPT(
+    cudnnDataType_t comp_type = (data_type_ == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
-      kernel_node_,
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
-      cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, CUDNN_DATA_FLOAT, nan_prop_,
+                                cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, comp_type,
-                                     reduce_indices_, CUDNN_32BIT_INDICES),
+                                                               nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES),
-      "cudnnSetReduceTensorDescriptor failed");
+                                "cudnnSetReduceTensorDescriptor failed");
    return;
  }
  void InferInAndOutDesc(const std::vector<size_t> &input_shape, const std::vector<size_t> &output_shape) {
--- a/tests/st/ops/gpu/test_reduce_max_op.py
+++ b/tests/st/ops/gpu/test_reduce_max_op.py
@ -237,3 +237,12 @@ def test_reduce_max_float64():
    error = np.ones(shape=expect.shape) * 1.0e-5
    assert np.all(diff < error)
    assert output.shape == expect.shape
    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    net = ReduceMaxTypeNet(np.float64)
    output = net()
    expect = np.max(x0, axis=axis0, keepdims=keep_dims0).astype(np.float64)
    diff = abs(output.asnumpy() - expect)
    error = np.ones(shape=expect.shape) * 1.0e-5
    assert np.all(diff < error)
    assert output.shape == expect.shape
--- a/tests/st/ops/gpu/test_reduce_sum_op.py
+++ b/tests/st/ops/gpu/test_reduce_sum_op.py
@ -329,3 +329,12 @@ def test_reduce_sum_float64():
    error = np.ones(shape=expect.shape) * 1.0e-5
    assert np.all(diff < error)
    assert output.shape == expect.shape
    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    net = ReduceSumTypeNet(np.float64)
    output = net()
    expect = np.sum(x0, axis=axis0, keepdims=keep_dims0).astype(np.float64)
    diff = abs(output.asnumpy() - expect)
    error = np.ones(shape=expect.shape) * 1.0e-5
    assert np.all(diff < error)
    assert output.shape == expect.shape