@@ -29,8 +29,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/transform.h"
 
+#if defined(__NVCC__) || defined(__HIPCC__)
 #ifdef __NVCC__
 #include <cuda.h>
+#elif defined(__HIPCC__)
+#include <hip/hip_runtime.h>
+#endif
 #include <thrust/iterator/iterator_adaptor.h>
 
 #include "paddle/fluid/platform/cuda_device_function.h"
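
Read as plain code rather than a diff, the include block produced by this hunk is shown below; the new outer guard admits both device compilers, and the inner #ifdef/#elif selects the matching runtime header (the outer guard's closing #endif sits further down in the header, outside this hunk):

    #if defined(__NVCC__) || defined(__HIPCC__)  // any GPU device compiler
    #ifdef __NVCC__
    #include <cuda.h>                  // CUDA toolchain (nvcc)
    #elif defined(__HIPCC__)
    #include <hip/hip_runtime.h>       // ROCm/HIP toolchain (hipcc)
    #endif
    #include <thrust/iterator/iterator_adaptor.h>

    #include "paddle/fluid/platform/cuda_device_function.h"
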
@@ -196,7 +200,7 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename Functor, typename T, typename OutType>
 __global__ void ElementwiseKernel(const T *x, const T *y, OutType *out, int pre,
                                   int n, int post, int total, Functor func) {
@@ -310,7 +314,7 @@ void CommonForwardBroadcastCUDA(
       y_data, out_data, out_size, max_dim, func, is_xsize_larger);
 }
 
-#endif // __NVCC__
+#endif // __NVCC__ or __HIPCC__
 
 template <typename T, typename DX_OP, typename DY_OP>
 void CommonGradBroadcastCPU(
@@ -382,7 +386,7 @@ inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
     const T *x, const T *y, const T *out, const T *dout, int h, int w,
@@ -1212,7 +1216,7 @@ void CommonGradBroadcastCUDA(
   }
 }
 
-#endif // __NVCC__
+#endif // __NVCC__ or __HIPCC__
 
 inline framework::DDim trim_trailing_singular_dims(
     const framework::DDim &dims) {
@@ -1339,7 +1343,7 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext>
   int64_t post_;
 };
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T>
 class RowwiseTransformIterator<T, platform::CUDADeviceContext>
     : public thrust::iterator_adaptor<
@@ -1504,10 +1508,10 @@ static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 
 template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
+static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int h, int w, bool is_xsize_larger,
                                        DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
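
ElemwiseGradBroadcast1CUDA now takes gpuStream_t instead of cudaStream_t, so one signature serves both toolchains. The patch does not show where gpuStream_t is defined; it is presumably an alias supplied by a platform header (the gpu_info.h included above is a plausible home). A minimal sketch of such an alias, with the PADDLE_WITH_HIP/PADDLE_WITH_CUDA macros and the header location being assumptions rather than something this diff shows:

    // Hypothetical platform-neutral stream alias: resolves to the native
    // stream type of whichever GPU runtime the build targets.
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    #elif defined(PADDLE_WITH_CUDA)
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    #endif
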
@@ -1577,7 +1581,7 @@ static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast2CUDAKernel(
     const T *x, const T *y, const T *out, const T *dout, int pre, int n,
@@ -1646,7 +1650,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
 }
 
 template <typename T, typename DX_OP, typename DY_OP>
-static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x,
+static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int pre, int n, int post,
                                        bool is_xsize_larger, DX_OP dx_op,
@@ -1686,7 +1690,7 @@ void CommonElementwiseBroadcastBackward(
           << " ydim:" << framework::make_ddim(y_dims_array);
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     CommonGradBroadcastCUDA<T, DX_OP, DY_OP>(
         x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(),
         out_dims_array.data(), max_dim,
@@ -1769,7 +1773,7 @@ void ElemwiseGradComputeWithBroadcast(
   }
   if (post == 1) {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       ElemwiseGradBroadcast1CUDA(
           ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
           y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, is_xsize_larger,
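
Every call site patched this way pairs a runtime check (platform::is_gpu_place) with a compile-time guard: this header is compiled by nvcc, by hipcc, and by plain host compilers, and only the first two understand __global__ and the <<< >>> launch syntax, so the GPU branch must disappear from host-only builds. A self-contained illustration of that pattern (generic example code, not Paddle's; the error handling in the host-only branch is an assumption):

    #include <cstdio>

    #if defined(__NVCC__) || defined(__HIPCC__)
    // Device code exists only when a GPU device compiler builds this file.
    __global__ void FillOne(float *out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = 1.0f;
    }
    #endif

    // 'out' must be device-accessible when on_gpu is true.
    void Fill(bool on_gpu, float *out, int n) {
      if (on_gpu) {
    #if defined(__NVCC__) || defined(__HIPCC__)
        FillOne<<<(n + 255) / 256, 256>>>(out, n);  // default stream
    #else
        std::fprintf(stderr, "GPU path requested in a host-only build\n");
    #endif
      } else {
        for (int i = 0; i < n; ++i) out[i] = 1.0f;
      }
    }
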
@@ -1786,7 +1790,7 @@ void ElemwiseGradComputeWithBroadcast(
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       ElemwiseGradBroadcast2CUDA(
           ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
           y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
@@ -1830,7 +1834,7 @@ void CommonElementwiseBroadcastForward(
                          axis);
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     CommonForwardBroadcastCUDA<Functor, T, OutType>(
         x, y, z, x_dims_array.data(), y_dims_array.data(),
         out_dims_array.data(), max_dim,
@@ -1942,7 +1946,7 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
   }
 
   if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     ComputeElementwiseCUDA<Functor, T, OutType>(
         x, y, z, pre, n, post,
         ctx.template device_context<platform::CUDADeviceContext>(), func,
@@ -2066,7 +2070,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, const T *y, int pre,
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename CompoundFunctor, bool BcastY,
           bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
 static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
@@ -2107,7 +2111,7 @@ static __global__ void FusedElemwiseAndActBroadcast1CUDAKernel(
 
 template <typename T, typename CompoundFunctor, bool BcastY,
           bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast1CUDA(cudaStream_t stream, const T *x,
+static void FusedElemwiseAndActBroadcast1CUDA(gpuStream_t stream, const T *x,
                                               const T *y,
                                               CompoundFunctor compound_functor,
                                               int h, int w, T *out,
@@ -2164,7 +2168,7 @@ static __global__ void FusedElemwiseAndActBroadcast2CUDAKernel(
 
 template <typename T, typename CompoundFunctor, bool BcastY,
           bool KeepIntermediateOut, bool SameShapeOfIntermediateOutAndOut>
-static void FusedElemwiseAndActBroadcast2CUDA(cudaStream_t stream, const T *x,
+static void FusedElemwiseAndActBroadcast2CUDA(gpuStream_t stream, const T *x,
                                               const T *y, int pre, int n,
                                               int post,
                                               CompoundFunctor compound_functor,
@@ -2219,7 +2223,7 @@ void FusedElemwiseAndActComputeWithBroadcast(
     int h = pre;
     int w = n;
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActBroadcast1CUDA<T, CompoundFunctor, BcastY,
                                         KeepIntermediateOut,
                                         SameShapeOfIntermediateOutAndOut>(
@@ -2242,7 +2246,7 @@ void FusedElemwiseAndActComputeWithBroadcast(
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActBroadcast2CUDA<T, CompoundFunctor, BcastY,
                                         KeepIntermediateOut,
                                         SameShapeOfIntermediateOutAndOut>(
@@ -2493,7 +2497,7 @@ static void FusedElemwiseAndActGradBroadcast2CPU(
   }
 }
 
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
           bool UseIntermediateOut, bool BcastY,
           bool SameShapeOfIntermediateOutAndOut>
@@ -2593,7 +2597,7 @@ template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
           bool UseIntermediateOut, bool BcastY,
           bool SameShapeOfIntermediateOutAndOut>
 static void FusedElemwiseAndActGradBroadcast1CUDA(
-    cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
+    gpuStream_t stream, const T *x, const T *y, const T *intermediate_out,
     const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
     DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
@@ -2708,7 +2712,7 @@ template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
           bool UseIntermediateOut, bool BcastY,
           bool SameShapeOfIntermediateOutAndOut>
 static void FusedElemwiseAndActGradBroadcast2CUDA(
-    cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
+    gpuStream_t stream, const T *x, const T *y, const T *intermediate_out,
     const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op,
     DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy,
     T *dintermediate) {
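
The gradient wrappers above now accept gpuStream_t and are themselves compiled under the same two-compiler guard; both nvcc and hipcc accept the triple-chevron launch syntax, so a single definition can forward the stream to the kernel on either backend. A hedged sketch of a wrapper in that style (ScaleKernel/LaunchScale are illustrative names, not Paddle APIs, and the sketch reuses the gpuStream_t alias sketched earlier):

    #if defined(__NVCC__) || defined(__HIPCC__)
    template <typename T>
    __global__ void ScaleKernel(const T *in, T *out, T factor, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] * factor;
    }

    // One host wrapper for both backends: the stream parameter is the
    // neutral alias, and <<< >>> is understood by nvcc and hipcc alike.
    template <typename T>
    void LaunchScale(gpuStream_t stream, const T *in, T *out, T factor, int n) {
      const int threads = 256;
      const int blocks = (n + threads - 1) / threads;
      ScaleKernel<T><<<blocks, threads, 0, stream>>>(in, out, factor, n);
    }
    #endif  // __NVCC__ or __HIPCC__
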
@@ -2748,7 +2752,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
     int w = n;
 
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActGradBroadcast1CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
                                             UseIntermediateOut, BcastY,
                                             SameShapeOfIntermediateOutAndOut>(
@@ -2774,7 +2778,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
     }
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
       FusedElemwiseAndActGradBroadcast2CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
                                             UseIntermediateOut, BcastY,
                                             SameShapeOfIntermediateOutAndOut>(