@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #ifdef PADDLE_WITH_CUDA
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -176,6 +177,25 @@ __global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out,
   }
 }
 
+template <int SIZE>
+__global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
+                                       __half2 *__restrict__ out, size_t width,
+                                       size_t height) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int by = blockIdx.y;
+  __half2 zero = __half2half2(static_cast<__half>(0));
+  const int cols = width / 2;
+  for (; idx < cols; idx += blockDim.x * gridDim.x) {
+    __half2 sum = zero;
+    for (int row = 0; row < SIZE; row++) {
+      int index = idx + (row + by * SIZE) * cols;
+      sum = __hadd2(sum, in[index]);
+    }
+
+    atomicAdd(&(out[idx]), sum);
+  }
+}
+
 template <typename T>
 __global__ void MatrixReduceLongWidth(const T *__restrict__ in, T *out,
                                       size_t width, size_t height) {
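The new kernel packs two fp16 lanes per `__half2`, gives each `blockIdx.y` a strip of `SIZE` rows to reduce, and merges the per-strip partial sums with a vectorized `atomicAdd` (which requires compute capability 6.0+). A minimal standalone sketch of that contract, outside the patch, with assumed shapes and a hypothetical file name (`nvcc -arch=sm_60 col_reduce_sketch.cu`); note the output buffer must be zeroed first because the kernel accumulates:

    // Standalone sketch, not part of the patch: exercises the kernel added
    // above under assumed shapes (even width, height % SIZE == 0).
    #include <cuda_fp16.h>
    #include <cstdio>
    #include <vector>

    template <int SIZE>
    __global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in,
                                           __half2 *__restrict__ out,
                                           size_t width, size_t height) {
      // body copied from the hunk above
      int idx = threadIdx.x + blockIdx.x * blockDim.x;
      int by = blockIdx.y;
      __half2 zero = __half2half2(static_cast<__half>(0));
      const int cols = width / 2;
      for (; idx < cols; idx += blockDim.x * gridDim.x) {
        __half2 sum = zero;
        for (int row = 0; row < SIZE; row++) {
          int index = idx + (row + by * SIZE) * cols;
          sum = __hadd2(sum, in[index]);
        }
        atomicAdd(&(out[idx]), sum);  // merge partial sums across blockIdx.y
      }
    }

    int main() {
      const size_t width = 1024, height = 256;  // assumed test shape
      std::vector<__half> h_in(width * height, __float2half(1.f));
      __half *d_in, *d_out;
      cudaMalloc(&d_in, h_in.size() * sizeof(__half));
      cudaMalloc(&d_out, width * sizeof(__half));
      cudaMemcpy(d_in, h_in.data(), h_in.size() * sizeof(__half),
                 cudaMemcpyHostToDevice);
      cudaMemset(d_out, 0, width * sizeof(__half));  // kernel accumulates
      dim3 grid(1, height / 64);  // one y-block per 64-row strip, as in the patch
      VecFP16MatrixColReduce<64><<<grid, 128>>>(
          reinterpret_cast<const __half2 *>(d_in),
          reinterpret_cast<__half2 *>(d_out), width, height);
      std::vector<__half> h_out(width);
      cudaMemcpy(h_out.data(), d_out, width * sizeof(__half),
                 cudaMemcpyDeviceToHost);
      // every input is 1.0, so each column should sum to `height`
      printf("col sum = %f (expect %zu)\n", __half2float(h_out[0]), height);
      cudaFree(d_in);
      cudaFree(d_out);
      return 0;
    }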
@@ -198,7 +218,7 @@ __global__ void VecMatrixReduceLongWidth(const T *__restrict__ in, T *out,
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   int w = idx * VEC_SIZE;
   int width_stride = blockDim.x * gridDim.x * VEC_SIZE;
-  for (; w < width; w += width) {
+  for (; w < width; w += width_stride) {
     T zero = static_cast<T>(0);
     T sum[VEC_SIZE] = {zero};
     T tmp_vec[VEC_SIZE] = {zero};
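This one-line fix restores the canonical grid-stride loop: the increment must be the number of elements the whole grid covers per sweep, whereas `w += width` ended the loop after a single pass, silently skipping every column beyond the first grid-stride. A generic illustration of the pattern (hypothetical kernel, not from the patch):

    // Grid-stride loop: each thread starts at its global index and advances
    // by the grid's total coverage until the array is exhausted.
    __global__ void scale(float *data, int n) {
      int stride = blockDim.x * gridDim.x;  // elements covered per sweep
      for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += stride) {
        data[i] *= 2.f;  // with `i += n` the loop would run once per thread,
                         // leaving every element with index >= stride untouched
      }
    }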
@@ -341,6 +361,23 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
     int max_blocks = std::max(max_physical_threads / (block_x * block_y), 1);
     int theory_block = (width + blocks.x - 1) / blocks.x;
     dim3 grids(std::min(theory_block, max_blocks));
+    if (std::is_same<T, paddle::platform::float16>::value && width < 2048 &&
+        width % 2 == 0 && height % 64 == 0) {
+      auto &dev_ctx =
+          ctx.template device_context<platform::CUDADeviceContext>();
+      math::SetConstant<platform::CUDADeviceContext, T> functor;
+      if (dout->dims() == dx->dims())
+        functor(dev_ctx, dy, static_cast<T>(0));
+      else
+        functor(dev_ctx, dx, static_cast<T>(0));
+      const __half2 *ptr1 = reinterpret_cast<const __half2 *>(dout_data);
+      __half2 *ptr2 = reinterpret_cast<__half2 *>(out_data);
+      const int threads = 128;
+      dim3 grid(1, (height + 64 - 1) / 64);
+      VecFP16MatrixColReduce<64><<<grid, threads, 0, stream>>>(ptr1, ptr2,
+                                                               width, height);
+      return;
+    }
+
     if (width / height < 32) {
       MatrixColReduce<T, block_x, block_y><<<grids, blocks, 0, stream>>>(