@@ -31,10 +31,11 @@ struct DivideFunctor {
 };
 
 template <typename T>
-__global__ void MeanRunKernel(const T in_data, T* out_data, int N) {
+__global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  T data = in_data[0];
   for (; idx < N; idx += blockDim.x * gridDim.x) {
-    out_data[idx] = in_data / (static_cast<T>(N));
+    out_data[idx] = data / (static_cast<T>(N));
   }
 }
@@ -85,7 +86,7 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
     auto IG = context.Output<Tensor>(framework::GradVarName("X"));
     IG->mutable_data<T>(context.GetPlace());
 
-    T in_data = OG[0];
+    auto in_data = OG->data<T>();
     auto size_prob = IG->numel();
     auto out_data = IG->data<T>();
     int threads = 512;
@@ -105,6 +106,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    mean_grad,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
+                            plat::float16>);
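For context, a minimal standalone sketch of the revised kernel, outside the patch: the mean gradient is a single scalar resident in GPU memory, so the kernel takes a device pointer and dereferences it on the device (the old by-value `const T in_data` parameter could not receive a device-side scalar directly). Everything here other than the kernel body itself (the `main` test harness, the N = 8 buffer, and the 512-thread launch echoing `int threads = 512;` in the grad kernel) is illustrative and not part of the change.

#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
__global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  T data = in_data[0];  // read the scalar output-gradient once
  // Grid-stride loop: broadcast d(mean)/dx_i = OG / N to every element.
  for (; idx < N; idx += blockDim.x * gridDim.x) {
    out_data[idx] = data / (static_cast<T>(N));
  }
}

int main() {
  const int N = 8;
  float h_og = 1.0f, h_ig[N];
  float *d_og = nullptr, *d_ig = nullptr;
  cudaMalloc(&d_og, sizeof(float));
  cudaMalloc(&d_ig, N * sizeof(float));
  cudaMemcpy(d_og, &h_og, sizeof(float), cudaMemcpyHostToDevice);

  int threads = 512;
  int grid = (N + threads - 1) / threads;
  MeanRunKernel<float><<<grid, threads>>>(d_og, d_ig, N);

  cudaMemcpy(h_ig, d_ig, N * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < N; ++i) printf("%f\n", h_ig[i]);  // each entry is 0.125
  cudaFree(d_og);
  cudaFree(d_ig);
  return 0;
}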