diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index f0f895c08a..cdd138d7bd 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -31,10 +31,22 @@ struct DivideFunctor {
 };
 
+// Writes in_data[0] / N into every element of out_data[0..N).
+//
+//   in_data  - pointer dereferenced inside the kernel; only element 0 is read.
+//   out_data - destination buffer; indices [0, N) are written.
+//   N        - number of elements written; also the divisor.
+//
+// Each thread starts at its global 1-D index and advances by the total number
+// of launched threads (blockDim.x * gridDim.x) until it passes N.
 template <typename T>
-__global__ void MeanRunKernel(const T in_data, T* out_data, int N) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < N; idx += blockDim.x * gridDim.x) {
-    out_data[idx] = in_data / (static_cast<T>(N));
+__global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
+  // The quotient does not depend on idx, so evaluate it once per thread
+  // rather than on every loop iteration.
+  const T value = in_data[0] / (static_cast<T>(N));
+  const unsigned int stride = blockDim.x * gridDim.x;
+  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < N;
+       idx += stride) {
+    out_data[idx] = value;
   }
 }
 
@@ -85,7 +86,7 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
     auto IG = context.Output<Tensor>(framework::GradVarName("X"));
     IG->mutable_data<T>(context.GetPlace());
 
-    T in_data = OG[0];
+    auto in_data = OG->data<T>();
     auto size_prob = IG->numel();
     auto out_data = IG->data<T>();
     int threads = 512;
@@ -105,6 +106,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    mean_grad,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
+                            plat::float16>);