diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index f0f895c08a..cdd138d7bd 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -31,10 +31,11 @@ struct DivideFunctor {
 };
 
 template <typename T>
-__global__ void MeanRunKernel(const T in_data, T* out_data, int N) {
+__global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  T data = in_data[0];
   for (; idx < N; idx += blockDim.x * gridDim.x) {
-    out_data[idx] = in_data / (static_cast<T>(N));
+    out_data[idx] = data / (static_cast<T>(N));
   }
 }
 
@@ -85,7 +86,7 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
     auto IG = context.Output<Tensor>(framework::GradVarName("X"));
     IG->mutable_data<T>(context.GetPlace());
 
-    T in_data = OG[0];
+    auto in_data = OG->data<T>();
     auto size_prob = IG->numel();
     auto out_data = IG->data<T>();
     int threads = 512;
@@ -105,6 +106,8 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
     ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    mean_grad,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
+                            plat::float16>);
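
This patch fixes two related problems in the CUDA path of `mean_grad`. First, `MeanRunKernel` previously took the upstream gradient by value (`const T in_data`), and the grad kernel obtained it via `T in_data = OG[0];`, which dereferences the `Tensor*` itself rather than reading the device-side scalar. The fix passes the device pointer from `OG->data<T>()` and dereferences it once inside the kernel. Second, the `mean_grad` registration pointed at the generic `MeanGradKernel`, so `MeanCUDAGradKernel` was never dispatched; the registration now names the CUDA grad kernel for all three types.

The following is a minimal standalone sketch of the corrected kernel pattern, assuming a plain CUDA toolchain outside Paddle. Names such as `BroadcastDivKernel` are illustrative, not part of the Paddle source:

```cu
#include <cstdio>
#include <cuda_runtime.h>

// Sketch of the fixed pattern: the scalar upstream gradient stays in device
// memory and is dereferenced on the device, then broadcast-divided over N
// output elements with a grid-stride loop.
template <typename T>
__global__ void BroadcastDivKernel(const T* in_data, T* out_data, int N) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  // in_data is a device pointer; reading it here avoids any host-side
  // dereference or device-to-host copy of the scalar.
  T data = in_data[0];
  // Grid-stride loop: each thread covers multiple elements when N exceeds
  // the total number of launched threads.
  for (; idx < N; idx += blockDim.x * gridDim.x) {
    out_data[idx] = data / static_cast<T>(N);
  }
}

int main() {
  const int N = 8;
  float h_in = 16.0f;  // upstream gradient dL/d(mean)
  float h_out[N];

  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, sizeof(float));
  cudaMalloc(&d_out, N * sizeof(float));
  cudaMemcpy(d_in, &h_in, sizeof(float), cudaMemcpyHostToDevice);

  // Same launch shape as the Paddle kernel: 512 threads per block.
  int threads = 512;
  int blocks = (N + threads - 1) / threads;
  BroadcastDivKernel<float><<<blocks, threads>>>(d_in, d_out, N);

  cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < N; ++i) printf("%f\n", h_out[i]);  // each 16/8 = 2.0

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```

Reading `in_data[0]` into a local once before the loop, as the patch does, keeps the scalar in a register instead of re-loading it from global memory on every iteration.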