diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9fea9ca05b..785148d4f9 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 
 INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
-INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 7599313070..776bdfaee8 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
+
+set(PART_CUDA_KERNEL_FILES)
 function(op_library TARGET)
     # op_library is a function to create op library. The interface is same as
     # cc_library. But it handle split GPU/CPU code and link some common library
@@ -37,6 +39,12 @@ function(op_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
             list(APPEND cu_srcs ${TARGET}.cu)
         endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+            set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
+                    ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
+            list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
+        endif()
+
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
             list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
         endif()
@@ -327,6 +335,8 @@ foreach(src ${GENERAL_OPS})
 endforeach()
 
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+
+
 if (NOT WIN32)
 add_subdirectory(reader)
 endif(NOT WIN32)
@@ -353,3 +363,14 @@ if(NOT WIN32)
     nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
+
+if(WITH_GPU)
+    foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES})
+        file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT)
+        string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT})
+        if (MATCHED)
+            string(STRIP ${CMAKE_MATCH_1} MATCHED)
+            file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
+        endif()
+    endforeach()
+endif()
diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu
index 0d86b3127e..b21da178f3 100644
--- a/paddle/fluid/operators/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_max_op.cu
@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max,
                                           int, ops::MaxFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::MaxFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_max_op.part.cu
new file mode 100644
index 0000000000..6954c8d744
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.part.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu
index 59b3024483..4408200d2d 100644
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
                         ops::ReduceMeanKernel<double>,
                         ops::ReduceMeanKernel<int>,
                         ops::ReduceMeanKernel<int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MeanGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_mean_op.part.cu
new file mode 100644
index 0000000000..4b663bcdca
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.part.cu
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// .part used to speed up nvcc compile
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu
index da466f805e..5a04a12b79 100644
--- a/paddle/fluid/operators/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_min_op.cu
@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min,
                                           int, ops::MinFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::MinFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::MaxOrMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_min_op.part.cu
new file mode 100644
index 0000000000..5b8f061b2d
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.part.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu
index d62e677d92..d8692afb96 100644
--- a/paddle/fluid/operators/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                                           int, ops::ProdFunctor>,
                         ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                           int64_t, ops::ProdFunctor>);
-REGISTER_OP_CUDA_KERNEL(
-    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::ProdGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_prod_op.part.cu
new file mode 100644
index 0000000000..486c578c64
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.part.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu
index 53cd9e9419..2b031e8df9 100644
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel<T> {
 REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
                         ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
                         ops::ReduceSumKernel<int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                           float, ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::SumGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_sum_op.part.cu
new file mode 100644
index 0000000000..525633f62a
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.part.cu
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/cub_reduce.h"
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::SumGradFunctor>);