perf(compile): speed up reduce_op compile by splitting files (#14294)

test=develop
6 years ago · 8f9bfad246
parent 792bf0b77f
commit 8f9bfad246
12 changed files with 148 additions and 48 deletions
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
 INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 # pybind.h is written from scratch here (file(WRITE) truncates any existing
 # file) and starts with a banner naming this CMakeLists.txt as its generator;
 # later code in this file appends USE_OP-style lines to it.
 set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE "${pybind_file}" "// Generated by the paddle/fluid/operators/CMakeLists.txt.  DO NOT EDIT!\n\n")
 # Collects the absolute paths of every <op>.part.cu file found by op_library;
 # starts out unset/empty.
 set(PART_CUDA_KERNEL_FILES)
 function(op_library TARGET)
    # op_library is a function that creates an op library. Its interface is the
    # same as cc_library, but it handles split GPU/CPU code and links some common library
@ -37,6 +39,12 @@ function(op_library TARGET)
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
            list(APPEND cu_srcs ${TARGET}.cu)
        endif()
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
            set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
                    ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
            list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
        endif()
        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
            list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
        endif()
@ -327,6 +335,8 @@ foreach(src ${GENERAL_OPS})
 endforeach()
 # Ops appended unconditionally to the generated pybind header.
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 # The reader subdirectory is only added on non-Windows platforms.
 if(NOT WIN32)
     add_subdirectory(reader)
 endif()
@ -353,3 +363,14 @@ if(NOT WIN32)
    nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
 # Emit a USE_OP_DEVICE_KERNEL(<op>, CUDA) line into the generated pybind header
 # for every <op>.part.cu file collected in PART_CUDA_KERNEL_FILES.
 #
 # The op name is taken from the first argument of the first
 # REGISTER_OP_CUDA_KERNEL(...) invocation found in each file; files without
 # such an invocation are skipped.
 if(WITH_GPU)
    foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES})
        # Quote the content so that ';' or brackets in the C++ source are not
        # interpreted as CMake list syntax, and so that an empty file still
        # yields one (empty) input argument instead of a configure error.
        file(READ "${CUDA_KERNEL_FILE}" TARGET_CONTENT)
        # Skip any whitespace/newlines after the opening parenthesis, then
        # capture everything up to the first comma. (In a CMake regex "\\n" is a
        # literal 'n', not a newline, so whitespace is matched with an explicit
        # character class instead.)
        string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\([ \t\r\n]*([^,]+),.*" MATCHED "${TARGET_CONTENT}")
        if (MATCHED)
            # Drop whitespace between the op name and the comma.
            string(STRIP "${CMAKE_MATCH_1}" MATCHED)
            file(APPEND "${pybind_file}" "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
        endif()
    endforeach()
 endif()
--- a/paddle/fluid/operators/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_max_op.cu
@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max,
                                          int, ops::MaxFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::MaxFunctor>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_max_op.part.cu
+++ b/paddle/fluid/operators/reduce_max_op.part.cu
@ -0,0 +1,25 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_min_max_op.h"
 // Registers the CUDA kernels of the reduce_max_grad op: ReduceGradKernel on
 // CUDADeviceContext with MaxOrMinGradFunctor, instantiated for float, double,
 // int and int64_t.
 // NOTE(review): `ops` is not declared in this file; presumably it is a
 // namespace alias made visible by the included header -- confirm.
 REGISTER_OP_CUDA_KERNEL(
    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
                        ops::ReduceMeanKernel<double>,
                        ops::ReduceMeanKernel<int>,
                        ops::ReduceMeanKernel<int64_t>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MeanGradFunctor>);
--- a/paddle/fluid/operators/reduce_mean_op.part.cu
+++ b/paddle/fluid/operators/reduce_mean_op.part.cu
@ -0,0 +1,26 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // This .part.cu file is split out from reduce_mean_op.cu to speed up nvcc compilation.
 #include "paddle/fluid/operators/reduce_mean_op.h"
 // Registers the CUDA kernels of the reduce_mean_grad op: ReduceGradKernel on
 // CUDADeviceContext with MeanGradFunctor, instantiated for float, double, int
 // and int64_t.
 // NOTE(review): `ops` is not declared in this file; presumably it is a
 // namespace alias made visible by the included header -- confirm.
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MeanGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MeanGradFunctor>);
--- a/paddle/fluid/operators/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_min_op.cu
@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min,
                                          int, ops::MinFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::MinFunctor>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_min_op.part.cu
+++ b/paddle/fluid/operators/reduce_min_op.part.cu
@ -0,0 +1,25 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_min_max_op.h"
 // Registers the CUDA kernels of the reduce_min_grad op: ReduceGradKernel on
 // CUDADeviceContext with MaxOrMinGradFunctor, instantiated for float, double,
 // int and int64_t.
 // NOTE(review): `ops` is not declared in this file; presumably it is a
 // namespace alias made visible by the included header -- confirm.
 REGISTER_OP_CUDA_KERNEL(
    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::MaxOrMinGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::MaxOrMinGradFunctor>);
--- a/paddle/fluid/operators/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
                                          int, ops::ProdFunctor>,
                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
                                          int64_t, ops::ProdFunctor>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::ProdGradFunctor>);
--- a/paddle/fluid/operators/reduce_prod_op.part.cu
+++ b/paddle/fluid/operators/reduce_prod_op.part.cu
@ -0,0 +1,25 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/reduce_prod_op.h"
 // Registers the CUDA kernels of the reduce_prod_grad op: ReduceGradKernel on
 // CUDADeviceContext with ProdGradFunctor, instantiated for float, double, int
 // and int64_t.
 // NOTE(review): `ops` is not declared in this file; presumably it is a
 // namespace alias made visible by the included header -- confirm.
 REGISTER_OP_CUDA_KERNEL(
    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::ProdGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::ProdGradFunctor>);
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel<T> {
 REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
                        ops::ReduceSumKernel<int64_t>);
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::SumGradFunctor>);
--- a/paddle/fluid/operators/reduce_sum_op.part.cu
+++ b/paddle/fluid/operators/reduce_sum_op.part.cu
@ -0,0 +1,26 @@
 // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"
 // Registers the CUDA kernels of the reduce_sum_grad op: ReduceGradKernel on
 // CUDADeviceContext with SumGradFunctor, instantiated for float, double, int
 // and int64_t.
 // NOTE(review): `ops` is not declared in this file; presumably it is a
 // namespace alias made visible by one of the included headers -- confirm.
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
                          ops::SumGradFunctor>,
    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
                          ops::SumGradFunctor>);