update to develop branch.

8 years ago · 9f490c7748
parent 4eb25b4793 fd64369f30
commit 9f490c7748
167 changed files with 5916 additions and 4214 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -24,7 +24,7 @@
        description: Format files with ClangFormat.
        entry: clang-format -i
        language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
--- a/5
+++ b/5
@ -27,13 +27,16 @@ RUN apt-get update && \
    git python-pip python-dev openssh-server bison  \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
+    python-matplotlib gcc-4.8 g++-4.8 \
    automake locales clang-format-3.8 swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
    net-tools && \
    apt-get clean -y
 # paddle is using numpy.flip, which is introduced since 1.12.0
 RUN pip --no-cache-dir install 'numpy>=1.12.0'
 # Install Go and glide
 RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
    tar -C /usr/local -xzf go.tgz && \
--- a/README.md
+++ b/README.md
@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
-  You might want to start from the this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in Jupyter Notebook.
 - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -74,8 +74,6 @@ if(WITH_MKLDNN)
        set(OPENMP_FLAGS "-fopenmp")
        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
    else()
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME)
                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                            "--filter=${STYLE_FILTER}"
                            "--write-success=${CUR_GEN}" ${filename}
                    DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
                add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
                add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
            endif()
        endforeach()
    endif()
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
    extern_gflags
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    # TODO(yiwang): The annoying warnings mentioned in
    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
    # gflags.  I fired a PR https://github.com/gflags/gflags/pull/230
    # to fix it.  Before it gets accepted by the gflags team, we use
    # my personal fork, which contains above fix, temporarily.  Let's
    # change this back to the official Github repo once my PR is
    # merged.
    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -20,34 +20,30 @@ INCLUDE(ExternalProject)
 SET(MKLDNN_PROJECT        "extern_mkldnn")
 SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
-SET(MKLDNN_INSTALL_ROOT   ${CMAKE_INSTALL_PREFIX})
+SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
-IF(NOT "$ENV{HOME}" STREQUAL "/root")
+SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
    SET(MKLDNN_INSTALL_ROOT  "$ENV{HOME}")
 ENDIF()
 SET(MKLDNN_INSTALL_DIR    "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
 SET(MKLDNN_INCLUDE_DIR    "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-IF(WIN32)
+IF(WIN32 OR APPLE)
-    MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
+    MESSAGE(WARNING 
        "Windows or Mac is not supported with MKLDNN in Paddle yet."
        "Force WITH_MKLDNN=OFF")
-    SET(WITH_MKLDNN OFF)
+    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
    return()
-ELSE(WIN32)
+ENDIF()
-    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+
 SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
 MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
 ENDIF()
 ExternalProject_Add(
@ -57,16 +53,15 @@ ExternalProject_Add(
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
    GIT_TAG             "v0.9"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
    BUILD_COMMAND       cd <SOURCE_DIR>/build
                        && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
                        && $(MAKE)
    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
 )
 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
+SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
+MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
 LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -16,19 +16,23 @@ IF(NOT ${WITH_MKLML})
  return()
 ENDIF(NOT ${WITH_MKLML})
 IF(WIN32 OR APPLE)
    MESSAGE(WARNING 
        "Windows or Mac is not supported with MKLML in Paddle yet."
        "Force WITH_MKLML=OFF")
    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
    return()
 ENDIF()
 INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
+SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
 SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
-SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")
+SET(MKLML_DST_DIR       "mklml")
-SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
+SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 IF(NOT "$ENV{HOME}" STREQUAL "/root")
    SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
 ENDIF()
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
 SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@ -24,7 +24,6 @@ IF(WITH_PYTHON)
 ENDIF(WITH_PYTHON)
 SET(py_env "")
 SET(USE_VIRTUALENV_FOR_TEST 1)
 IF(PYTHONINTERP_FOUND)
    find_python_module(pip REQUIRED)
    find_python_module(numpy REQUIRED)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
    endif()
    # cpplint code style
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+    foreach(source_file ${cc_library_SRCS})
      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
      endif()
    endforeach()
    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
  else(cc_library_SRCS)
    if (cc_library_DEPS)
@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
      endif()
      # cpplint code style
      foreach(source_file ${nv_library_SRCS})
        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
          list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        endif()
      endforeach()
      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
    else(nv_library_SRCS)
      if (nv_library_DEPS)
        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -118,7 +118,6 @@ endfunction()
 macro(add_unittest_without_exec TARGET_NAME)
    add_executable(${TARGET_NAME} ${ARGN})
    link_paddle_test(${TARGET_NAME})
    add_style_check_target(${TARGET_NAME} ${ARGN})
 endmacro()
 # add_unittest
@ -150,9 +149,12 @@ endfunction()
 # Create a python unittest using run_python_tests.sh,
 # which takes care of making correct running environment
 function(add_python_test TEST_NAME)
-  add_test(NAME ${TEST_NAME}
+    foreach(arg ${ARGN})
-        COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
+        get_filename_component(py_fn ${arg} NAME_WE)
-        bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
+        set(TRG_NAME ${TEST_NAME}_${py_fn})
-        ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
+        add_test(NAME ${TRG_NAME}
                COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
                python2 ${arg}
                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
    endforeach()
 endfunction()
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@ -21,8 +21,6 @@
 # 
 # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
 #
 if ! python -c "import paddle" >/dev/null 2>/dev/null; then
 PYPATH=""
 set -x
 while getopts "d:" opt; do
@ -35,8 +33,3 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then
 shift $(($OPTIND - 1))
 export PYTHONPATH=$PYPATH:$PYTHONPATH
 $@
 else
  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
  echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
  exit 1
 fi
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -15,7 +15,6 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
  add_subdirectory(pybind)
 endif()
 if(WITH_C_API)
--- a/paddle/cuda/src/hl_batch_transpose.cu
+++ b/paddle/cuda/src/hl_batch_transpose.cu
@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "hl_batch_transpose.h"
 #include "hl_base.h"
 #include "hl_batch_transpose.h"
 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;
 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
+__global__ void batchTransposeNoBankConflicts(
-                                              const real* idata,
+    real* odata, const real* idata, int numSamples, int width, int height) {
                                              int numSamples, int width,
                                              int height) {
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];
  const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
          newX] = tile[threadIdx.x][j];
 }
-void batchTranspose(const real* input, real* output, int width, int height,
+void batchTranspose(
-                    int batchSize) {
+    const real* input, real* output, int width, int height, int batchSize) {
  dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
  dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-      (output, input, batchSize, width, height);
+      output, input, batchSize, width, height);
  CHECK_SYNC("batchTranspose failed!");
 }
--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ b/paddle/cuda/src/hl_cuda_aggregate.cu
@ -12,23 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
 #include "hl_aggregate.h"
 #include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
 #include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 /**
 * @brief   matrix row operator.
 */
 template <class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg,
+__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
                              real *E,
                              real *Sum,
                              int dimN) {
  __shared__ real sum_s[blockSize];
  int cnt = (dimN + blockSize - 1) / blockSize;
  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }
 template <class Agg>
-void hl_matrix_row_op(Agg agg,
+void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
                      real *A_d,
                      real *C_d,
                      int dimM,
                      int dimN) {
  int blocksX = dimM;
  int blocksY = 1;
  dim3 threads(128, 1);
  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-           (agg, A_d, C_d, dimN);
+      agg, A_d, C_d, dimN);
 }
 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);
-  hl_matrix_row_op(aggregate::sum(),
+  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
                   A_d,
                   C_d,
                   dimM,
                   dimN);
  CHECK_SYNC("hl_matrix_row_sum failed");
 }
@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);
-  hl_matrix_row_op(aggregate::max(),
+  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
                   A_d,
                   C_d,
                   dimM,
                   dimN);
  CHECK_SYNC("hl_matrix_row_max failed");
 }
@ -100,11 +84,7 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);
-  hl_matrix_row_op(aggregate::min(),
+  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
                   A_d,
                   C_d,
                   dimM,
                   dimN);
  CHECK_SYNC("hl_matrix_row_min failed");
 }
@ -112,11 +92,8 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
 * @brief   matrix column operator.
 */
 template <class Agg>
-__global__ void KeMatrixColumnOp(Agg agg,
+__global__ void KeMatrixColumnOp(
-                                 real *E,
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
                                 real *Sum,
                                 int dimM,
                                 int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  real tmp = agg.init();
  if (rowIdx < dimN) {
@ -128,11 +105,8 @@ __global__ void KeMatrixColumnOp(Agg agg,
 }
 template <class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg,
+__global__ void KeMatrixColumnOp_S(
-                                   real *E,
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
                                   real *Sum,
                                   int dimM,
                                   int dimN) {
  __shared__ real _sum[blockDimX * blockDimY];
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int index = threadIdx.y;
@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }
 template <class Agg>
-void hl_matrix_column_op(Agg agg,
+void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
                         real *A_d,
                         real *C_d,
                         int dimM,
                         int dimN) {
  if (dimN >= 8192) {
    int blocksX = (dimN + 128 - 1) / 128;
    int blocksY = 1;
    dim3 threads(128, 1);
    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
+    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
-             (agg, A_d, C_d, dimM, dimN);
+        agg, A_d, C_d, dimM, dimN);
  } else {
    int blocksX = (dimN + 32 - 1) / 32;
    int blocksY = 1;
    dim3 threads(32, 32);
    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
+    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-             (agg, A_d, C_d, dimM, dimN);
+        agg, A_d, C_d, dimM, dimN);
  }
  return;
@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);
-  hl_matrix_column_op(aggregate::sum(),
+  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
                      A_d,
                      C_d,
                      dimM,
                      dimN);
  CHECK_SYNC("hl_matrix_column_sum failed");
 }
@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);
-  hl_matrix_column_op(aggregate::max(),
+  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
                      A_d,
                      C_d,
                      dimM,
                      dimN);
  CHECK_SYNC("hl_matrix_column_max failed");
 }
@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);
-  hl_matrix_column_op(aggregate::min(),
+  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
                      A_d,
                      C_d,
                      dimM,
                      dimN);
  CHECK_SYNC("hl_matrix_column_min failed");
 }
@ -261,20 +219,21 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
  }
-  KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-           (A_d, t_resource.gpu_mem, dimM);
+      A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
+  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-           (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-    << "CUDA error: " << hl_get_device_error_string((size_t)err);
+                             << hl_get_device_error_string((size_t)err);
 }
 template <int blockSize>
@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
  }
-  KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-           (A_d, t_resource.gpu_mem, dimM);
+      A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
+  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-           (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-    << "CUDA error: " << hl_get_device_error_string((size_t)err);
+                             << hl_get_device_error_string((size_t)err);
 }
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@ -55,16 +55,13 @@ void hl_max_sequence_forward(real* input,
  dim3 threads(256, 1);
  dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      (input, sequence, output, index, numSequences, dim);
+      input, sequence, output, index, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_forward failed");
 }
-__global__ void KeMaxSequenceBackward(real *outputGrad,
+__global__ void KeMaxSequenceBackward(
-                                      int *index,
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
                                      real* inputGrad,
                                      int numSequences,
                                      int dim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  int colIdx = idx % dim;
  if (idx < numSequences * dim) {
@ -73,11 +70,8 @@ __global__ void KeMaxSequenceBackward(real *outputGrad,
  }
 }
-void hl_max_sequence_backward(real* outputGrad,
+void hl_max_sequence_backward(
-                              int *index,
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
                              real* inputGrad,
                              int numSequences,
                              int dim) {
  CHECK_NOTNULL(outputGrad);
  CHECK_NOTNULL(index);
  CHECK_NOTNULL(inputGrad);
@ -85,8 +79,8 @@ void hl_max_sequence_backward(real* outputGrad,
  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
  dim3 threads(128, 1);
  dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      (outputGrad, index, inputGrad, numSequences, dim);
+      outputGrad, index, inputGrad, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_backward failed");
 }
@ -118,9 +112,12 @@ __global__ void KeMatrixAddRows(real* output,
  }
 }
-template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
+template <int blockDimX,
-__global__
+          int blockDimY,
-void KeSequence2Batch(real *batch,
+          int gridDimX,
          bool seq2batch,
          bool isAdd>
 __global__ void KeSequence2Batch(real* batch,
                                 real* sequence,
                                 const int* batchIndex,
                                 int seqWidth,
@ -164,11 +161,11 @@ void hl_sequence2batch_copy(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
+    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
+    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_copy failed");
 }
@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
+    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
+    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_add failed");
 }
 template <bool normByTimes, bool seq2batch>
-__global__
+__global__ void KeSequence2BatchPadding(real* batch,
 void KeSequence2BatchPadding(real* batch,
                                        real* sequence,
                                        const int* sequenceStartPositions,
                                        const size_t sequenceWidth,
@ -277,36 +273,48 @@ void hl_sequence2batch_copy_padding(real* batch,
    /* sequence -> batch */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-              batch, sequence, sequenceStartPositions,
+          batch,
-              sequenceWidth, maxSequenceLength, numSequences);
+          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    } else {
      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-              batch, sequence, sequenceStartPositions,
+          batch,
-              sequenceWidth, maxSequenceLength, numSequences);
+          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    }
  } else {
    /* batch -> sequence */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-              batch, sequence, sequenceStartPositions,
+          batch,
-              sequenceWidth, maxSequenceLength, numSequences);
+          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    } else {
      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-              batch, sequence, sequenceStartPositions,
+          batch,
-              sequenceWidth, maxSequenceLength, numSequences);
+          sequence,
          sequenceStartPositions,
          sequenceWidth,
          maxSequenceLength,
          numSequences);
    }
  }
  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
 }
-__device__ inline float my_rsqrt(float x) {
+__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
  return rsqrtf(x);
 }
-__device__ inline double my_rsqrt(double x) {
+__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
  return rsqrt(x);
 }
 __global__ void KeSequenceAvgForward(real* dst,
                                     real* src,
@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
    for (int i = start; i < end; i++) {
      sum += src[i * width + col];
    }
-    sum = mode == 1 ? sum :
+    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
-        (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
+                                       : sum * my_rsqrt((real)seqLength));
    dst[gid] += sum;
  }
 }
@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_forward!";
-  KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
+  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
-           (dst, src, starts, height, width, mode);
+      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_forward failed");
 }
@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
    int seqLength = end - start;
    if (seqLength == 0) return;
    real grad = src[gid];
-    grad = mode == 1 ? grad :
+    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
-        (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
+                                         : grad * my_rsqrt((real)seqLength));
    for (int i = start; i < end; i++) {
      dst[i * width + col] += grad;
    }
@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_backward!";
-  KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
+  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
-           (dst, src, starts, height, width, mode);
+      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_backward failed");
 }
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ b/paddle/cuda/src/hl_cuda_sparse.cu
--- a/paddle/cuda/src/hl_perturbation_util.cu
+++ b/paddle/cuda/src/hl_perturbation_util.cu
@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cmath>
 #include <stdlib.h>
-#include "hl_cuda.h"
+#include <cmath>
 #include "hl_time.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
 #include "hl_time.h"
 #define _USE_MATH_DEFINES
@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
-__device__ void getTranformCoord(int x, int y, real theta, real scale,
+__device__ void getTranformCoord(int x,
-                                 real tgtCenter, real imgCenter,
+                                 int y,
-                                 real centerR, real centerC,
+                                 real theta,
-                                 int* sourceX, int* sourceY) {
+                                 real scale,
                                 real tgtCenter,
                                 real imgCenter,
                                 real centerR,
                                 real centerC,
                                 int* sourceX,
                                 int* sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
  // compute coordinates in the rotated and scaled image
@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */
-__global__ void kSamplingPatches(const real* imgs, real* targets,
+__global__ void kSamplingPatches(const real* imgs,
-                                 int imgSize, int tgtSize, const int channels,
+                                 real* targets,
-                                 int samplingRate, const real* thetas,
+                                 int imgSize,
-                                 const real* scales, const int* centerRs,
+                                 int tgtSize,
-                                 const int* centerCs, const real padValue,
+                                 const int channels,
                                 int samplingRate,
                                 const real* thetas,
                                 const real* scales,
                                 const int* centerRs,
                                 const int* centerCs,
                                 const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
    const int pxY = pxIdx / tgtSize;
    int srcPxX, srcPxY;
-    getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
+    getTranformCoord(pxX,
-                     imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
+                     pxY,
                     thetas[imgIdx],
                     scales[imgIdx],
                     tgtCenter,
                     imgCenter,
                     centerCs[caseIdx],
                     centerRs[caseIdx],
                     &srcPxX,
                     &srcPxY);
    imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
+void hl_generate_disturb_params(real*& gpuAngle,
-                                int*& gpuCenterR, int*& gpuCenterC,
+                                real*& gpuScaleRatio,
-                                int numImages, int imgSize, real rotateAngle,
+                                int*& gpuCenterR,
-                                real scaleRatio, int samplingRate,
+                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    for (int i = 0; i < numImages; i++) {
      r_angle[i] =
          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                          - 0.5);
+                                          -
                                          0.5);
      s_ratio[i] =
          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
    }
@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
        int pxY =
            (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
-        const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
+        const real H[4] = {cos(-r_angle[i]),
-                           sin(-r_angle[i]), cos(-r_angle[i])};
+                           -sin(-r_angle[i]),
                           sin(-r_angle[i]),
                           cos(-r_angle[i])};
        real x = pxX - imgCenter;
        real y = pxY - imgCenter;
        real xx = H[0] * x + H[1] * y;
@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
 }
-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
+void hl_conv_random_disturb_with_params(const real* images,
-                                        int tgtSize, int channels,
+                                        int imgSize,
-                                        int numImages, int samplingRate,
+                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
-  kSamplingPatches <<<numBlocks, threadsPerBlock>>>
+  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
-      (images, target, imgSize, tgtSize, channels, samplingRate,
+                                                   target,
-      gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
+                                                   imgSize,
-      paddingValue, numImages);
+                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);
  hl_device_synchronize();
 }
-void hl_conv_random_disturb(const real* images, int imgSize,
+void hl_conv_random_disturb(const real* images,
-                            int tgtSize, int channels, int numImages,
+                            int imgSize,
-                            real scaleRatio, real rotateAngle,
+                            int tgtSize,
-                            int samplingRate, real* gpu_r_angle,
+                            int channels,
-                            real* gpu_s_ratio, int* gpu_center_r,
+                            int numImages,
-                            int* gpu_center_c, int paddingValue,
+                            real scaleRatio,
-                            bool isTrain, real* targets) {
+                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
+  hl_generate_disturb_params(gpu_r_angle,
-                  gpu_center_c, numImages, imgSize, rotateAngle,
+                             gpu_s_ratio,
-                  scaleRatio, samplingRate, isTrain);
+                             gpu_center_r,
-
+                             gpu_center_c,
-  hl_conv_random_disturb_with_params(
+                             numImages,
-                  images, imgSize, tgtSize, channels, numImages,
+                             imgSize,
-                  samplingRate, gpu_r_angle, gpu_s_ratio,
+                             rotateAngle,
-                  gpu_center_r, gpu_center_r, paddingValue,
+                             scaleRatio,
                             samplingRate,
                             isTrain);
  // sample the patches with the parameters generated above; the row centers
  // and the column centers are separate buffers, so each must be forwarded to
  // its own argument (gpuCenterR, gpuCenterC)
  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_c,
                                     paddingValue,
                                     targets);
 }
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "hl_base.h"
 #include "hl_device_functions.cuh"
 #include "hl_cuda.h"
 #include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"
 template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output, int ldo,
+__global__ void KeMatrixAddRows(real* output,
-                                real* table, int ldt,
+                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
 }
-void hl_matrix_select_rows(real* output, int ldo,
+void hl_matrix_select_rows(real* output,
-                           real* table, int ldt,
+                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-    (output, ldo, table, ldt, ids, numSamples, tableSize, dim);
+      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
  CHECK_SYNC("hl_matrix_select_rows failed");
 }
-void hl_matrix_add_to_rows(real* table, int ldt,
+void hl_matrix_add_to_rows(real* table,
-                           real* input, int ldi,
+                           int ldt,
                           real* input,
                           int ldi,
                           int* ids,
                           int numSamples,
                           int tableSize,
@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-    (input, ldi, table, ldt, ids, numSamples, tableSize, dim);
+      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
  CHECK_SYNC("hl_matrix_add_to_rows failed");
 }
 template <class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(T* dst, int sized,
+__global__ void KeVectorSelect(
-                               const T* src, int sizes,
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
                               const int* ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
 }
 template <class T>
-void hl_vector_select_from(T* dst, int sized,
+void hl_vector_select_from(
-                           const T* src, int sizes,
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
                           const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
-    (dst, sized, src, sizes, ids, sizei);
+      dst, sized, src, sizes, ids, sizei);
  CHECK_SYNC("hl_vector_select_from failed");
 }
-template
+template void hl_vector_select_from(real* dst,
-void hl_vector_select_from(real* dst, int sized,
+                                    int sized,
-                           const real* src, int sizes,
+                                    const real* src,
-                           const int* ids, int sizei);
+                                    int sizes,
-template
+                                    const int* ids,
-void hl_vector_select_from(int* dst, int sized,
+                                    int sizei);
-                           const int* src, int sizes,
+template void hl_vector_select_from(
-                           const int* ids, int sizei);
+    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
-proto_library(attr_type SRCS attr_type.proto)
+proto_library(attribute_proto SRCS attribute.proto)
-proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
+proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
-proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
+cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
 cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
@ -26,13 +28,19 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
-py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto a valid Python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
-cc_library(net SRCS net.cc DEPS op_registry)
+cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net)
 cc_library(backward SRCS backward.cc DEPS net)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
 	fc_op
 	sgd_op
 	add_op
 	mean_op
 	cross_entropy_op
 	recurrent_op)
--- a/Show More
+++ b/Show More