Merge remote-tracking branch 'upstream/develop' into merge-doc

8 years ago · 4bc1dc269e
parent 6405438bcf 91b674fca4
commit 4bc1dc269e
160 changed files with 5866 additions and 4152 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -24,7 +24,7 @@
        description: Format files with ClangFormat.
        entry: clang-format -i
        language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
--- a/5
+++ b/5
@ -27,13 +27,16 @@ RUN apt-get update && \
    git python-pip python-dev openssh-server bison  \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
+    python-matplotlib gcc-4.8 g++-4.8 \
    automake locales clang-format-3.8 swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
    net-tools && \
    apt-get clean -y
 # Paddle uses numpy.flip, which was introduced in NumPy 1.12.0
 RUN pip --no-cache-dir install 'numpy>=1.12.0'
 # Install Go and glide
 RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
    tar -C /usr/local -xzf go.tgz && \
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME)
                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                            "--filter=${STYLE_FILTER}"
                            "--write-success=${CUR_GEN}" ${filename}
                    DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
                add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
                add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
            endif()
        endforeach()
    endif()
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
    extern_gflags
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    # TODO(yiwang): The annoying warnings mentioned in
    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
    # gflags.  I filed a PR https://github.com/gflags/gflags/pull/230
    # to fix them.  Until it is accepted by the gflags team, we temporarily
    # use my personal fork, which contains the above fix.  Let's
    # change this back to the official GitHub repo once my PR is
    # merged.
    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@ -24,7 +24,6 @@ IF(WITH_PYTHON)
 ENDIF(WITH_PYTHON)
 SET(py_env "")
 SET(USE_VIRTUALENV_FOR_TEST 1)
 IF(PYTHONINTERP_FOUND)
    find_python_module(pip REQUIRED)
    find_python_module(numpy REQUIRED)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
    endif()
    # cpplint code style
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+    foreach(source_file ${cc_library_SRCS})
      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
      endif()
    endforeach()
    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
  else(cc_library_SRCS)
    if (cc_library_DEPS)
@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
      endif()
      # cpplint code style
      # For every source file, look for a header with the same base name in
      # the current source directory so it is style-checked with the sources.
      foreach(source_file ${nv_library_SRCS})
        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
          # Append to nv_library_HEADERS (not cc_library_HEADERS): that is the
          # list handed to add_style_check_target below.
          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
        endif()
      endforeach()
      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
    else(nv_library_SRCS)
      if (nv_library_DEPS)
        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -118,7 +118,6 @@ endfunction()
 # add_unittest_without_exec(TARGET_NAME <sources...>)
 # Builds the executable TARGET_NAME from the sources in ARGN, links it through
 # link_paddle_test, and registers a style check for the same sources.
 # This macro itself does not call add_test.
 macro(add_unittest_without_exec TARGET_NAME)
    add_executable(${TARGET_NAME} ${ARGN})
    link_paddle_test(${TARGET_NAME})
    add_style_check_target(${TARGET_NAME} ${ARGN})
 endmacro()
 # add_unittest
@ -150,9 +149,12 @@ endfunction()
 # Create one Python unit test per script passed in ARGN; each test runs the
 # script with PYTHONPATH set to the Paddle Python package directory.
 function(add_python_test TEST_NAME)
-  add_test(NAME ${TEST_NAME}
+    foreach(arg ${ARGN})
-        COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
+        get_filename_component(py_fn ${arg} NAME_WE)
-        bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
+        set(TRG_NAME ${TEST_NAME}_${py_fn})
-        ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
+        add_test(NAME ${TRG_NAME}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+                COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
                python2 ${arg}
                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
    endforeach()
 endfunction()
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@ -21,22 +21,15 @@
 # 
 # It is the same as running PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
 #
-
+PYPATH=""
-if ! python -c "import paddle" >/dev/null 2>/dev/null; then
+set -x
-  PYPATH=""
+while getopts "d:" opt; do
-  set -x
+  case $opt in
-  while getopts "d:" opt; do
+    d)
-    case $opt in
+      PYPATH=$OPTARG
-      d)
+      ;;
-        PYPATH=$OPTARG
+  esac
-        ;;
+done
-    esac
+shift $(($OPTIND - 1))
-  done
+export PYTHONPATH=$PYPATH:$PYTHONPATH
-  shift $(($OPTIND - 1))
+$@
  export PYTHONPATH=$PYPATH:$PYTHONPATH
  $@
 else
  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
  echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
  exit 1
 fi
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -15,7 +15,6 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
  add_subdirectory(pybind)
 endif()
 if(WITH_C_API)
--- a/paddle/cuda/src/hl_batch_transpose.cu
+++ b/paddle/cuda/src/hl_batch_transpose.cu
@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "hl_batch_transpose.h"
 #include "hl_base.h"
 #include "hl_batch_transpose.h"
 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;
 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
+__global__ void batchTransposeNoBankConflicts(
-                                              const real* idata,
+    real* odata, const real* idata, int numSamples, int width, int height) {
                                              int numSamples, int width,
                                              int height) {
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];
  const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
          newX] = tile[threadIdx.x][j];
 }
-void batchTranspose(const real* input, real* output, int width, int height,
+void batchTranspose(
-                    int batchSize) {
+    const real* input, real* output, int width, int height, int batchSize) {
  // Host-side launcher for batchTransposeNoBankConflicts.
  // Each block has TILE_DIM x BLOCK_ROWS threads; the grid is
  // DIVUP(width, TILE_DIM) x DIVUP(height, TILE_DIM) in x/y, with batchSize
  // blocks along z (DIVUP is presumably a round-up division -- confirm).
  dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
  dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
  // Note the kernel's argument order differs from this function's parameter
  // order: (output, input, batchSize, width, height).
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-      (output, input, batchSize, width, height);
+      output, input, batchSize, width, height);
  CHECK_SYNC("batchTranspose failed!");
 }
--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ b/paddle/cuda/src/hl_cuda_aggregate.cu
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ b/paddle/cuda/src/hl_cuda_sparse.cu
--- a/paddle/cuda/src/hl_perturbation_util.cu
+++ b/paddle/cuda/src/hl_perturbation_util.cu
@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cmath>
 #include <stdlib.h>
-#include "hl_cuda.h"
+#include <cmath>
 #include "hl_time.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
 #include "hl_time.h"
 #define _USE_MATH_DEFINES
@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
-__device__ void getTranformCoord(int x, int y, real theta, real scale,
+__device__ void getTranformCoord(int x,
-                                 real tgtCenter, real imgCenter,
+                                 int y,
-                                 real centerR, real centerC,
+                                 real theta,
-                                 int* sourceX, int* sourceY) {
+                                 real scale,
                                 real tgtCenter,
                                 real imgCenter,
                                 real centerR,
                                 real centerC,
                                 int* sourceX,
                                 int* sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
  // compute coordinates in the rotated and scaled image
@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */
-__global__ void kSamplingPatches(const real* imgs, real* targets,
+__global__ void kSamplingPatches(const real* imgs,
-                                 int imgSize, int tgtSize, const int channels,
+                                 real* targets,
-                                 int samplingRate, const real* thetas,
+                                 int imgSize,
-                                 const real* scales, const int* centerRs,
+                                 int tgtSize,
-                                 const int* centerCs, const real padValue,
+                                 const int channels,
                                 int samplingRate,
                                 const real* thetas,
                                 const real* scales,
                                 const int* centerRs,
                                 const int* centerCs,
                                 const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
    const int pxY = pxIdx / tgtSize;
    int srcPxX, srcPxY;
-    getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
+    getTranformCoord(pxX,
-                     imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
+                     pxY,
                     thetas[imgIdx],
                     scales[imgIdx],
                     tgtCenter,
                     imgCenter,
                     centerCs[caseIdx],
                     centerRs[caseIdx],
                     &srcPxX,
                     &srcPxY);
    imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
+void hl_generate_disturb_params(real*& gpuAngle,
-                                int*& gpuCenterR, int*& gpuCenterC,
+                                real*& gpuScaleRatio,
-                                int numImages, int imgSize, real rotateAngle,
+                                int*& gpuCenterR,
-                                real scaleRatio, int samplingRate,
+                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    for (int i = 0; i < numImages; i++) {
      r_angle[i] =
          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                          - 0.5);
+                                          -
                                          0.5);
      s_ratio[i] =
          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
    }
@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
        int pxY =
            (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
-        const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
+        const real H[4] = {cos(-r_angle[i]),
-                           sin(-r_angle[i]), cos(-r_angle[i])};
+                           -sin(-r_angle[i]),
                           sin(-r_angle[i]),
                           cos(-r_angle[i])};
        real x = pxX - imgCenter;
        real y = pxY - imgCenter;
        real xx = H[0] * x + H[1] * y;
@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
 }
-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
+void hl_conv_random_disturb_with_params(const real* images,
-                                        int tgtSize, int channels,
+                                        int imgSize,
-                                        int numImages, int samplingRate,
+                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
-  kSamplingPatches <<<numBlocks, threadsPerBlock>>>
+  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
-      (images, target, imgSize, tgtSize, channels, samplingRate,
+                                                   target,
-      gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
+                                                   imgSize,
-      paddingValue, numImages);
+                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);
  hl_device_synchronize();
 }
-void hl_conv_random_disturb(const real* images, int imgSize,
+void hl_conv_random_disturb(const real* images,
-                            int tgtSize, int channels, int numImages,
+                            int imgSize,
-                            real scaleRatio, real rotateAngle,
+                            int tgtSize,
-                            int samplingRate, real* gpu_r_angle,
+                            int channels,
-                            real* gpu_s_ratio, int* gpu_center_r,
+                            int numImages,
-                            int* gpu_center_c, int paddingValue,
+                            real scaleRatio,
-                            bool isTrain, real* targets) {
+                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
+  hl_generate_disturb_params(gpu_r_angle,
-                  gpu_center_c, numImages, imgSize, rotateAngle,
+                             gpu_s_ratio,
-                  scaleRatio, samplingRate, isTrain);
+                             gpu_center_r,
-
+                             gpu_center_c,
-  hl_conv_random_disturb_with_params(
+                             numImages,
-                  images, imgSize, tgtSize, channels, numImages,
+                             imgSize,
-                  samplingRate, gpu_r_angle, gpu_s_ratio,
+                             rotateAngle,
-                  gpu_center_r, gpu_center_r, paddingValue,
+                             scaleRatio,
-                  targets);
+                             samplingRate,
                             isTrain);
  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_r,
                                     paddingValue,
                                     targets);
 }
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "hl_base.h"
 #include "hl_device_functions.cuh"
 #include "hl_cuda.h"
 #include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"
-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output, int ldo,
+__global__ void KeMatrixAddRows(real* output,
-                                real* table, int ldt,
+                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
-      real *out = output + idy * ldo;
+      real* out = output + idy * ldo;
-      real *tab = table + tableId * ldt;
+      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
 }
-void hl_matrix_select_rows(real* output, int ldo,
+void hl_matrix_select_rows(real* output,
-                           real* table, int ldt,
+                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-    (output, ldo, table, ldt, ids, numSamples, tableSize, dim);
+      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
  CHECK_SYNC("hl_matrix_select_rows failed");
 }
-void hl_matrix_add_to_rows(real* table, int ldt,
+void hl_matrix_add_to_rows(real* table,
-                           real* input, int ldi,
+                           int ldt,
                           real* input,
                           int ldi,
                           int* ids,
                           int numSamples,
                           int tableSize,
@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-    (input, ldi, table, ldt, ids, numSamples, tableSize, dim);
+      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
  CHECK_SYNC("hl_matrix_add_to_rows failed");
 }
-template<class T, int blockDimX, int gridDimX>
+template <class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(T* dst, int sized,
+__global__ void KeVectorSelect(
-                               const T* src, int sizes,
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
                               const int* ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
 }
 template <class T>
-void hl_vector_select_from(T* dst, int sized,
+void hl_vector_select_from(
-                           const T* src, int sizes,
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
                           const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
+  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
-    (dst, sized, src, sizes, ids, sizei);
+      dst, sized, src, sizes, ids, sizei);
  CHECK_SYNC("hl_vector_select_from failed");
 }
-template
+template void hl_vector_select_from(real* dst,
-void hl_vector_select_from(real* dst, int sized,
+                                    int sized,
-                           const real* src, int sizes,
+                                    const real* src,
-                           const int* ids, int sizei);
+                                    int sizes,
-template
+                                    const int* ids,
-void hl_vector_select_from(int* dst, int sized,
+                                    int sizei);
-                           const int* src, int sizes,
+template void hl_vector_select_from(
-                           const int* ids, int sizei);
+    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
-proto_library(attr_type SRCS attr_type.proto)
+proto_library(attribute_proto SRCS attribute.proto)
-proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
+proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
-proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
+cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
 cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
@ -26,13 +28,19 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
-py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto a valid Python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
-cc_library(net SRCS net.cc DEPS op_registry)
+cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net)
 cc_library(backward SRCS backward.cc DEPS net)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
 	fc_op
 	sgd_op
 	add_op
 	mean_op
 	cross_entropy_op
 	recurrent_op)
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@ -0,0 +1,85 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include <vector>
 namespace paddle {
 namespace framework {
 // Explicit specializations of AttrTypeID<T>(): each one returns the AttrType
 // enumerator that corresponds to the C++ type T (scalar int/float/string and
 // their std::vector counterparts).
 template <>
 AttrType AttrTypeID<int>() {
  return INT;
 }
 template <>
 AttrType AttrTypeID<float>() {
  return FLOAT;
 }
 template <>
 AttrType AttrTypeID<std::string>() {
  return STRING;
 }
 template <>
 AttrType AttrTypeID<std::vector<int>>() {
  return INTS;
 }
 template <>
 AttrType AttrTypeID<std::vector<float>>() {
  return FLOATS;
 }
 template <>
 AttrType AttrTypeID<std::vector<std::string>>() {
  return STRINGS;
 }
 // Converts a protobuf AttrDesc into an Attribute, choosing the alternative
 // that matches attr_desc.type(). Scalar fields are returned directly;
 // repeated fields are copied into a std::vector of the matching element
 // type. A type value not handled below trips PADDLE_ENFORCE.
 Attribute GetAttrValue(const AttrDesc& attr_desc) {
  switch (attr_desc.type()) {
    case paddle::framework::AttrType::INT:
      return attr_desc.i();
    case paddle::framework::AttrType::FLOAT:
      return attr_desc.f();
    case paddle::framework::AttrType::STRING:
      return attr_desc.s();
    case paddle::framework::AttrType::INTS: {
      const auto& src = attr_desc.ints();
      return std::vector<int>(src.begin(), src.end());
    }
    case paddle::framework::AttrType::FLOATS: {
      const auto& src = attr_desc.floats();
      return std::vector<float>(src.begin(), src.end());
    }
    case paddle::framework::AttrType::STRINGS: {
      const auto& src = attr_desc.strings();
      return std::vector<std::string>(src.begin(), src.end());
    }
  }
  // Reached only when the type matches none of the cases above.
  PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
  return boost::blank();
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/attr_checker.h
+++ b/paddle/framework/attr_checker.h
@ -1,3 +1,17 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <boost/variant.hpp>
@ -6,6 +20,9 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/framework/attribute.pb.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/platform/enforce.h"
 namespace paddle {
@ -14,13 +31,19 @@ namespace framework {
 typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                       std::vector<float>, std::vector<std::string>>
    Attribute;
 typedef std::unordered_map<std::string, Attribute> AttributeMap;
 template <typename T>
 AttrType AttrTypeID();
 Attribute GetAttrValue(const AttrDesc& attr_desc);
 // Check whether a value (attribute) satisfies a certain limit
 template <typename T>
 class LargerThanChecker {
 public:
-  LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
  void operator()(T& value) const {
    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
  }
@ -35,7 +58,8 @@ class LargerThanChecker {
 template <typename T>
 class DefaultValueSetter {
 public:
-  DefaultValueSetter(T default_value) : default_value_(default_value) {}
+  explicit DefaultValueSetter(T default_value)
      : default_value_(default_value) {}
  void operator()(T& value) const { value = default_value_; }
 private:
@ -78,7 +102,8 @@ class TypedAttrChecker {
  typedef std::function<void(T&)> ValueChecker;
 public:
-  TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
+  explicit TypedAttrChecker(const std::string& attr_name)
      : attr_name_(attr_name) {}
  TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
    value_checkers_.push_back(EnumInContainer<T>(range));
--- a/paddle/framework/attribute.proto
+++ b/paddle/framework/attribute.proto
@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;
 // Attribute Type for paddle's Op.
 // An Op has many attributes; each attribute may be of a different type.
 // The AttrType will be shared between AttrDesc and AttrProto.
 enum AttrType {
-    INT = 0;
+  INT = 0;
-    FLOAT = 1;
+  FLOAT = 1;
-    STRING = 2;
+  STRING = 2;
-    INTS = 3;
+  INTS = 3;
-    FLOATS = 4;
+  FLOATS = 4;
-    STRINGS = 5;
+  STRINGS = 5;
 }
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -14,8 +14,8 @@
 #include "paddle/framework/backward.h"
 #include <list>
 #include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 namespace paddle {
 namespace framework {
@ -32,7 +32,7 @@ static bool AllInSet(const std::vector<std::string>& names,
 }
 static std::shared_ptr<OperatorBase> NOP() {
-  auto net_op = std::make_shared<NetOp>();
+  auto net_op = std::make_shared<operators::NetOp>();
  net_op->type_ = "@NOP@";
  net_op->CompleteAddOp();
  return net_op;
@ -42,9 +42,9 @@ static std::shared_ptr<OperatorBase> NOP() {
 //
 //  no_grad_names the names of gradient variables that need not be computed.
 //
-//  uniq_id is a unique index used inside recursively calling BackwardRecursive.
+//  uniq_id is a unique index used inside recursively calling
-//  use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through
+//  BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
-//  recursive calling.
+//  pass `uniq_id` through recursive calling.
 //
 //  returns The backward operator. In the simple case it is a single
 //  operator; in the complex case it is a NetOp.
@ -59,32 +59,30 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
  //  If none of the input gradients of the forward operator need to be
  //  calculated, just return a NOP. A null pointer is not returned because a
  //  NOP costs very little to run and it simplifies the logic.
-  if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+  if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
               no_grad_names)) {
    return NOP();
  }
-  //  All output gradients of forwarding operator do not need to calculate. Then
+  //  All output gradients of forwarding operator do not need to calculate.
-  //  all input gradients cannot be computed at all, and we put them into
+  //  Then all input gradients cannot be computed at all, and we put them into
  //  `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+  if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
               no_grad_names)) {
    for (auto& name : forwardOp.inputs_) {
      // Mark every input gradient as not needed.
-      no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+      no_grad_names.insert(name + kGradVarSuffix);
    }
    return NOP();
  }
  // Returned gradient network
-  auto net = std::make_shared<NetOp>();
+  auto net = std::make_shared<operators::NetOp>();
  if (forwardOp.IsNetOp()) {
    // forwardOp is a net op, so it can be static_cast to NetOp.
-    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
-    // Map from output gradient variable name to operator's indices in backward
+    // Map from output gradient variable name to operator's indices in
-    // net. That operator generates that variable.
+    // backward net. That operator generates that variable.
    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
    size_t local_op_id = 0;
@ -134,9 +132,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
    for (std::string& grad_input : grad_op->inputs_) {
      if (no_grad_names.count(grad_input)) {
-        std::string prefix = grad_input.substr(
+        std::string prefix =
-            0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size());
+            grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
-        grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX();
+        grad_input = prefix + kZeroVarSuffix;
        // If some of the input gradients of that operator are not calculated,
        // substitute zero variables for those input gradients.
@ -147,7 +145,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    for (std::string& grad_output : grad_op->outputs_) {
      if (no_grad_names.count(grad_output)) {
-        grad_output = OperatorBase::EMPTY_VAR_NAME();
+        grad_output = kEmptyVarName;
      }
    }
@ -168,14 +166,14 @@ std::shared_ptr<OperatorBase> Backward(
  std::unordered_set<std::string> no_grad_names;
  no_grad_names.reserve(no_grad_vars.size());
-  no_grad_names.insert(OperatorBase::EMPTY_VAR_NAME() +
+  no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
                       OperatorBase::GRAD_VAR_SUFFIX());
  for (auto& name : no_grad_vars) {
-    no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+    no_grad_names.insert(name + kGradVarSuffix);
  }
  size_t uid = 0;
  return BackwardRecursive(forwardOp, no_grad_names, uid);
 }
 }  // namespace framework
 }  // namespace paddle
--- a/Show More
+++ b/Show More