From ccea3b026ed1741ab352574eb99a26386452e239 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Sun, 20 Nov 2016 15:35:40 +0800
Subject: [PATCH 1/9] Add style check for *.cc files in cuda directory

---
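Note (editorial, not part of this commit): the hl_matrix_inverse hunks below only
re-wrap existing cuBLAS calls. For readers unfamiliar with the flow, the function
inverts a GPU-resident matrix in two steps: LU factorization via
cublasSgetrfBatched, then inversion from the factors via cublasSgetriBatched
(wrapped here by the CUBLAS_GETRF/CUBLAS_GETRI macros). A minimal standalone
sketch of that flow follows; the buffer names and error handling are
illustrative assumptions, not Paddle's wrappers.

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    // Invert one n x n float matrix already on the GPU (d_A), writing the
    // result to d_C.  Mirrors the two-step flow in hl_matrix_inverse.
    bool invert_on_gpu(cublasHandle_t handle, float* d_A, float* d_C, int n) {
      // The *Batched APIs take device arrays of matrix pointers; batch size 1.
      float **d_Aarray, **d_Carray;
      int *d_pivot, *d_info;
      cudaMalloc(&d_Aarray, sizeof(float*));
      cudaMalloc(&d_Carray, sizeof(float*));
      cudaMalloc(&d_pivot, n * sizeof(int));
      cudaMalloc(&d_info, sizeof(int));
      cudaMemcpy(d_Aarray, &d_A, sizeof(float*), cudaMemcpyHostToDevice);
      cudaMemcpy(d_Carray, &d_C, sizeof(float*), cudaMemcpyHostToDevice);

      int info = 0;
      // Step 1: in-place LU factorization of d_A with partial pivoting.
      cublasSgetrfBatched(handle, n, d_Aarray, n, d_pivot, d_info, 1);
      cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
      bool ok = (info == 0);  // info != 0 means the matrix is singular

      if (ok) {
        // Step 2: compute the inverse from the LU factors into d_C.
        cublasSgetriBatched(handle, n, (const float**)d_Aarray, n, d_pivot,
                            d_Carray, n, d_info, 1);
        cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
        ok = (info == 0);
      }

      cudaFree(d_Aarray); cudaFree(d_Carray); cudaFree(d_pivot); cudaFree(d_info);
      return ok;
    }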
 paddle/cuda/CMakeLists.txt        |   7 +-
 paddle/cuda/src/hl_cuda_cublas.cc |  15 ++--
 paddle/cuda/src/hl_cuda_cudnn.cc  | 143 +++++++++++++-----------------
 paddle/cuda/src/hl_cuda_device.cc |  12 +--
 paddle/cuda/src/hl_cudart_wrap.cc |  26 ++----
 paddle/cuda/src/hl_dso_loader.cc  |  41 ++++-----
 6 files changed, 112 insertions(+), 132 deletions(-)

diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index cdb730bb3c..11dbfb54b2 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -81,5 +81,8 @@ else()
     add_library(paddle_cuda ${CUDA_SOURCES})
 endif()
 
-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+                       ${CUDA_SOURCES}
+                       ${CUDA_HEADERS}
+                       ${CUDA_DSO_SOURCES}
+                       ${CUDA_CXX_WITH_GPU_SOURCES})

diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index f16376ec93..abf6afadc2 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -104,7 +104,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 #endif
 
 const char* hl_cublas_get_error_string(cublasStatus_t status) {
-  switch(status) {
+  switch (status) {
     case CUBLAS_STATUS_NOT_INITIALIZED:
       return "[cublas status]: not initialized";
     case CUBLAS_STATUS_ALLOC_FAILED:
@@ -181,7 +181,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   real **inout_d = (real **)hl_malloc_device(sizeof(real *));
   hl_memcpy(inout_d, inout_h, sizeof(real *));
 
-  int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
+  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
   int *info_d = (int *)t_resource.gpu_mem;
 
   /* Note: cublasSgetrfBatched is used to calculate a number of
            small-sized matrices. the API for better performance.
   */
   CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
-                            dimN, inout_d, lda, pivot_d,
-                            info_d, 1));
+                            dimN, inout_d, lda, pivot_d, info_d, 1));
 
-  int   info_h;
+  int info_h;
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
     LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
@@ -204,8 +203,8 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_memcpy(out_d, out_h, sizeof(real *));
 
   CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-      dimN, (const real **)inout_d, lda, pivot_d,
-      out_d, ldc, info_d, 1));
+                            dimN, (const real **)inout_d, lda, pivot_d,
+                            out_d, ldc, info_d, 1));
 
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
@@ -215,7 +214,7 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_free_mem_device(inout_d);
   hl_free_mem_device(pivot_d);
   hl_free_mem_device(out_d);
-  
+
   CHECK_SYNC("hl_matrix_inverse failed");
 }
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 92b28e4345..1829fe23ac 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -159,13 +159,11 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
 bool g_is_libcudnn_init = false;
 int g_cudnn_lib_version = 0;
 
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc)
-{
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
   CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
 }
 
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream)
-{
+void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) {
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   size_t cudnn_dso_major = cudnn_dso_ver / 1000;
   size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
@@ -212,13 +210,18 @@ void hl_conv_workspace(hl_tensor_descriptor input,
   CHECK_NOTNULL(conv);
 
   // Specify workspace limit directly
-  size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+  size_t memoryLimitBytes =
+      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
 
   // cudnn convolution forward configuration
-  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnTensorDescriptor_t fwd_src_desc =
+      GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t fwd_dest_desc =
+      GET_TENSOR_DESCRIPTOR(output);
+  cudnnFilterDescriptor_t fwd_filter_desc =
+      GET_FILTER_DESCRIPTOR(filter);
+  cudnnConvolutionDescriptor_t fwd_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
       t_resource.cudnn_handle,
@@ -250,23 +253,23 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       GET_CONVOLUTION_DESCRIPTOR(conv);
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-          t_resource.cudnn_handle,
-          bwd_data_filter_desc,
-          bwd_data_diff_desc,
-          bwd_data_conv_desc,
-          bwd_data_grad_desc,
-          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-          memoryLimitBytes,
-          reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-          t_resource.cudnn_handle,
-          bwd_data_filter_desc,
-          bwd_data_diff_desc,
-          bwd_data_conv_desc,
-          bwd_data_grad_desc,
-          static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-          bwdDataLimitBytes));
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+      bwdDataLimitBytes));
 
   // cudnn convolution backward filter configuration
   cudnnTensorDescriptor_t bwd_filter_src_desc =
@@ -279,21 +282,21 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       GET_FILTER_DESCRIPTOR(filter);
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-          t_resource.cudnn_handle,
-          bwd_filter_src_desc,
-          bwd_filter_diff_desc,
-          bwd_filter_conv_desc,
-          bwd_filter_grad_desc,
-          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-          memoryLimitBytes,
-          reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+      t_resource.cudnn_handle,
+      bwd_filter_src_desc,
+      bwd_filter_diff_desc,
+      bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-          t_resource.cudnn_handle, bwd_filter_src_desc,
-          bwd_filter_diff_desc, bwd_filter_conv_desc,
-          bwd_filter_grad_desc,
-          static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-          bwdFilterLimitBytes));
+      t_resource.cudnn_handle, bwd_filter_src_desc,
+      bwd_filter_diff_desc, bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+      bwdFilterLimitBytes));
 
 #endif
 }
@@ -302,8 +305,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
                                  int batch_size,
                                  int feature_maps,
                                  int height,
-                                 int width)
-{
+                                 int width) {
   CHECK_NOTNULL(image_desc);
 
   cudnn_tensor_descriptor hl_desc =
@@ -359,8 +361,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int batch_size,
                        int feature_maps,
                        int height,
-                       int width)
-{
+                       int width) {
   const int stride_w = 1;
   const int stride_h = width * stride_w;
   const int stride_c = height * stride_h;
@@ -384,8 +385,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int nStride,
                        int cStride,
                        int hStride,
-                       int wStride)
-{
+                       int wStride) {
   CHECK_NOTNULL(image_desc);
 
   cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
@@ -408,8 +408,7 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
   hl_desc->width = width;
 }
 
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
-{
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
   CHECK_NOTNULL(image_desc);
 
   cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
@@ -430,11 +429,9 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
                                   int height_padding,
                                   int width_padding,
                                   int stride_height,
-                                  int stride_width)
-{
+                                  int stride_width) {
   cudnnPoolingMode_t cudnn_mode;
-  switch (mode)
-  {
+  switch (mode) {
     case HL_POOLING_MAX:
       cudnn_mode = CUDNN_POOLING_MAX;
       break;
@@ -478,13 +475,13 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
   *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
 }
 
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
-{
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
   CHECK_NOTNULL(pooling_desc);
 
-  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
-  CHECK_NOTNULL(hl_pooling->desc);
+  cudnn_pooling_descriptor hl_pooling =
+      (cudnn_pooling_descriptor)pooling_desc;
 
+  CHECK_NOTNULL(hl_pooling->desc);
   CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
 
   hl_pooling->desc = NULL;
@@ -496,8 +493,7 @@ void hl_pooling_forward(hl_tensor_descriptor input,
                         real* input_image,
                         hl_tensor_descriptor output,
                         real* output_image,
-                        hl_pooling_descriptor pooling)
-{
+                        hl_pooling_descriptor pooling) {
   cudnnPoolingDescriptor_t pooling_desc;
   cudnnTensorDescriptor_t input_desc;
   cudnnTensorDescriptor_t output_desc;
@@ -531,8 +527,7 @@ void hl_pooling_backward(hl_tensor_descriptor input,
                          hl_tensor_descriptor output,
                          real* output_image,
                          real* output_image_grad,
-                         hl_pooling_descriptor pooling)
-{
+                         hl_pooling_descriptor pooling) {
   cudnnPoolingDescriptor_t pooling_desc;
   cudnnTensorDescriptor_t input_desc;
   cudnnTensorDescriptor_t output_desc;
@@ -571,8 +566,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
                                  int input_feature_maps,
                                  int output_feature_maps,
                                  int height,
-                                 int width)
-{
+                                 int width) {
   CHECK_NOTNULL(filter);
 
   cudnn_filter_descriptor hl_filter =
@@ -607,8 +601,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
 }
 
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
-{
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
   CHECK_NOTNULL(filter);
 
   cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
@@ -627,14 +620,13 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_height,
                                       int padding_width,
                                       int stride_height,
-                                      int stride_width)
-{
+                                      int stride_width) {
   CHECK_NOTNULL(conv);
 
-  cudnn_convolution_descriptor hl_conv =
-      (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
-  CHECK_NOTNULL(hl_conv);
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)
+      malloc(sizeof(_cudnn_convolution_descriptor));
 
+  CHECK_NOTNULL(hl_conv);
   CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
 
   cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
@@ -667,8 +659,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                      int padding_height,
                                      int padding_width,
                                      int stride_height,
-                                     int stride_width)
-{
+                                     int stride_width) {
   CHECK_NOTNULL(conv);
   CHECK_NOTNULL(image);
   CHECK_NOTNULL(filter);
@@ -697,8 +688,7 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
   hl_conv->mode = mode;
 }
 
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
-{
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
   CHECK_NOTNULL(conv);
 
   cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
@@ -753,8 +743,7 @@ void hl_convolution_forward(hl_tensor_descriptor input,
 void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
                                      real* bias_data,
                                      hl_tensor_descriptor output,
-                                     real* output_data)
-{
+                                     real* output_data) {
   CHECK_NOTNULL(bias);
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(bias_data);
@@ -782,8 +771,7 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
 void hl_convolution_backward_bias(hl_tensor_descriptor bias,
                                   real* bias_grad_data,
                                   hl_tensor_descriptor output,
-                                  real* output_grad_data)
-{
+                                  real* output_grad_data) {
   CHECK_NOTNULL(bias);
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(bias_grad_data);
@@ -814,7 +802,6 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
                                     void* gpuWorkSpace,
                                     size_t sizeInBytes,
                                     int convBwdFilterAlgo) {
-
   CHECK_NOTNULL(input);
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(filter);
@@ -889,8 +876,7 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
 void hl_softmax_forward(real *input,
                         real *output,
                         int height,
-                        int width)
-{
+                        int width) {
 #ifndef PADDLE_TYPE_DOUBLE
   cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
@@ -923,8 +909,7 @@
 void hl_softmax_backward(real *output_value,
                          real *output_grad,
                          int height,
-                         int width)
-{
+                         int width) {
 #ifndef PADDLE_TYPE_DOUBLE
   cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index 3ea2c91bd5..ca19f210c5 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -203,8 +203,8 @@ inline pid_t gettid() {
 #endif
   pid_t tid = syscall(__NR_gettid);
 #endif
-  CHECK_NE(tid, -1);
-  return tid;
+  CHECK_NE((int)tid, -1);
+  return tid;
 }
 
 void hl_init(int device) {
@@ -355,7 +355,8 @@ void* hl_malloc_host(size_t size) {
   void *dest_h;
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(dynload::cudaHostAlloc(
+      (void**)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -364,7 +365,7 @@ void hl_free_mem_host(void *dest_h) {
   CHECK_NOTNULL(dest_h);
 
   cudaError_t err = dynload::cudaFreeHost(dest_h);
-  CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err)
+  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
 
@@ -502,7 +503,8 @@ int hl_get_cuda_version() {
   return g_cuda_lib_version;
 }
 
-void hl_create_thread_resources(int device, thread_device_resources device_res) {
+void hl_create_thread_resources(int device,
+                                thread_device_resources device_res) {
   CHECK_CUDA(dynload::cudaSetDevice(device));
 
   /* create thread stream */

diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index 27bbd03bc3..fe755b8c26 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -78,48 +78,38 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
                                                 dim3 blockDim,
                                                 void **args,
                                                 size_t sharedMem,
-                                                cudaStream_t stream)
-{
-  return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
+                                                cudaStream_t stream) {
+  return dynload::cudaLaunchKernel(func, gridDim, blockDim,
+                                   args, sharedMem, stream);
 }
 #endif /* CUDART_VERSION >= 7000 */
 
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
-{
+__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
   return dynload::cudaLaunch(func);
 }
 
 __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
                                                  size_t size,
-                                                 size_t offset)
-{
+                                                 size_t offset) {
   return dynload::cudaSetupArgument(arg, size, offset);
 }
 
 __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
                                                  dim3 blockDim,
                                                  size_t sharedMem,
-                                                 cudaStream_t stream)
-{
+                                                 cudaStream_t stream) {
   return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
 }
 
 extern "C" {
 
-void** CUDARTAPI __cudaRegisterFatBinary(
-    void *fatCubin
-)
-{
+void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
   return dynload::__cudaRegisterFatBinary(fatCubin);
-
 }
 
-void CUDARTAPI __cudaUnregisterFatBinary(
-    void **fatCubinHandle
-)
-{
+void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
   return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
 }
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index b564b96903..5cb16cfbb3 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -19,17 +19,18 @@ limitations under the License. */
 
 P_DEFINE_string(cudnn_dir, "",
                 "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib64. If empty [default], dlopen will search "
-                "cudnn from LD_LIBRARY_PATH");
+                "/usr/local/cudnn/lib64. If empty [default], dlopen "
+                "will search cudnn from LD_LIBRARY_PATH");
 
 P_DEFINE_string(cuda_dir, "",
                 "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. "
-                "(Note: libcudart can not be specified by cuda_dir, since some "
+                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+                "libcudart can not be specified by cuda_dir, since some "
                 "build-in function in cudart already ran before main entry). "
-                "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
+                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
 
-static inline std::string join(const std::string& part1, const std::string& part2) {
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
   // directory separator
   const char sep = '/';
 
@@ -49,10 +50,10 @@ static inline std::string join(const std::string& part1,
 static inline void GetDsoHandleFromDefaultPath(
     std::string& dso_path, void** dso_handle, int dynload_flags) {
   VLOG(3) << "Try to find cuda library: " << dso_path
-      << " from default system path.";
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH 
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-  
+
   // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
   // bring System Integrity Projection (SIP), if dso_handle
   // is null, search from default package path in Mac OS.
@@ -62,13 +63,13 @@ static inline void GetDsoHandleFromDefaultPath(
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
     if (nullptr == *dso_handle) {
       if (dso_path == "libcudnn.dylib") {
-        LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
-            << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
-            << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
+        LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
+            << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
+            << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "  // NOLINT
             << "/usr/local/cuda/lib/libcudnn*";
       }
-    }
-  }
+    }
+  }
 #endif
 }
 
@@ -96,19 +97,19 @@ static inline void GetDsoHandleFromSearchPath(
   CHECK(nullptr != *dso_handle)
     << "Failed to find cuda library: " << dlPath << std::endl
-    << "Please specify its path correctly using one of the following ideas: \n"
+    << "Please specify its path correctly using one of the following ways: \n"  // NOLINT
 
-    << "Idea 1. set cuda and cudnn lib path at runtime. "
-    << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
+    << "Method 1. set cuda and cudnn lib path at runtime. "
+    << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"  // NOLINT
     << "For instance, issue command: paddle train --use_gpu=1 "
-    << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
+    << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n"  // NOLINT
 
-    << "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
+    << "Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
    << "DYLD_LIBRARY_PATH on Mac OS. \n"
    << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
    << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
-    << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
+    << "unless System Integrity Protection (SIP) is disabled. However, method 1 "  // NOLINT
    << "always work well.";
 }
From 201c0ed8287643a106d63fef60d1ab3fb4e80e45 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 17 Nov 2016 23:52:45 +0800
Subject: [PATCH 2/9] Add new badges for docs and stats

---
 README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e8679fb55f..962b08853b 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,16 @@
 
 [![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
+[![Downloads](https://img.shields.io/github/downloads/baidu/Paddle/total.svg)](https://github.com/baidu/Paddle/releases)
 [![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
+
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)]()
+[![Join the chat at](https://img.shields.io/gitter/room/PaddlePaddle/Deep_Learning.svg)](https://gitter.im/PaddlePaddle/Deep_Learning)
+
+[![Release](https://img.shields.io/github/release/baidu/Paddle.svg)](https://github.com/baidu/Paddle/releases)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
 
 Welcome to the PaddlePaddle GitHub.

From c9b7e0efd04c474f5ca8178647d26ca29b98a59b Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 17 Nov 2016 23:59:02 +0800
Subject: [PATCH 3/9] Add Chinese docs link in badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 962b08853b..a14e1b82d1 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 [![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
 
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)]()
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
 [![Join the chat at](https://img.shields.io/gitter/room/PaddlePaddle/Deep_Learning.svg)](https://gitter.im/PaddlePaddle/Deep_Learning)
 
 [![Release](https://img.shields.io/github/release/baidu/Paddle.svg)](https://github.com/baidu/Paddle/releases)
From dab2ddbb610f862b487b95642c442d3d9bd169d4 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Mon, 21 Nov 2016 14:51:18 +0800
Subject: [PATCH 4/9] Revise badge status

* Because baidu/Paddle was transferred to PaddlePaddle/Paddle

---
 README.md | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index a14e1b82d1..4060096559 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,15 @@
 # PaddlePaddle
 
-[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
-[![Downloads](https://img.shields.io/github/downloads/baidu/Paddle/total.svg)](https://github.com/baidu/Paddle/releases)
-[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
-
+[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/baidu/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
-[![Join the chat at](https://img.shields.io/gitter/room/PaddlePaddle/Deep_Learning.svg)](https://gitter.im/PaddlePaddle/Deep_Learning)
-
-[![Release](https://img.shields.io/github/release/baidu/Paddle.svg)](https://github.com/baidu/Paddle/releases)
+[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
+[![Release](https://img.shields.io/github/release/baidu/Paddle.svg?colorB=fedcba)](https://github.com/baidu/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
 
 Welcome to the PaddlePaddle GitHub.
 
-
 PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
 efficient, flexible and scalable deep learning platform, which is originally
 developed by Baidu scientists and engineers for the purpose of applying deep

From a4d18146680fe9cd95955fe9e85e3f61fdcbaf9e Mon Sep 17 00:00:00 2001
From: liaogang
Date: Mon, 21 Nov 2016 14:54:24 +0800
Subject: [PATCH 5/9] Revise README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4060096559..bd47ed44bc 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
 
 Welcome to the PaddlePaddle GitHub.
 
+
 PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
 efficient, flexible and scalable deep learning platform, which is originally
 developed by Baidu scientists and engineers for the purpose of applying deep

From d42fbed02daf63d29be3b6383f017a6f4e7c7c9e Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Sun, 20 Nov 2016 22:12:32 +0800
Subject: [PATCH 6/9] Fix several cpp issues

* Comparisons between different types.
* ostream << should pass a const object.
* Remove always-true checks.

---
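Note (editorial, not part of this commit): the recurring change below is
const-correctness. printStats() becomes a const member so that operator<< can
take the evaluator by const reference, and BarrierStatBase's lock_ becomes
mutable so a const showAbstract() can still acquire it without the old
const_cast. A minimal self-contained sketch of the pattern (class and member
names here are illustrative, not Paddle's):

    #include <iostream>
    #include <mutex>

    class Stat {
    public:
      void add(double v) {
        std::lock_guard<std::mutex> guard(lock_);
        total_ += v;
        ++count_;
      }

      // const: printing must not modify the object, so it can be called
      // through a const reference (e.g. from operator<<).
      void printStats(std::ostream& os) const {
        // mutable lets a const method lock the mutex: locking changes the
        // mutex's state, but not the logical state of the Stat.
        std::lock_guard<std::mutex> guard(lock_);
        os << "avg=" << (count_ ? total_ / count_ : 0);
      }

      friend std::ostream& operator<<(std::ostream& os, const Stat& stat) {
        stat.printStats(os);  // would not compile if printStats were non-const
        return os;
      }

    private:
      mutable std::mutex lock_;
      double total_ = 0;
      int count_ = 0;
    };

    int main() {
      Stat s;
      s.add(1.0);
      s.add(2.0);
      std::cout << s << "\n";  // prints avg=1.5
      return 0;
    }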
 .../gserver/evaluators/CTCErrorEvaluator.cpp  |  2 +-
 paddle/gserver/evaluators/ChunkEvaluator.cpp  |  2 +-
 paddle/gserver/evaluators/Evaluator.cpp       |  6 +++---
 paddle/gserver/evaluators/Evaluator.h         | 16 +++++++--------
 .../gserver/gradientmachines/MultiNetwork.cpp |  2 +-
 .../gradientmachines/NeuralNetwork.cpp        |  2 +-
 paddle/math/BaseMatrix.cu                     | 20 +++++++++----------
 paddle/math/Vector.cpp                        |  6 +++---
 paddle/pserver/ParameterServer2.cpp           |  2 --
 paddle/utils/BarrierStat.cpp                  | 10 +++++-----
 paddle/utils/BarrierStat.h                    | 11 +++++-----
 paddle/utils/CompilerMacros.h                 | 17 ++++++++++++++++
 paddle/utils/Logging.cpp                      |  4 ++--
 paddle/utils/Logging.h                        |  3 ++-
 14 files changed, 60 insertions(+), 43 deletions(-)
 create mode 100644 paddle/utils/CompilerMacros.h

diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index e397c71c87..c2625bce9a 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -240,7 +240,7 @@ public:
     seqClassficationError_ = 0;
   }
 
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     os << config_.name() << "="
        << (numSequences_ ? totalScore_ / numSequences_ : 0);
     os << " deletions error"

diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 22579891f3..6f5d2b47c3 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -114,7 +114,7 @@ public:
     numCorrect_ = 0;
   }
 
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     double precision = (double)numCorrect_ / numOutputSegments_;
     double recall = (double)numCorrect_ / numLabelSegments_;
     double f1 =

diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index 7bdcdaae53..d43dceea74 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -315,7 +315,7 @@ public:
     return 0;
   }
 
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0)
         << "column index [" << colIdx_ << "] out of range [-" << colNum_
         << ", " << colNum_ << ")";
@@ -421,7 +421,7 @@ void AucEvaluator::distributeEval(ParameterClient2* client) {
   client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
 }
 
-double AucEvaluator::calcAuc() {
+double AucEvaluator::calcAuc() const {
   double totPos = 0.0;
   double totNeg = 0.0;
   double totPosPrev = 0.0;
@@ -584,7 +584,7 @@ real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
   return 0;
 }
 
-void PrecisionRecallEvaluator::printStats(std::ostream& os) {
+void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
   int label = config_.positive_label();
   if (label != -1) {
     CHECK(label >= 0 && label < (int)statsInfo_.size())
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index b79a539384..e9957a5ce2 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -99,19 +99,19 @@ public:
    * @brief print the statistics of evaluate result
    * @note finish() should be called before printStats
    */
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     os << config_.name() << "="
        << (numSamples_ ? totalScore_ / numSamples_ : 0);
   }
 
   friend std::ostream& operator<<(std::ostream& os,
-                                  Evaluator& evaluator) {
+                                  const Evaluator& evaluator) {
     evaluator.printStats(os);
     return os;
   }
 
   friend std::ostream&& operator<<(std::ostream&& os,  // NOLINT
-                                   Evaluator& evaluator) {
+                                   const Evaluator& evaluator) {
     evaluator.printStats(os);
     return std::move(os);
   }
@@ -135,7 +135,7 @@ public:
     return -1;
   }
   virtual void finish() {}
-  virtual void printStats(std::ostream&) {}
+  virtual void printStats(std::ostream&) const {}
 };
 /**
  * @brief evaluate AUC using colIdx-th column as prediction.
@@ -165,7 +165,7 @@ public:
 
   virtual real evalImp(std::vector<Argument>& arguments);
 
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     os << config_.name() << "=" << calcAuc();
   }
 
@@ -189,7 +189,7 @@ private:
     return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
   }
 
-  double calcAuc();
+  double calcAuc() const;
 };
 
 /**
@@ -244,7 +244,7 @@ public:
 
   virtual real evalImp(std::vector<Argument>& arguments);
 
-  virtual void printStats(std::ostream& os);
+  virtual void printStats(std::ostream& os) const;
 
   virtual void distributeEval(ParameterClient2* client);
 
@@ -339,7 +339,7 @@ public:
 
   virtual void finish() { calc(predictArray_); }
 
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     os << " pos/neg"
        << "=" << pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
   }

diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
index d30ca6f28e..b85d2e0c99 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -154,7 +154,7 @@ public:
     return -1;
   }
 
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     for (auto& evaluator : evaluators_) {
       evaluator->printStats(os);
       os << ' ';

diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 3127b4dd9a..c77b00eb06 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -325,7 +325,7 @@ public:
     (void)arguments;
     return -1;
   }
-  virtual void printStats(std::ostream& os) {
+  virtual void printStats(std::ostream& os) const {
     for (auto& evaluator : evaluators_) {
       evaluator->printStats(os);
       os << ' ';
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 54448bdb5a..2afb216db5 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1449,8 +1449,8 @@ template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
+  auto numRows = b.height_;
+  auto numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
   aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
@@ -1463,8 +1463,8 @@ template<>
 template <class Agg, class Saver>
 int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
+  auto numRows = b.height_;
+  auto numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
   aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
@@ -1493,8 +1493,8 @@ template <class Agg, class Op, class Saver>
 int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
                                 BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
+  auto numRows = b.height_;
+  auto numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
   CHECK_EQ(c.height_, numRows);
@@ -1524,8 +1524,8 @@ template<>
 template <class Agg>
 int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
+  auto numRows = b.height_;
+  auto numCols = b.width_;
   CHECK_EQ(width_, numCols);
   CHECK_EQ(height_, 1UL);
   aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
@@ -1538,8 +1538,8 @@ template<>
 template <class Agg, class Saver>
 int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
+  auto numRows = b.height_;
+  auto numCols = b.width_;
   CHECK_EQ(width_, numCols);
   CHECK_EQ(height_, 1UL);
   aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,

diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index 23c9caccea..9ef7f2b4b5 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -82,8 +82,8 @@ MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
 
 template <>
 MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  int height = getSize();
-  int width = idRange;
+  auto height = getSize();
+  auto width = idRange;
   MatrixPtr mat = Matrix::createSparseMatrix(
       height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);
 
@@ -91,7 +91,7 @@ MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
   cpuIds.copyFrom(*this);
   int *idData = cpuIds.getData();
 
-  for (int i = 0; i < height; i ++) {
+  for (decltype(height) i = 0; i < height; i ++) {
     const unsigned int id = idData[i];
     CHECK_LT(id, width);
     mat->setRow(i, 1, &id, nullptr);

diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index c8f37d0bf4..960fca2853 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -1469,7 +1469,6 @@ void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request,
 
 void ParameterServer2::synchronize(const SynchronizeRequest& request,
                                    ProtoResponseCallback callback) {
-  CHECK_LT(request.sync_object_id(), SyncObject_ARRAYSIZE);
   synchronizeBarriers_[request.sync_object_id()]->wait();
   dataSize_ = 0;
   callback(SynchronizeResponse());
@@ -1477,7 +1476,6 @@ void ParameterServer2::synchronize(const SynchronizeRequest& request,
 
 void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
                                        ProtoResponseCallback callback) {
-  CHECK_LT(request.sync_object_id(), SyncObject_ARRAYSIZE);
   synchronizeBarriers_[request.sync_object_id()]->wait();
   callback(SynchronizeResponse());
diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp
index cbc738a839..f083ef3982 100644
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
@@ -29,10 +29,10 @@ P_DEFINE_bool(log_barrier_show_log, false,  // for performance tuning insight
 
 namespace paddle {
 
-std::ostream &operator<<(std::ostream &output, BarrierStatBase &stat) {
+std::ostream &operator<<(std::ostream &output,
+                         const BarrierStatBase &stat) {
   if (FLAGS_log_barrier_abstract) {
-    std::lock_guard<std::mutex> guard(
-        const_cast<BarrierStatBase&>(stat).lock_);
+    std::lock_guard<std::mutex> guard(stat.lock_);
     stat.showAbstract(output);
   }
   return output;
 }
@@ -136,7 +136,7 @@ void BarrierEndStat::reset(bool clearRawData) {
   totAbstract_.minDelta = UINT64_MAX;
 }
 
-void BarrierEndStat::showAbstract(std::ostream &output) {
+void BarrierEndStat::showAbstract(std::ostream &output) const {
   // do not support the case "<=2 pserver"
   if (numConnThreads_ <= 2 || !totSamples_) {
     return;
@@ -272,7 +272,7 @@ void BarrierDeltaStat::reset(bool clearRawData) {
   totAbstract_.minDelta = UINT64_MAX;
 }
 
-void BarrierDeltaStat::showAbstract(std::ostream &output) {
+void BarrierDeltaStat::showAbstract(std::ostream &output) const {
   // do not support the case "<=2 pserver"
   if (numConnThreads_ <= 2 || !totSamples_) {
     return;

diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h
index 22d6cc9bce..add1093758 100644
--- a/paddle/utils/BarrierStat.h
+++ b/paddle/utils/BarrierStat.h
@@ -218,11 +218,12 @@ public:
   }
 
 protected:
-  virtual void showAbstract(std::ostream &output) {}
-  friend std::ostream &operator<<(std::ostream &output, BarrierStatBase &stat);
+  virtual void showAbstract(std::ostream &output) const {}
+  friend std::ostream &operator<<(std::ostream &output,
+                                  const BarrierStatBase &stat);
 
 protected:
-  std::mutex lock_;
+  mutable std::mutex lock_;
   std::mutex abstractLock_;  // see note on updaterStat
   // each freqency for each barrier trainer
   std::vector abstract_;
@@ -262,7 +263,7 @@ protected:
    * log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold
    * control details.
    */
-  virtual void showAbstract(std::ostream &output);
+  virtual void showAbstract(std::ostream &output) const;
 
 private:
   std::unique_ptr timeVector_;
@@ -286,7 +287,7 @@ public:
   virtual bool checkPassBarrier() { return timeVector_->empty(); }
 
 protected:
-  virtual void showAbstract(std::ostream &outPut);
+  virtual void showAbstract(std::ostream &outPut) const;
 
 private:
   // store delta time in uint64_t, eg BP time of all trainers

diff --git a/paddle/utils/CompilerMacros.h b/paddle/utils/CompilerMacros.h
new file mode 100644
index 0000000000..4236d750c4
--- /dev/null
+++ b/paddle/utils/CompilerMacros.h
@@ -0,0 +1,17 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#define ATTR_NORETURN __attribute__((noreturn))

diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp
index a0644940b5..9a6b1f2d83 100644
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
@@ -134,7 +134,7 @@ static void initializeLogFds(char* argv0) {
   gLogInited = true;
 }
 
-static void (*gFailureFunctionPtr)() __attribute__((noreturn)) = abort;
+static void (*gFailureFunctionPtr)() ATTR_NORETURN = abort;
 
 LogMessage::LogMessage(const char* fname, int line, int severity)
     : fname_(fname), line_(line), severity_(severity) {}
@@ -171,7 +171,7 @@ void setMinLogLevel(int level) {
   paddle::internal::gMinLogLevel = level;
 }
 
-void installFailureFunction(void (*callback)()) {
+void installFailureFunction(void (*callback)() ATTR_NORETURN) {
   paddle::internal::gFailureFunctionPtr = callback;
 }

diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index 7fdfa3240c..46b6a7feeb 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include
 
 #ifndef PADDLE_USE_GLOG
+#include "CompilerMacros.h"
 
 //! TODO(yuyang18): Move this utility macro into some global header.
 #define PP_CAT(a, b) PP_CAT_I(a, b)
@@ -168,7 +169,7 @@ void setMinLogLevel(int level);
  * @brief Install Log(Fatal) failure function. Default is abort();
  * @param callback: The failure function.
  */
-void installFailureFunction(void (*callback)());
+void installFailureFunction(void (*callback)() ATTR_NORETURN);
 
 /**
  * @brief installFailureWriter

From 731fe950c4714d5ab3374128cfbbbb24b1aefe78 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Mon, 21 Nov 2016 16:37:36 +0800
Subject: [PATCH 7/9] Change auto => size_t in BaseMatrix.cu

* Because it is a CUDA source file, and we need to support C++03 in CUDA.

---
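Note (editorial, not part of this commit): `auto` type deduction is a C++11
feature, and the .cu files here must also build in C++03 mode, so the deduced
types introduced in patch 6 are spelled out again as size_t. That keeps the
earlier fix intact: size_t is unsigned, so comparisons such as
CHECK_EQ(width_, 1UL) no longer mix signed and unsigned operands the way the
original `int numRows` did. A tiny standalone illustration (not Paddle code):

    #include <cstddef>

    struct Matrix {
      std::size_t height_, width_;
    };

    int main() {
      Matrix b = {16, 32};

      // C++11 only: a compiler in C++03 mode rejects this use of `auto`.
      // auto numRows = b.height_;

      // C++03-compatible, and still unsigned, so comparing against other
      // size_t values involves no signed/unsigned conversion warnings.
      std::size_t numRows = b.height_;
      std::size_t numCols = b.width_;
      return (numRows + numCols == 48) ? 0 : 1;
    }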
 paddle/math/BaseMatrix.cu | 20 ++++++++++----------
 paddle/math/Vector.cpp    |  4 ++--
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 2afb216db5..2f32b3fdd1 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1449,8 +1449,8 @@ template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  auto numRows = b.height_;
-  auto numCols = b.width_;
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
   aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
@@ -1463,8 +1463,8 @@ template<>
 template <class Agg, class Saver>
 int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  auto numRows = b.height_;
-  auto numCols = b.width_;
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
   aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
@@ -1493,8 +1493,8 @@ template <class Agg, class Op, class Saver>
 int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
                                 BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  auto numRows = b.height_;
-  auto numCols = b.width_;
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
   CHECK_EQ(c.height_, numRows);
@@ -1524,8 +1524,8 @@ template<>
 template <class Agg>
 int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  auto numRows = b.height_;
-  auto numCols = b.width_;
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
   CHECK_EQ(width_, numCols);
   CHECK_EQ(height_, 1UL);
   aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
@@ -1538,8 +1538,8 @@ template<>
 template <class Agg, class Saver>
 int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  auto numRows = b.height_;
-  auto numCols = b.width_;
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
   CHECK_EQ(width_, numCols);
   CHECK_EQ(height_, 1UL);
   aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,

diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index 9ef7f2b4b5..68a1518d67 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -82,8 +82,8 @@ MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
 
 template <>
 MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  auto height = getSize();
-  auto width = idRange;
+  size_t height = getSize();
+  size_t width = idRange;
   MatrixPtr mat = Matrix::createSparseMatrix(
       height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);

From c6eeb650af3ed249dfa92b13ba8b1f0ebc9b0b13 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 22 Nov 2016 10:56:45 +0800
Subject: [PATCH 8/9] Add clang-format hooks

---
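Note (editorial, not part of this commit): with this hook in place, running
`pre-commit install` once registers the checks with git, and
`pre-commit run --all-files` applies them to the whole tree. Patch 9 below
then bumps the pinned hook revisions, which is the effect of
`pre-commit autoupdate`. The style the hook enforces matches the manual
cleanups in patches 1-7; a small before/after illustration (generic code, not
from the repository):

    #include <iostream>

    // Before: the brace and spacing style the earlier patches fixed by hand:
    //
    //   int add(int a,int b)
    //   {
    //       return a+b;
    //   }
    //
    // After a clang-format pass with an attached-brace style, which is the
    // shape every function in the preceding patches was edited into:
    int add(int a, int b) {
      return a + b;
    }

    int main() {
      std::cout << add(1, 2) << "\n";
      return 0;
    }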
 .pre-commit-config.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da9..c1ae1058c2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,3 +22,7 @@
 #    not all of our python code is runnable. Some are used for
 #    documenation
 #  - id: debug-statements
+- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+  sha: 38b232ea0fd2be46c26a691b9f32dd94d1ee3333
+  hooks:
+  - id: clang-formater

From 5e3dd38143769d79e93009dfcb5b819af564689f Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 22 Nov 2016 11:06:28 +0800
Subject: [PATCH 9/9] auto-update pre-commit

---
 .pre-commit-config.yaml | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c1ae1058c2..90c25e4350 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,22 +7,14 @@
   hooks:
   - id: yapf
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+  sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
   hooks:
   - id: check-added-large-files
   - id: check-merge-conflict
   - id: check-symlinks
   - id: detect-private-key
   - id: end-of-file-fixer
-#  TODO(yuyang): trailing whitespace has some bugs on markdown
-#  files now, please not add it to pre-commit hook now
-#  - id: trailing-whitespace
-#
-#  TODO(yuyang): debug-statements not fit for Paddle, because
-#    not all of our python code is runnable. Some are used for
-#    documenation
-#  - id: debug-statements
 - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
-  sha: 38b232ea0fd2be46c26a691b9f32dd94d1ee3333
+  sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
   hooks:
   - id: clang-formater