Merge pull request #3228 from gangliao/clang-format

ClangFormat for proto and cuda
gangliao 8 years ago committed by GitHub
commit 75185d821a

@@ -24,7 +24,7 @@
         description: Format files with ClangFormat.
         entry: clang-format -i
         language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
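
In effect, the hook now formats CUDA (.cu) and protobuf (.proto) sources in addition to the C/C++ files it already covered. As a hedged sketch of how the hook is exercised locally (assuming clang-format and pre-commit are installed; the file path is illustrative):

    # Format one file in place, exactly what the hook's entry does:
    clang-format -i paddle/cuda/src/hl_batch_transpose.cu

    # Or run every configured hook over the whole tree:
    pre-commit run --all-files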

@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"

 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;

 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];

   const int x = blockIdx.x * TILE_DIM + threadIdx.x;

@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
                newX] = tile[threadIdx.x][j];
 }

-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);

-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);

   CHECK_SYNC("batchTranspose failed!");
 }
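
The reflowed signature and launch above are what clang-format emits at an 80-column limit with parameter bin-packing disabled: a parameter list that no longer fits on one line is either broken after the opening parenthesis or given one parameter per line, and `<<<...>>>(` stays attached to the kernel name. A minimal .clang-format sketch that reproduces this layout (an assumption for illustration; the repository's actual config may differ):

    BasedOnStyle: Google
    ColumnLimit: 80
    BinPackParameters: false
    BinPackArguments: false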

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <cmath>
 #include <stdlib.h>
-#include "hl_cuda.h"
-#include "hl_time.h"
+#include <cmath>
 #include "hl_base.h"
+#include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
+#include "hl_time.h"

 #define _USE_MATH_DEFINES

@@ -30,10 +29,16 @@ limitations under the License. */
  * centerX, centerY: translation.
  * sourceX, sourceY: output coordinates in the original image.
  */
-__device__ void getTranformCoord(int x, int y, real theta, real scale,
-                                 real tgtCenter, real imgCenter,
-                                 real centerR, real centerC,
-                                 int* sourceX, int* sourceY) {
+__device__ void getTranformCoord(int x,
+                                 int y,
+                                 real theta,
+                                 real scale,
+                                 real tgtCenter,
+                                 real imgCenter,
+                                 real centerR,
+                                 real centerC,
+                                 int* sourceX,
+                                 int* sourceY) {
   real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

   // compute coornidates in the rotated and scaled image

@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
  * created by Wei Xu (genome), converted by Jiang Wang
  */
-__global__ void kSamplingPatches(const real* imgs, real* targets,
-                                 int imgSize, int tgtSize, const int channels,
-                                 int samplingRate, const real* thetas,
-                                 const real* scales, const int* centerRs,
-                                 const int* centerCs, const real padValue,
+__global__ void kSamplingPatches(const real* imgs,
+                                 real* targets,
+                                 int imgSize,
+                                 int tgtSize,
+                                 const int channels,
+                                 int samplingRate,
+                                 const real* thetas,
+                                 const real* scales,
+                                 const int* centerRs,
+                                 const int* centerCs,
+                                 const real padValue,
                                  const int numImages) {
   const int caseIdx = blockIdx.x * 4 + threadIdx.x;
   const int pxIdx = blockIdx.y * 128 + threadIdx.y;

@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
   const int pxY = pxIdx / tgtSize;

   int srcPxX, srcPxY;
-  getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
-                   imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
+  getTranformCoord(pxX,
+                   pxY,
+                   thetas[imgIdx],
+                   scales[imgIdx],
+                   tgtCenter,
+                   imgCenter,
+                   centerCs[caseIdx],
+                   centerRs[caseIdx],
+                   &srcPxX,
                    &srcPxY);

   imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;

@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  *
  * created by Wei Xu
  */
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
-                                int*& gpuCenterR, int*& gpuCenterC,
-                                int numImages, int imgSize, real rotateAngle,
-                                real scaleRatio, int samplingRate,
+void hl_generate_disturb_params(real*& gpuAngle,
+                                real*& gpuScaleRatio,
+                                int*& gpuCenterR,
+                                int*& gpuCenterC,
+                                int numImages,
+                                int imgSize,
+                                real rotateAngle,
+                                real scaleRatio,
+                                int samplingRate,
                                 bool isTrain) {
   // The number of output samples.
   int numPatches = numImages * samplingRate;

@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
   for (int i = 0; i < numImages; i++) {
     r_angle[i] =
         (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                        - 0.5);
+                                        -
+                                        0.5);
     s_ratio[i] =
         1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
   }

@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
     int pxY =
         (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

-    const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
-                       sin(-r_angle[i]), cos(-r_angle[i])};
+    const real H[4] = {cos(-r_angle[i]),
+                       -sin(-r_angle[i]),
+                       sin(-r_angle[i]),
+                       cos(-r_angle[i])};
     real x = pxX - imgCenter;
     real y = pxY - imgCenter;
     real xx = H[0] * x + H[1] * y;

@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
   delete[] center_c;
 }

-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
-                                        int tgtSize, int channels,
-                                        int numImages, int samplingRate,
+void hl_conv_random_disturb_with_params(const real* images,
+                                        int imgSize,
+                                        int tgtSize,
+                                        int channels,
+                                        int numImages,
+                                        int samplingRate,
                                         const real* gpuRotationAngle,
                                         const real* gpuScaleRatio,
                                         const int* gpuCenterR,

@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
   dim3 threadsPerBlock(4, 128);
   dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

-  kSamplingPatches <<<numBlocks, threadsPerBlock>>>
-      (images, target, imgSize, tgtSize, channels, samplingRate,
-       gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
-       paddingValue, numImages);
+  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
+                                                   target,
+                                                   imgSize,
+                                                   tgtSize,
+                                                   channels,
+                                                   samplingRate,
+                                                   gpuRotationAngle,
+                                                   gpuScaleRatio,
+                                                   gpuCenterR,
+                                                   gpuCenterC,
+                                                   paddingValue,
+                                                   numImages);

   hl_device_synchronize();
 }

-void hl_conv_random_disturb(const real* images, int imgSize,
-                            int tgtSize, int channels, int numImages,
-                            real scaleRatio, real rotateAngle,
-                            int samplingRate, real* gpu_r_angle,
-                            real* gpu_s_ratio, int* gpu_center_r,
-                            int* gpu_center_c, int paddingValue,
-                            bool isTrain, real* targets) {
+void hl_conv_random_disturb(const real* images,
+                            int imgSize,
+                            int tgtSize,
+                            int channels,
+                            int numImages,
+                            real scaleRatio,
+                            real rotateAngle,
+                            int samplingRate,
+                            real* gpu_r_angle,
+                            real* gpu_s_ratio,
+                            int* gpu_center_r,
+                            int* gpu_center_c,
+                            int paddingValue,
+                            bool isTrain,
+                            real* targets) {
   // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
-                             gpu_center_c, numImages, imgSize, rotateAngle,
-                             scaleRatio, samplingRate, isTrain);
-
-  hl_conv_random_disturb_with_params(
-      images, imgSize, tgtSize, channels, numImages,
-      samplingRate, gpu_r_angle, gpu_s_ratio,
-      gpu_center_r, gpu_center_r, paddingValue,
-      targets);
+  hl_generate_disturb_params(gpu_r_angle,
+                             gpu_s_ratio,
+                             gpu_center_r,
+                             gpu_center_c,
+                             numImages,
+                             imgSize,
+                             rotateAngle,
+                             scaleRatio,
+                             samplingRate,
+                             isTrain);
+
+  hl_conv_random_disturb_with_params(images,
+                                     imgSize,
+                                     tgtSize,
+                                     channels,
+                                     numImages,
+                                     samplingRate,
+                                     gpu_r_angle,
+                                     gpu_s_ratio,
+                                     gpu_center_r,
+                                     gpu_center_r,
+                                     paddingValue,
+                                     targets);
 }

@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "hl_base.h"
-#include "hl_device_functions.cuh"
 #include "hl_cuda.h"
+#include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"

-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output, int ldo,
-                                real* table, int ldt,
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+__global__ void KeMatrixAddRows(real* output,
+                                int ldo,
+                                real* table,
+                                int ldt,
                                 int* ids,
                                 int numSamples,
                                 int tableSize,

@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   while (idy < numSamples) {
     int tableId = ids[idy];
     if ((0 <= tableId) && (tableId < tableSize)) {
-      real *out = output + idy * ldo;
-      real *tab = table + tableId * ldt;
+      real* out = output + idy * ldo;
+      real* tab = table + tableId * ldt;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow) {
           paddle::paddleAtomicAdd(&tab[i], out[i]);

@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   }
 }

-void hl_matrix_select_rows(real* output, int ldo,
-                           real* table, int ldt,
+void hl_matrix_select_rows(real* output,
+                           int ldo,
+                           real* table,
+                           int ldt,
                            int* ids,
                            int numSamples,
                            int tableSize,

@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (output, ldo, table, ldt, ids, numSamples, tableSize, dim);
+  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

   CHECK_SYNC("hl_matrix_select_rows failed");
 }

-void hl_matrix_add_to_rows(real* table, int ldt,
-                           real* input, int ldi,
+void hl_matrix_add_to_rows(real* table,
+                           int ldt,
+                           real* input,
+                           int ldi,
                            int* ids,
                            int numSamples,
                            int tableSize,

@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, ldi, table, ldt, ids, numSamples, tableSize, dim);
+  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

   CHECK_SYNC("hl_matrix_add_to_rows failed");
 }

-template<class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(T* dst, int sized,
-                               const T* src, int sizes,
-                               const int* ids, int sizei) {
+template <class T, int blockDimX, int gridDimX>
+__global__ void KeVectorSelect(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
   int idx = threadIdx.x + blockDimX * blockIdx.x;
   while (idx < sizei) {
     int index = ids[idx];

@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
 }

 template <class T>
-void hl_vector_select_from(T* dst, int sized,
-                           const T* src, int sizes,
-                           const int* ids, int sizei) {
+void hl_vector_select_from(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
   CHECK_NOTNULL(dst);
   CHECK_NOTNULL(src);
   CHECK_NOTNULL(ids);

@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
   dim3 threads(512, 1);
   dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (dst, sized, src, sizes, ids, sizei);
+  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      dst, sized, src, sizes, ids, sizei);
   CHECK_SYNC("hl_vector_select_from failed");
 }

-template
-void hl_vector_select_from(real* dst, int sized,
-                           const real* src, int sizes,
-                           const int* ids, int sizei);
-template
-void hl_vector_select_from(int* dst, int sized,
-                           const int* src, int sizes,
-                           const int* ids, int sizei);
+template void hl_vector_select_from(real* dst,
+                                    int sized,
+                                    const real* src,
+                                    int sizes,
+                                    const int* ids,
+                                    int sizei);
+template void hl_vector_select_from(
+    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);

File diff suppressed because it is too large

@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;

 // Attribute Type for paddle's Op.
 // Op contains many attributes. Each type of attributes could be different.
 // The AttrType will be shared between AttrDesc and AttrProto.
 enum AttrType {
-    INT = 0;
-    FLOAT = 1;
-    STRING = 2;
-    INTS = 3;
-    FLOATS = 4;
-    STRINGS = 5;
+  INT = 0;
+  FLOAT = 1;
+  STRING = 2;
+  INTS = 3;
+  FLOATS = 4;
+  STRINGS = 5;
 }

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;

 import "attribute.proto";

@@ -22,14 +22,14 @@ import "attribute.proto";
 //
 // e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
 message AttrDesc {
-    required string name = 1;
-    required AttrType type = 2;
-    optional int32 i = 3;
-    optional float f = 4;
-    optional string s = 5;
-    repeated int32 ints = 6;
-    repeated float floats = 7;
-    repeated string strings = 8;
+  required string name = 1;
+  required AttrType type = 2;
+  optional int32 i = 3;
+  optional float f = 4;
+  optional string s = 5;
+  repeated int32 ints = 6;
+  repeated float floats = 7;
+  repeated string strings = 8;
 };

 // Protocol Message to describe an Operator.

@@ -42,15 +42,15 @@ message AttrDesc {
 // 3rd-party language can build this proto message and call
 // AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
 message OpDesc {
-    // input names of this Operator.
-    repeated string inputs = 1;
-
-    // output names of this Operator.
-    repeated string outputs = 2;
-
-    // type of this Operator, such as "add", "sub", "fc".
-    required string type = 3;
-
-    // Attributes of this Operator. e.g., scale=3.0 in cosine op.
-    repeated AttrDesc attrs = 4;
+  // input names of this Operator.
+  repeated string inputs = 1;
+
+  // output names of this Operator.
+  repeated string outputs = 2;
+
+  // type of this Operator, such as "add", "sub", "fc".
+  required string type = 3;
+
+  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
+  repeated AttrDesc attrs = 4;
 };

@@ -15,10 +15,11 @@ limitations under the License. */
 // Protocol Message for 3rd-party language binding.
 //
 // Paddle Python package will use `OpProto` to generate op creation methods.
-// The op creation methods take user's input and generate `OpDesc` proto message,
+// The op creation methods take user's input and generate `OpDesc` proto
+// message,
 // then pass `OpDesc` to C++ side and create Op pointer.
 //
-syntax="proto2";
+syntax = "proto2";
 package paddle.framework;

 import "attribute.proto";

@@ -26,89 +27,90 @@ import "attribute.proto";
 // Attribute protocol message for 3rd-party language binding.
 // It will store the Op support what attribute and what type.
 message AttrProto {
-    // Supported attribute name. e.g. `scale` for cosine op.
-    required string name = 1;
-
-    // Supported attribute type.
-    required AttrType type = 2;
-
-    // Supported attribute comments. It helps 3rd-party language generate doc-string.
-    required string comment = 3;
-
-    // If that attribute is generated, it means the Paddle third language
-    // binding has responsibility to fill that attribute. End-User should
-    // not set that attribute.
-    optional bool generated = 4 [default=false];
+  // Supported attribute name. e.g. `scale` for cosine op.
+  required string name = 1;
+
+  // Supported attribute type.
+  required AttrType type = 2;
+
+  // Supported attribute comments. It helps 3rd-party language generate
+  // doc-string.
+  required string comment = 3;
+
+  // If that attribute is generated, it means the Paddle third language
+  // binding has responsibility to fill that attribute. End-User should
+  // not set that attribute.
+  optional bool generated = 4 [ default = false ];
 }

 // Input or output message for 3rd-party language binding.
 // It contains parameter name and its comments.
 message VarProto {
-    // Input or output name in that op creation function.
-    // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
-    required string name = 1;
-
-    // The comment for that input. It helps 3rd-party language generate doc-string.
-    required string comment = 2;
-
-    // Is that input/output could be a list or not.
-    // If so, that Op should write a attributed named `input_format` or
-    // `output_format`.
-    //
-    // e.g.
-    // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
-    // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
-    // will hold a attribute of them.
-    //
-    // The Op desc of same fc could be
-    // {
-    //     "type": "fc",
-    //     "input": ["X1", "X2", "W1", "W2", "b"],
-    //     "output": "fc.out",
-    //     "attrs" : {
-    //         "input_format": [0, 2, 4, 5]
-    //     }
-    // }
-    //
-    optional bool multiple = 3 [default=false];
-
-    // It marks that output is a temporary output. That output is not used by
-    // user, but used by other op internally as input. If other op is not use
-    // that output, it could be optimized early.
-    //
-    // Attribute temporary_index will be set in OpDesc if there is some
-    // outputs are temporary.
-    //
-    // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
-    // attrs = {
-    //     "temporary_index": [1]
-    // }
-    optional bool temporary = 4 [default=false];
-
-    // The gradient of operator can be ignored immediately
-    // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
-    // can be ignored for the future optimized on graph.
-    optional bool ignore_gradient = 6;
+  // Input or output name in that op creation function.
+  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
+  required string name = 1;
+
+  // The comment for that input. It helps 3rd-party language generate
+  // doc-string.
+  required string comment = 2;
+
+  // Is that input/output could be a list or not.
+  // If so, that Op should write a attributed named `input_format` or
+  // `output_format`.
+  //
+  // e.g.
+  // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
+  // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
+  // will hold a attribute of them.
+  //
+  // The Op desc of same fc could be
+  // {
+  //   "type": "fc",
+  //   "input": ["X1", "X2", "W1", "W2", "b"],
+  //   "output": "fc.out",
+  //   "attrs" : {
+  //     "input_format": [0, 2, 4, 5]
+  //   }
+  // }
+  //
+  optional bool multiple = 3 [ default = false ];
+
+  // It marks that output is a temporary output. That output is not used by
+  // user, but used by other op internally as input. If other op is not use
+  // that output, it could be optimized early.
+  //
+  // Attribute temporary_index will be set in OpDesc if there is some
+  // outputs are temporary.
+  //
+  // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
+  // attrs = {
+  //   "temporary_index": [1]
+  // }
+  optional bool temporary = 4 [ default = false ];
+
+  // The gradient of operator can be ignored immediately
+  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
+  // can be ignored for the future optimized on graph.
+  optional bool ignore_gradient = 6;
 }

 // Op protocol message for 3rd-party language binding.
 // It contains all information for generating op creation method.
 message OpProto {
-    // The input information to generate op creation method.
-    repeated VarProto inputs = 1;
-
-    // The output information to generate op creation method.
-    repeated VarProto outputs = 2;
-
-    // The attribute information to generate op creation method.
-    repeated AttrProto attrs = 3;
-
-    // The comments for that Op. It helps 3rd-party language generate
-    // doc-string. The whole documentation of that Op is generated by comment,
-    // inputs, outputs, attrs together.
-    required string comment = 4;
-
-    // The type of that Op.
-    required string type = 5;
+  // The input information to generate op creation method.
+  repeated VarProto inputs = 1;
+
+  // The output information to generate op creation method.
+  repeated VarProto outputs = 2;
+
+  // The attribute information to generate op creation method.
+  repeated AttrProto attrs = 3;
+
+  // The comments for that Op. It helps 3rd-party language generate
+  // doc-string. The whole documentation of that Op is generated by comment,
+  // inputs, outputs, attrs together.
+  required string comment = 4;
+
+  // The type of that Op.
+  required string type = 5;
 }
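
The two-space indent and the `[ default = false ]` option spacing in the proto hunks above come from clang-format's built-in Protocol Buffers support, which the widened `files:` pattern in the pre-commit hook now triggers. A hedged sketch of a per-language .clang-format section (illustrative; not taken from this PR):

    ---
    Language: Proto
    BasedOnStyle: Google
    IndentWidth: 2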

@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "ContextProjectionOp.h"
+#include "hl_base.h"

 namespace paddle {

@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
   int block_size = blockDim.x;
   int sequenceId = blockIdx.x;
   int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId+1];
+  int seq_end = sequence[sequenceId + 1];
   real value = 0;

   int instances = seq_end - seq_start + context_length - 1;

@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
     } else if ((i + context_start) >= (seq_end - seq_start)) {
       if (padding) {
         value =
             weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
-                   input_dim + idx];
+                       input_dim +
+                   idx];
       } else {
         continue;
       }

@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
     int outx = (i - context_length) < 0 ? i : (context_length - 1);
     int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
     real* output_r =
-      output + outy * input_dim * context_length + outx * input_dim;
+        output + outy * input_dim * context_length + outx * input_dim;
     for (int j = outy; j < seq_end - seq_start; j++) {
       output_r[idx] += value;
       if (j - outy == outx) break;

@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
   dim3 grid(blocks_x, blocks_y);

   if (weight) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (input, sequence, weight, output, input_dim,
-         context_length, context_start, begin_pad);
+    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        input,
+        sequence,
+        weight,
+        output,
+        input_dim,
+        context_length,
+        context_start,
+        begin_pad);
   } else {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (input, sequence, weight, output, input_dim,
-         context_length, context_start, begin_pad);
+    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        input,
+        sequence,
+        weight,
+        output,
+        input_dim,
+        context_length,
+        context_start,
+        begin_pad);
   }
   CHECK_SYNC("hl_context_projection_forward failed");
 }

@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
   int block_size = blockDim.x;
   int sequenceId = blockIdx.x;
   int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId+1];
+  int seq_end = sequence[sequenceId + 1];
   real value = 0;

   int instances = seq_end - seq_start + context_length - 1;

@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
     int outx = (i - context_length) < 0 ? i : (context_length - 1);
     int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
     real* output_r =
-      out + outy * input_dim * context_length + outx * input_dim;
+        out + outy * input_dim * context_length + outx * input_dim;
     for (int j = outy; j < seq_end - seq_start; j++) {
       value += output_r[idx];
       if (j - outy == outx) break;

@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
   int blocks_y = 1;
   dim3 threads(block_size, 1);
   dim3 grid(blocks_x, blocks_y);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      out_grad, sequence, input_grad, input_dim, context_length, context_start);
   CHECK_SYNC("hl_context_projection_backward_data failed");
 }

@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                       context_start);
 }

-template<int THREADS_X, int THREADS_Y>
+template <int THREADS_X, int THREADS_Y>
 __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,

@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
   if (weight_idx < w_dim) {
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
-      int seq_end = sequence[seqId+1];
-      output_r = const_cast<real*>(out_grad)
-                 + seq_start * w_dim * context_length;
+      int seq_end = sequence[seqId + 1];
+      output_r =
+          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

       if (context_start < 0) {
         if (padId + context_start < 0) {
           instanceId = padId;
         } else {
           // begin_pad > 0;
-          instanceId = (padId - begin_pad) +
-                       (seq_end - seq_start) - context_start;
+          instanceId =
+              (padId - begin_pad) + (seq_end - seq_start) - context_start;
         }
       } else {
         if (padId + (seq_end - seq_start) < context_start) {

@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
         }
       }

-      int outx = (instanceId - context_length) < 0 ?
-                 instanceId : (context_length - 1);
-      int outy = (instanceId - context_length) < 0 ?
-                 0 : (instanceId - (context_length - 1));
+      int outx =
+          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0
+                     ? 0
+                     : (instanceId - (context_length - 1));
       output_r += outy * w_dim * context_length + outx * w_dim;
       for (int j = outy; j < seq_end - seq_start; j++) {
         value += output_r[weight_idx];

@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
     }
     __syncthreads();

-    for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+    for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
       if (idy < stride) {
         sum_s[idy][idx] += sum_s[idy + stride][idx];
       }

@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
   dim3 threads(threads_x, threads_y);
   dim3 grid(blocks_x, 1);

-  KeContextProjectionBackwardWeight<32, 32>
-      <<< grid, threads, 0, STREAM_DEFAULT >>>
-      (out_grad, sequence, w_grad, num_sequences, w_dim,
-       context_length, context_start, begin_pad);
+  KeContextProjectionBackwardWeight<32,
+                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      out_grad,
+      sequence,
+      w_grad,
+      num_sequences,
+      w_dim,
+      context_length,
+      context_start,
+      begin_pad);
   CHECK_SYNC("hl_context_projection_backward_weight failed");
 }

 template <>
-void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-    const GpuMatrix& out_grad,
-    GpuMatrix& w_grad,
-    const GpuIVector& seq_vec,
-    size_t context_length,
-    int context_start,
-    size_t total_pad,
-    size_t begin_pad) {
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                      GpuMatrix& w_grad,
+                                                      const GpuIVector& seq_vec,
+                                                      size_t context_length,
+                                                      int context_start,
+                                                      size_t total_pad,
+                                                      size_t begin_pad) {
   hl_context_projection_backward_weight(out_grad.getData(),
                                         seq_vec.getData(),
                                         w_grad.getData(),

@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
   if (in_grad) {
     ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
-        out_grad,
-        in_grad,
-        sequence,
-        context_length,
-        context_start);
+        out_grad, in_grad, sequence, context_length, context_start);
   }
   if (is_padding && w_grad) {
-    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        out_grad,
-        w_grad,
-        sequence,
-        context_length,
-        context_start,
-        total_pad,
-        begin_pad);
+    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
+                                                     w_grad,
+                                                     sequence,
+                                                     context_length,
+                                                     context_start,
+                                                     total_pad,
+                                                     begin_pad);
   }
 }

@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "CosSimOp.h"
 #include "hl_base.h"
 #include "hl_device_functions.cuh"
-#include "CosSimOp.h"

 namespace paddle {

-template<int block_size>
+template <int block_size>
 __global__ void KeCosSim(real* output,
                          const real* input1,
                          const real* input2,

@@ -78,8 +78,8 @@ void hlCossim(real* output,
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, input1, input2, width, input1_height, input2_height, scale);
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input1, input2, width, input1_height, input2_height, scale);
   CHECK_SYNC("hlCossim failed");
 }

@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
   hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
 }

-template<int block_size>
+template <int block_size>
 __global__ void KeCosSimDerivative(const real* grad,
                                    const real* output,
                                    const real* prev_out_x,

@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
   if (xy[0] == 0) {
     real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] +=
-          scale * grad[ty] * prev_out_y[index] * reciprocal;
+      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
       if (input2_height > 1) {
-        prev_grad_y[index] +=
-            scale * grad[ty] * prev_out_x[index] * reciprocal;
+        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index,
-            scale * grad[ty] * prev_out_x[index] * reciprocal);
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            scale * grad[ty] * prev_out_x[index] * reciprocal);
       }
     }
   } else {

@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
     real reciprocalSquareSumX = 1.0 / xx[0];
     real reciprocalSquareSumY = 1.0 / yy[0];
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] += output[ty] * grad[ty] *
-          (prev_out_y[index] * reciprocalXY -
-           prev_out_x[index] * reciprocalSquareSumX);
+      prev_grad_x[index] +=
+          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
+                                   prev_out_x[index] * reciprocalSquareSumX);
       if (input2_height > 1) {
-        prev_grad_y[index] += output[ty] * grad[ty] *
-            (prev_out_x[index] * reciprocalXY -
-             prev_out_y[index] * reciprocalSquareSumY);
+        prev_grad_y[index] +=
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY);
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
-            (prev_out_x[index] * reciprocalXY -
-             prev_out_y[index] * reciprocalSquareSumY));
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY));
       }
     }
   }

@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
   const int block_size = 256;
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
-       input1_height, input2_height, scale);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad,
+      output,
+      prev_out_x,
+      prev_out_y,
+      prev_grad_x,
+      prev_grad_y,
+      width,
+      input1_height,
+      input2_height,
+      scale);
   CHECK_SYNC("hlCossimDerivate failed");
 }

@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                      real scale) {
   CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
         in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
-        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
-        << "Matrix types are not equally GPU";
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
+        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+      << "Matrix types are not equally GPU";

   size_t dim = in1_val.getWidth();
   const real* grad = out_grad.getData();

@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "CropOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KeCrop(real* outputs, const real* inputs,
-                       int inC, int inH, int inW,
-                       int cropC, int cropH, int cropW,
-                       int outC, int outH, int outW, int nthreads) {
+__global__ void KeCrop(real* outputs,
+                       const real* inputs,
+                       int inC,
+                       int inH,
+                       int inW,
+                       int cropC,
+                       int cropH,
+                       int cropW,
+                       int outC,
+                       int outH,
+                       int outW,
+                       int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % outW;

@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs,
 template <>
 void Crop<DEVICE_TYPE_GPU>(real* outputs,
                            const real* inputs,
                            const TensorShape inShape,
                            const TensorShape outShape,
                            const FuncConfig& conf) {
   std::vector<uint32_t> crop_corner =
       conf.get<std::vector<uint32_t>>("crop_corner");
   int cropC = crop_corner[1];
   int cropH = crop_corner[2];
   int cropW = crop_corner[3];

@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
-       outC, outH, outW, nth);
+  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                     inputs,
+                                                     inC,
+                                                     inH,
+                                                     inW,
+                                                     cropC,
+                                                     cropH,
+                                                     cropW,
+                                                     outC,
+                                                     outH,
+                                                     outW,
+                                                     nth);
   CHECK_SYNC("Crop");
 }

-__global__ void KeCropDiff(const real* inGrad, real* outGrad,
-                           int inC, int inH, int inW,
-                           int cropC, int cropH, int cropW,
-                           int outC, int outH, int outW, int nthreads) {
+__global__ void KeCropDiff(const real* inGrad,
+                           real* outGrad,
+                           int inC,
+                           int inH,
+                           int inW,
+                           int cropC,
+                           int cropH,
+                           int cropW,
+                           int outC,
+                           int outH,
+                           int outW,
+                           int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;

@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
 template <>
 void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                                real* outGrad,
                                const TensorShape inShape,
                                const TensorShape outShape,
                                const FuncConfig& conf) {
   std::vector<uint32_t> crop_corner =
       conf.get<std::vector<uint32_t>>("crop_corner");
   int cropC = crop_corner[1];
   int cropH = crop_corner[2];
   int cropW = crop_corner[3];

@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCropDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
-       outC, outH, outW, nth);
+  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                         outGrad,
+                                                         inC,
+                                                         inH,
+                                                         inW,
+                                                         cropC,
+                                                         cropH,
+                                                         cropW,
+                                                         outC,
+                                                         outH,
+                                                         outW,
+                                                         nth);
   CHECK_SYNC("CropGrad");
 }

@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "CrossMapNormalOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
-                                   real* scale, size_t channels,
-                                   size_t height, size_t width, size_t size,
+__global__ void KeCMRNormFillScale(size_t imageSize,
+                                   const real* in,
+                                   real* scale,
+                                   size_t channels,
+                                   size_t height,
+                                   size_t width,
+                                   size_t size,
                                    real alpha) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {

@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
   }
 }

-__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
-                                const real* scale, real negative_beta,
+__global__ void KeCMRNormOutput(size_t inputSize,
+                                const real* in,
+                                const real* scale,
+                                real negative_beta,
                                 real* out) {
   const int index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < inputSize) {

@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
   size_t imageSize = numSamples * height * width;
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (imageSize, inputs, denoms, channels, height, width, size, scale);
+  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      imageSize, inputs, denoms, channels, height, width, size, scale);

-  size_t inputSize = numSamples * height * width *channels;
+  size_t inputSize = numSamples * height * width * channels;
   blockSize = 1024;
   gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inputSize, inputs, denoms, -pow, outputs);
+  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inputSize, inputs, denoms, -pow, outputs);

   CHECK_SYNC("CrossMapNormal");
 }

-__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
-                              const real* top_data, const real* scale,
-                              const real* top_diff, size_t channels,
-                              size_t height, size_t width, size_t size,
-                              real negative_beta, real cache_ratio,
-                              real* bottom_diff ) {
+__global__ void KeCMRNormDiff(size_t imageSize,
+                              const real* bottom_data,
+                              const real* top_data,
+                              const real* scale,
+                              const real* top_diff,
+                              size_t channels,
+                              size_t height,
+                              size_t width,
+                              size_t size,
+                              real negative_beta,
+                              real cache_ratio,
+                              real* bottom_diff) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {
     const int w = idx % width;

@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
     while (index < channels + post_pad) {
       if (index < channels) {
         accum += top_diff[index * step] * top_data[index * step] /
-            scale[index * step];
+                 scale[index * step];
       }
       if (index >= size) {
         accum -= top_diff[(index - size) * step] *
-            top_data[(index - size) * step] / scale[(index - size) * step];
+                 top_data[(index - size) * step] / scale[(index - size) * step];
       }
       if (index >= post_pad) {
         bottom_diff[(index - post_pad) * step] +=
             top_diff[(index - post_pad) * step] *
-            pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
-            bottom_data[(index - post_pad) * step] * accum;
+                pow(scale[(index - post_pad) * step], negative_beta) -
+            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
       }
       ++index;
     }

@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
-       height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
+  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
+                                                            inputsValue,
+                                                            outputsValue,
+                                                            denoms,
+                                                            outputsGrad,
+                                                            channels,
+                                                            height,
+                                                            width,
+                                                            size,
+                                                            -pow,
+                                                            2.0f * pow * scale,
+                                                            inputsGrad);
   CHECK_SYNC("CrossMapNormalGrad");
 }

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "MulOp.h"
+#include "hl_base.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"

@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "PadOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KePad(real* outputs, const real* inputs,
-                      int inC, int inH, int inW,
-                      int padc, int padh, int padw,
-                      int outC, int outH, int outW, int nthreads) {
+__global__ void KePad(real* outputs,
+                      const real* inputs,
+                      int inC,
+                      int inH,
+                      int inW,
+                      int padc,
+                      int padh,
+                      int padw,
+                      int outC,
+                      int outH,
+                      int outW,
+                      int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;

@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
-  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
-       outC, outH, outW, nth);
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                    inputs,
+                                                    inC,
+                                                    inH,
+                                                    inW,
+                                                    cstart,
+                                                    hstart,
+                                                    wstart,
+                                                    outC,
+                                                    outH,
+                                                    outW,
+                                                    nth);
   CHECK_SYNC("Pad");
 }

-__global__ void KePadDiff(real* inGrad, const real* outGrad,
-                          int inC, int inH, int inW,
-                          int padc, int padh, int padw,
-                          int outC, int outH, int outW, int nthreads) {
+__global__ void KePadDiff(real* inGrad,
+                          const real* outGrad,
+                          int inC,
+                          int inH,
+                          int inW,
+                          int padc,
+                          int padh,
+                          int padw,
+                          int outC,
+                          int outH,
+                          int outW,
+                          int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;

@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
-  KePadDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
-       outC, outH, outW, nth);
+  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                        outGrad,
+                                                        inC,
+                                                        inH,
+                                                        inW,
+                                                        cstart,
+                                                        hstart,
+                                                        wstart,
+                                                        outC,
+                                                        outH,
+                                                        outW,
+                                                        nth);
   CHECK_SYNC("PadGrad");
 }

File diff suppressed because it is too large

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "GruCompute.h"
-
 #include "hl_recurrent_apply.cuh"

@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
 }

 template <>
-void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad,
-                             int frameSize, int batchSize) {
+void GruCompute::backward<1>(hl_gru_value value,
+                             hl_gru_grad grad,
+                             int frameSize,
+                             int batchSize) {
   hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
                       hppl::backward::gru_resetGrad(),
                       value,

@@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "LstmCompute.h"
 #include "hl_recurrent_apply.cuh"

 namespace paddle {

 template <>
-void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize,
-                                  int batchSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize,
-                      batchSize, activeNode_, activeGate_,
+void LstmCompute::forwardBatch<1>(hl_lstm_value value,
+                                  int frameSize,
+                                  int batchSize) {
+  hl_gpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      batchSize,
+                      activeNode_,
+                      activeGate_,
                       activeState_);
 }

 template <>
-void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad,
-                                   int frameSize, int batchSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad,
-                       frameSize, batchSize, activeNode_,
-                       activeGate_, activeState_);
+void LstmCompute::backwardBatch<1>(hl_lstm_value value,
+                                   hl_lstm_grad grad,
+                                   int frameSize,
+                                   int batchSize) {
+  hl_gpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       batchSize,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
 }

 template <>
 void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(), value,
-                      frameSize, /* batchSize */ 1,
-                      activeNode_, activeGate_, activeState_);
+  hl_gpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      /* batchSize */ 1,
+                      activeNode_,
+                      activeGate_,
+                      activeState_);
 }

 template <>
-void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad,
+void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
+                                         hl_lstm_grad grad,
                                          int frameSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad,
-                       frameSize, /* batchSize */ 1,
-                       activeNode_, activeGate_, activeState_);
+  hl_gpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       /* batchSize */ 1,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
 }

 }  // namespace paddle

Some files were not shown because too many files have changed in this diff
