diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da9..90c25e4350 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,18 +7,14 @@
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+ sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
-# TODO(yuyang): trailing whitespace has some bugs on markdown
-# files now, please not add it to pre-commit hook now
-# - id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements not fit for Paddle, because
-# not all of our python code is runnable. Some are used for
-# documenation
-# - id: debug-statements
+- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+ hooks:
+ - id: clang-formater
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b62f29787..af193c27ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
-set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b3)
+set(PADDLE_MINOR_VERSION 9)
+set(PADDLE_PATCH_VERSION 0a0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
diff --git a/README.md b/README.md
index 81ff8c7122..8a8e158415 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
# PaddlePaddle
-[](https://travis-ci.org/baidu/Paddle)
-[](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[](LICENSE)
+[](https://travis-ci.org/PaddlePaddle/Paddle)
+[](http://www.paddlepaddle.org/)
+[](http://www.paddlepaddle.org/cn/index.html)
+[](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[](https://github.com/PaddlePaddle/Paddle/releases)
+[](LICENSE)
+
Welcome to the PaddlePaddle GitHub.
@@ -14,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
## Features
@@ -26,15 +29,15 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
connection.
- **Efficiency**
-
+
In order to unleash the power of heterogeneous computing resource,
optimization occurs at different levels of PaddlePaddle, including
computing, memory, architecture and communication. The following are some
examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
- (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
- - Highly optimized recurrent networks which can handle **variable-length**
+ (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+ - Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
- Optimized local and distributed training for models with high dimensional
sparse data.
@@ -57,41 +60,39 @@ Please refer to our [release announcement](https://github.com/baidu/Paddle/relea
## Installation
Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
-pre-built packages (**docker image**, **deb package**) or
+pre-built packages (**docker image**, **deb package**) or
directly build on **Linux** and **Mac OS X** from the source code.
-
+
## Documentation
Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en)
You can follow the quick start tutorial to learn how use PaddlePaddle
step-by-step.
-
+
- [Example and Demo](http://paddlepaddle.org/doc/demo/)
We provide five demos, including: image classification, sentiment analysis,
- sequence to sequence model, recommendation, semantic role labeling.
-
+ sequence to sequence model, recommendation, semantic role labeling.
+
- [Distributed Training](http://paddlepaddle.org/doc/cluster)
This system supports training deep learning models on multiple machines
with data parallelism.
-
+
- [Python API](http://paddlepaddle.org/doc/ui/)
PaddlePaddle supports using either Python interface or C++ to build your
system. We also use SWIG to wrap C++ source code to create a user friendly
interface for Python. You can also use SWIG to create interface for your
favorite programming language.
-
+
- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html)
We sincerely appreciate your interest and contributions. If you would like to
- contribute, please read the contribution guide.
+ contribute, please read the contribution guide.
- [Source Code Documents](http://paddlepaddle.org/doc/source/)
## Ask Questions
-Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to
-**paddle-dev@baidu.com** to ask questions and talk about methods and models.
-Framework development discussions and
-bug reports are collected on [Issues](https://github.com/baidu/paddle/issues).
+
+You are welcome to submit questions and bug reports as [GitHub Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index 2982e54c66..daca5f01cf 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -17,24 +17,15 @@ import os
from optparse import OptionParser
-def extract_dict_features(pair_file, feature_file, src_dict_file,
- tgt_dict_file):
- src_dict = set()
- tgt_dict = set()
-
- with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open(
- src_dict_file, 'w') as src_dict_out, open(tgt_dict_file,
- 'w') as tgt_dict_out:
+def extract_dict_features(pair_file, feature_file):
+
+ with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
for line in fin:
- sentence, labels = line.strip().split('\t')
+ sentence, predicate, labels = line.strip().split('\t')
sentence_list = sentence.split()
labels_list = labels.split()
- src_dict.update(sentence_list)
- tgt_dict.update(labels_list)
-
verb_index = labels_list.index('B-V')
- verb_feature = sentence_list[verb_index]
mark = [0] * len(labels_list)
if verb_index > 0:
@@ -42,47 +33,50 @@ def extract_dict_features(pair_file, feature_file, src_dict_file,
ctx_n1 = sentence_list[verb_index - 1]
else:
ctx_n1 = 'bos'
- ctx_n1_feature = ctx_n1
+
+ if verb_index > 1:
+ mark[verb_index - 2] = 1
+ ctx_n2 = sentence_list[verb_index - 2]
+ else:
+ ctx_n2 = 'bos'
mark[verb_index] = 1
- ctx_0_feature = sentence_list[verb_index]
+ ctx_0 = sentence_list[verb_index]
if verb_index < len(labels_list) - 2:
mark[verb_index + 1] = 1
ctx_p1 = sentence_list[verb_index + 1]
else:
ctx_p1 = 'eos'
- ctx_p1_feature = ctx_p1
+
+ if verb_index < len(labels_list) - 3:
+ mark[verb_index + 2] = 1
+ ctx_p2 = sentence_list[verb_index + 2]
+ else:
+ ctx_p2 = 'eos'
+
feature_str = sentence + '\t' \
- + verb_feature + '\t' \
- + ctx_n1_feature + '\t' \
- + ctx_0_feature + '\t' \
- + ctx_p1_feature + '\t' \
+ + predicate + '\t' \
+ + ctx_n2 + '\t' \
+ + ctx_n1 + '\t' \
+ + ctx_0 + '\t' \
+ + ctx_p1 + '\t' \
+ + ctx_p2 + '\t' \
+                          + ' '.join([str(i) for i in mark]) + '\t' \
+                          + labels
feature_out.write(feature_str + '\n')
- src_dict_out.write('\n')
- src_dict_out.write('\n'.join(list(src_dict)))
-
- tgt_dict_out.write('\n'.join(list(tgt_dict)))
if __name__ == '__main__':
- usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary '
+ usage = '-p pair_file -f feature_file'
parser = OptionParser(usage)
parser.add_option('-p', dest='pair_file', help='the pair file')
- parser.add_option(
- '-f', dest='feature_file', help='the file to store feature')
- parser.add_option(
- '-s', dest='src_dict', help='the file to store source dictionary')
- parser.add_option(
- '-t', dest='tgt_dict', help='the file to store target dictionary')
+ parser.add_option('-f', dest='feature_file', help='the feature file')
(options, args) = parser.parse_args()
- extract_dict_features(options.pair_file, options.feature_file,
- options.src_dict, options.tgt_dict)
+ extract_dict_features(options.pair_file, options.feature_file)
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
index 4d1bef8f95..86ab00ce41 100644
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -51,7 +51,7 @@ def read_sentences(words_file):
for line in fin:
line = line.strip()
if line == '':
- sentences.append(s.lower())
+ sentences.append(s)
s = ''
else:
s += line + ' '
@@ -64,6 +64,11 @@ def transform_labels(sentences, labels):
if len(labels[i]) == 1:
continue
else:
+ verb_list = []
+ for x in labels[i][0]:
+                if x != '-':
+                    verb_list.append(x)
+
for j in xrange(1, len(labels[i])):
label_list = labels[i][j]
current_tag = 'O'
@@ -88,8 +93,7 @@ def transform_labels(sentences, labels):
is_in_bracket = True
else:
print 'error:', ll
-
- sen_lab_pair.append((sentences[i], label_seq))
+ sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
return sen_lab_pair
@@ -97,9 +101,9 @@ def write_file(sen_lab_pair, output_file):
with open(output_file, 'w') as fout:
for x in sen_lab_pair:
sentence = x[0]
- label_seq = ' '.join(x[1])
- assert len(sentence.split()) == len(x[1])
- fout.write(sentence + '\t' + label_seq + '\n')
+ label_seq = ' '.join(x[2])
+ assert len(sentence.split()) == len(x[2])
+            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
index 268c0995e2..55e33f4685 100644
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -14,6 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
+wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
@@ -22,4 +26,4 @@ gunzip test.wsj.words.gz
gunzip test.wsj.props.gz
python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict
+python extract_dict_feature.py -p test.wsj.seq_pair -f feature
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index 5c003584a5..d4c137ef42 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -17,11 +17,15 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
-def hook(settings, word_dict, label_dict, **kwargs):
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
settings.word_dict = word_dict
settings.label_dict = label_dict
+ settings.predicate_dict = predicate_dict
+
#all inputs are integral and sequential type
settings.slots = [
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
@@ -31,27 +35,33 @@ def hook(settings, word_dict, label_dict, **kwargs):
]
-@provider(init_hook=hook)
-def process(obj, file_name):
+def get_batch_size(yield_data):
+    # With calc_batch_size, the batch size is counted in words: the size of a
+    # sample is the length of its first slot (the sentence).
+    return len(yield_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
with open(file_name, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
line.strip().split('\t')
-
+
words = sentence.split()
sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index e3f6edad69..54ceff0e72 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -18,8 +18,9 @@ import sys
from paddle.trainer_config_helpers import *
+import math
#file paths
-word_dict_file = './data/src.dict'
-label_dict_file = './data/tgt.dict'
+word_dict_file = './data/wordDict.txt'
+label_dict_file = './data/targetDict.txt'
+predicate_file = './data/verbDict.txt'
train_list_file = './data/train.list'
test_list_file = './data/test.list'
@@ -30,8 +31,10 @@ if not is_predict:
#load dictionaries
word_dict = dict()
label_dict = dict()
+ predicate_dict = dict()
with open(word_dict_file, 'r') as f_word, \
- open(label_dict_file, 'r') as f_label:
+ open(label_dict_file, 'r') as f_label, \
+ open(predicate_file, 'r') as f_pre:
for i, line in enumerate(f_word):
w = line.strip()
word_dict[w] = i
@@ -40,6 +43,11 @@ if not is_predict:
w = line.strip()
label_dict[w] = i
+ for i, line in enumerate(f_pre):
+ w = line.strip()
+ predicate_dict[w] = i
+
+
if is_test:
train_list_file = None
@@ -50,91 +58,157 @@ if not is_predict:
module='dataprovider',
obj='process',
args={'word_dict': word_dict,
- 'label_dict': label_dict})
+ 'label_dict': label_dict,
+                  'predicate_dict': predicate_dict})
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
+ pred_len = len(predicate_dict)
else:
word_dict_len = get_config_arg('dict_len', int)
label_dict_len = get_config_arg('label_len', int)
+ pred_len = get_config_arg('pred_len', int)
+############################## Hyper-parameters ##################################
mark_dict_len = 2
word_dim = 32
mark_dim = 5
-hidden_dim = 128
+hidden_dim = 512
depth = 8
-emb_lr = 1e-2
-fc_lr = 1e-2
-lstm_lr = 2e-2
+
+########################### Optimizer #######################################
+
settings(
batch_size=150,
- learning_method=AdamOptimizer(),
- learning_rate=1e-3,
+ learning_method=MomentumOptimizer(momentum=0),
+ learning_rate=2e-2,
regularization=L2Regularization(8e-4),
- gradient_clipping_threshold=25)
+ is_async=False,
+ model_average=ModelAverage(average_window=0.5,
+ max_average_window=10000),
+)
-#6 features
+
+####################################### network ##############################
+#8 features and 1 target
word = data_layer(name='word_data', size=word_dict_len)
-predicate = data_layer(name='verb_data', size=word_dict_len)
+predicate = data_layer(name='verb_data', size=pred_len)
+
+ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
+ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
mark = data_layer(name='mark_data', size=mark_dict_len)
+
if not is_predict:
target = data_layer(name='target', size=label_dict_len)
-ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr)
-layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-fc_para_attr = ParameterAttribute(learning_rate=fc_lr)
-lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr)
-para_attr = [fc_para_attr, lstm_para_attr]
-word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt)
-predicate_embedding = embedding_layer(
- size=word_dim, input=predicate, param_attr=ptt)
-ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt)
-ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt)
-ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt)
-mark_embedding = embedding_layer(size=mark_dim, input=mark)
+default_std = 1 / math.sqrt(hidden_dim) / 3.0
+
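+# Note: the 'emb' parameter is expected to be initialized from the pre-trained
+# ./data/emb file (downloaded by get_data.sh; train.sh passes
+# --init_model_path=./data) and is kept fixed here (learning_rate=0.).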
+emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
+std_0 = ParameterAttribute(initial_std=0.)
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(
+    size=word_dim, input=predicate,
+    param_attr=ParameterAttribute(name='vemb', initial_std=default_std))
+mark_embedding = embedding_layer(
+    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para)
+              for x in word_input]
+emb_layers.append(predicate_embedding)
+emb_layers.append(mark_embedding)
hidden_0 = mixed_layer(
+ name='hidden0',
size=hidden_dim,
- input=[
- full_matrix_projection(input=word_embedding),
- full_matrix_projection(input=predicate_embedding),
- full_matrix_projection(input=ctx_n1_embedding),
- full_matrix_projection(input=ctx_0_embedding),
- full_matrix_projection(input=ctx_p1_embedding),
- full_matrix_projection(input=mark_embedding),
- ])
+ bias_attr=std_default,
+    input=[full_matrix_projection(input=emb, param_attr=std_default)
+           for emb in emb_layers])
+
-lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr)
+mix_hidden_lr = 1e-3
+lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
+hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(name='lstm0',
+ input=hidden_0,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
#stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
+
for i in range(1, depth):
- fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr)
+ mix_hidden = mixed_layer(name='hidden'+str(i),
+ size=hidden_dim,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+ ]
+ )
+
+ lstm = lstmemory(name='lstm'+str(i),
+ input=mix_hidden,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ reverse=((i % 2)==1),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
+
+ input_tmp = [mix_hidden, lstm]
+
+feature_out = mixed_layer(name='output',
+ size=label_dict_len,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+                      ])
- lstm = lstmemory(
- input=fc,
- act=ReluActivation(),
- reverse=(i % 2) == 1,
- layer_attr=layer_attr)
- input_tmp = [fc, lstm]
-prob = fc_layer(
- input=input_tmp,
- size=label_dict_len,
- act=SoftmaxActivation(),
- param_attr=para_attr)
if not is_predict:
- cls = classification_cost(input=prob, label=target)
- outputs(cls)
+    crf_l = crf_layer(name='crf',
+                      size=label_dict_len,
+                      input=feature_out,
+                      label=target,
+                      param_attr=ParameterAttribute(name='crfw',
+                                                    initial_std=default_std,
+                                                    learning_rate=mix_hidden_lr))
+
+    crf_dec_l = crf_decoding_layer(name='crf_dec_l',
+                                   size=label_dict_len,
+                                   input=feature_out,
+                                   label=target,
+                                   param_attr=ParameterAttribute(name='crfw'))
+
+
+ eval = sum_evaluator(input=crf_dec_l)
+
+ outputs(crf_l)
+
else:
- outputs(prob)
+    crf_dec_l = crf_decoding_layer(name='crf_dec_l',
+                                   size=label_dict_len,
+                                   input=feature_out,
+                                   param_attr=ParameterAttribute(name='crfw'))
+
+ outputs(crf_dec_l)
+
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
index f051d4175c..2761814e18 100644
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -26,7 +26,7 @@ UNK_IDX = 0
class Prediction():
- def __init__(self, train_conf, dict_file, model_dir, label_file):
+ def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
@@ -35,26 +35,41 @@ class Prediction():
self.dict = {}
self.labels = {}
+ self.predicate_dict={}
self.labels_reverse = {}
- self.load_dict_label(dict_file, label_file)
+ self.load_dict_label(dict_file, label_file, predicate_dict_file)
len_dict = len(self.dict)
len_label = len(self.labels)
-
- conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
- ',label_len=' + str(len_label) + ',is_predict=True')
+ len_pred = len(self.predicate_dict)
+
+ conf = parse_config(
+ train_conf,
+ 'dict_len=' + str(len_dict) +
+ ',label_len=' + str(len_label) +
+ ',pred_len=' + str(len_pred) +
+ ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_pred),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(2)
+ ]
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(2)
-        ]
self.converter = DataProviderConverter(slots)
- def load_dict_label(self, dict_file, label_file):
+ def load_dict_label(self, dict_file, label_file, predicate_dict_file):
"""
Load dictionary from self.dict_file.
"""
@@ -65,39 +80,42 @@ class Prediction():
self.labels[line.strip()] = line_count
self.labels_reverse[line_count] = line.strip()
+ for line_count, line in enumerate(open(predicate_dict_file, 'r')):
+ self.predicate_dict[line.strip()] = line_count
def get_data(self, data_file):
"""
Get input data of paddle format.
"""
with open(data_file, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip(
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
).split('\t')
words = sentence.split()
sen_len = len(words)
-
+
word_slot = [self.dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len
+ predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+ ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
+
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot
-
- def predict(self, data_file):
+ def predict(self, data_file, output_file):
"""
data_file: file name of input data.
"""
input = self.converter(self.get_data(data_file))
output = self.network.forwardTest(input)
- prob = output[0]["value"]
- lab = list(np.argsort(-prob)[:, 0])
+ lab = output[0]["id"].tolist()
- with open(data_file, 'r') as fin, open('predict.res', 'w') as fout:
+ with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
index = 0
for line in fin:
sen = line.split('\t')[0]
@@ -109,8 +127,8 @@ class Prediction():
def option_parser():
- usage = ("python predict.py -c config -w model_dir "
- "-d word dictionary -l label_file -i input_file")
+ usage = ("python predict.py -c config -w model_dir "
+ "-d word dictionary -l label_file -i input_file -p pred_dict_file")
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-c",
@@ -131,6 +149,13 @@ def option_parser():
dest="label_file",
default=None,
help="label file")
+ parser.add_option(
+ "-p",
+ "--predict_dict_file",
+ action="store",
+ dest="predict_dict_file",
+ default=None,
+ help="predict_dict_file")
parser.add_option(
"-i",
"--data",
@@ -144,6 +169,14 @@ def option_parser():
dest="model_path",
default=None,
help="model path")
+
+ parser.add_option(
+ "-o",
+ "--output_file",
+ action="store",
+ dest="output_file",
+ default=None,
+ help="output file")
return parser.parse_args()
@@ -154,10 +187,12 @@ def main():
dict_file = options.dict_file
model_path = options.model_path
label_file = options.label_file
+ predict_dict_file = options.predict_dict_file
+ output_file = options.output_file
swig_paddle.initPaddle("--use_gpu=0")
- predict = Prediction(train_conf, dict_file, model_path, label_file)
- predict.predict(data_file)
+ predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
+    predict.predict(data_file, output_file)
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
index a545b9a5d5..d0acdb0bd0 100644
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
@@ -26,15 +26,18 @@ LOG=`get_best_pass $log`
LOG=(${LOG})
best_model_path="output/pass-${LOG[1]}"
-
config_file=db_lstm.py
-dict_file=./data/src.dict
-label_file=./data/tgt.dict
+dict_file=./data/wordDict.txt
+label_file=./data/targetDict.txt
+predicate_dict_file=./data/verbDict.txt
input_file=./data/feature
+output_file=predict.res
python predict.py \
-c $config_file \
-w $best_model_path \
-l $label_file \
+ -p $predicate_dict_file \
-d $dict_file \
- -i $input_file
+ -i $input_file \
+ -o $output_file
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 844649e8c0..c4ab44f5ca 100644
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -36,4 +36,5 @@ paddle train \
--job=test \
--use_gpu=false \
--config_args=is_test=1 \
+ --test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index c3a22b644b..420768bb2b 100644
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -16,11 +16,14 @@
set -e
paddle train \
--config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
--save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
+ 2>&1 | tee 'train.log'
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
index 894070e7c9..114a9138eb 100644
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
@@ -29,6 +29,7 @@ settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
+ average_window=0.5,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst
index 399c5da5ff..01d2caefb5 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/algorithm/rnn/rnn.rst
@@ -17,7 +17,7 @@ PaddlePaddle does not need any preprocessing to sequence data, such as padding.
.. code-block:: python
- settings.slots = [
+ settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
diff --git a/doc/demo/semantic_role_labeling/curve.jpg b/doc/demo/semantic_role_labeling/curve.jpg
new file mode 100644
index 0000000000..baa35ae7f0
Binary files /dev/null and b/doc/demo/semantic_role_labeling/curve.jpg differ
diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
index 890f731458..e2793b2b34 100644
--- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md
+++ b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
@@ -1,183 +1,200 @@
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]:
-
- [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ].
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank.
-
-To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem.
-
-## Data Description
-The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website.
-
-To download and process the original data, user just need to execute the following command:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data `directory as follows.
-```bash
-conll05st-release:the test data set of CoNll-2005 shared task
-test.wsj.words:the Wall Street Journal data sentences
-test.wsj.props: the propositional arguments
-src.dict:the dictionary of words in sentences
-tgt.dict:the labels dictionary
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
-
-Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model.
-
-The following figure shows a temporal expanded 2-layer DB-LSTM network.
-
-
-
-
-### Features
-Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
-
-
-
-
-In this sample, the coresponding labelled sentence is:
-
-[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] .
-
-In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
- settings.word_dict = word_dict
- settings.label_dict = label_dict
- #all inputs are integral and sequential type
- settings.slots = [
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)),
- integer_value_sequence(2),
- integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as following:
-```
-@provider(use_seq=True, init_hook=hook)
-def process(obj, file_name):
- with open(file_name, 'r') as fdata:
- for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip().split('\t')
- words = sentence.split()
- sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
-
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX) ] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX) ] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX) ] * sen_len
-
- marks = mark.split()
- mark_slot = [int(w) for w in marks]
-
- label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
-```
-The `process`function yield 7 lists which are six features and labels.
-
-### Neural Network Config
-`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure.
-
-Seven `data_layer` load instances from data provider. Six features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
-
-### Run Training
-The script for training is `train.sh`, user just need to execute:
-```bash
- ./train.sh
-```
-The content in `train.sh`:
-```
-paddle train \
- --config=./db_lstm.py \
- --save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
- --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
-- \--config=./db_lstm.py : network config file.
-- \--save_di=./output: output path to save models.
-- \--trainer_count=4 : set thread number (or GPU count).
-- \--log_period=10 : print log every 20 batches.
-- \--num_passes=500: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
-- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train.
-- \--show_parameter_stats_period=10: show parameter statistic every 100 batches.
-- \--test_all_data_in_one_period=1: test all data in every testing.
-
-
-After training, the models will be saved in directory `output`.
-
-### Run testing
-The script for testing is `test.sh`, user just need to execute:
-```bash
- ./test.sh
-```
-The main part in `tesh.sh`
-```
-paddle train \
- --config=./db_lstm.py \
- --model_list=$model_list \
- --job=test \
- --config_args=is_test=1 \
-```
-
- - \--config=./db_lstm.py: network config file
- - \--model_list=$model_list.list: model list file
- - \--job=test: indicate the test job
- - \--config_args=is_test=1: flag to indicate test
-
-
-### Run prediction
-The script for prediction is `predict.sh`, user just need to execute:
-```bash
- ./predict.sh
-
-```
-In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
-```
-python predict.py
- -c $config_file
- -w $model_path
- -l $label_file
- -d $dict_file
- -i $input_file
-```
-
-`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
-
-After prediction, the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005.
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
+# Semantic Role Labeling Tutorial #
+
+Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization and question answering. An instance is as follows [1]:
+
+ [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ].
+
+- V: verb
+- A0: acceptor
+- A1: thing accepted
+- A2: accepted-from
+- A3: Attribute
+- AM-MOD: modal
+- AM-NEG: negation
+
+Given the verb "accept", the chunks in the sentence play certain semantic roles. Here, the label scheme is from the Penn Proposition Bank.
+
+To date, most of the successful SRL systems have been built on top of some form of parsing results, where pre-defined feature templates over the syntactic structure are used. This tutorial presents an end-to-end system using a deep bidirectional long short-term memory (DB-LSTM) network [2] to solve the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards the SRL task as a sequence labeling problem.
+
+## Data Description
+The relevant paper [2] uses the data sets of the CoNLL-2005 & 2012 Shared Tasks for training and testing. Due to the data license, this demo adopts the test data set of CoNLL-2005, which is publicly available on the task website.
+
+To download and process the original data, simply execute the following commands:
+
+```bash
+cd data
+./get_data.sh
+```
+Several new files appear in the `data` directory, as follows.
+```bash
+conll05st-release: the test data set of the CoNLL-2005 shared task
+test.wsj.words: the Wall Street Journal data sentences
+test.wsj.props: the propositional arguments
+feature: the features extracted from the data set
+wordDict.txt: the word dictionary
+targetDict.txt: the label dictionary
+verbDict.txt: the predicate dictionary
+emb: the pre-trained word embeddings
+```
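+
+The three dictionary files map one token per line to its line index. Below is a minimal, runnable sketch of loading them (`load_dict` is a hypothetical helper; `db_lstm.py` performs the same enumerate-based loop inline):
+
+```python
+def load_dict(path):
+    # One entry per line; a token's id is its line number.
+    with open(path, 'r') as f:
+        return dict((line.strip(), i) for i, line in enumerate(f))
+
+word_dict = load_dict('./data/wordDict.txt')
+label_dict = load_dict('./data/targetDict.txt')
+predicate_dict = load_dict('./data/verbDict.txt')
+```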
+
+## Training
+### DB-LSTM
+Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
+
+Unlike the bidirectional LSTM used in the Sentiment Analysis demo, DB-LSTM stacks LSTM layers in a different way. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, which processes the sequence in the reverse direction. These two standard LSTM layers compose a pair. We then stack LSTM layers pair after pair to obtain a deep LSTM model, as sketched below.
+
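+In `db_lstm.py` (shown in full earlier in this diff), this alternation is driven by the loop index via `reverse=((i % 2) == 1)`. A trivial, runnable sketch of the resulting direction pattern:
+
+```python
+depth = 8  # number of stacked LSTM layers, as in db_lstm.py
+# Layer 0 (lstm_0) runs forward; layer i runs in reverse when i % 2 == 1,
+# matching reverse=((i % 2) == 1) in the config.
+directions = ['backward' if i % 2 else 'forward' for i in range(depth)]
+print(directions)
+```
+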
+The following figure shows a temporally expanded 2-layer DB-LSTM network.
+
+
+
+
+### Features
+Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted. A single predicate word cannot fully describe the predicate information, especially when the same word appears more than once in a sentence; with the predicate context, this ambiguity can be largely eliminated. Similarly, we use the region mark mr = 1 to denote that an argument position is inside the predicate context region, and mr = 0 otherwise. These four simple features are all we need for our SRL system. The features of one sample with the context size set to 1 are shown as follows [2]:
+
+
+
+
+In this sample, the corresponding labeled sentence is:
+
+[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] .
+
+In the demo, we adopt a feature template similar to the one above, consisting of `argument`, `predicate`, `ctx-p (p=-2,-1,0,1,2)` and `mark`, and use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`.
+
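+A simplified, self-contained sketch of the region mark computation for the sample sentence (`region_mark` is a hypothetical helper; `extract_dict_feature.py` inlines this logic with per-position boundary checks):
+
+```python
+def region_mark(words, verb_index, window=2):
+    # mr = 1 for positions inside the predicate context region, 0 otherwise.
+    mark = [0] * len(words)
+    lo = max(0, verb_index - window)
+    hi = min(len(words) - 1, verb_index + window)
+    for i in range(lo, hi + 1):
+        mark[i] = 1
+    return mark
+
+words = "A record date has n't been set .".split()
+print(region_mark(words, words.index('set')))  # [0, 0, 0, 0, 1, 1, 1, 1]
+```
+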
+### Data Provider
+
+`dataprovider.py` is the Python file that wraps the data. The `hook()` function defines the data slots for the network. The 8 features and the label are all index slots (integer sequences).
+```
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
+    settings.word_dict = word_dict
+    settings.label_dict = label_dict
+    settings.predicate_dict = predicate_dict
+    #all inputs are integral and sequential type
+ settings.slots = [
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(2),
+ integer_value_sequence(len(label_dict))]
+```
+The corresponding data iterator is as follows:
+```
+def get_batch_size(yield_data):
+    return len(yield_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
+ with open(file_name, 'r') as fdata:
+ for line in fdata:
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
+ line.strip().split('\t')
+
+ words = sentence.split()
+ sen_len = len(words)
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
+
+ predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+ marks = mark.split()
+ mark_slot = [int(w) for w in marks]
+
+ label_list = label.split()
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
+```
+The `process` function yields 9 lists: 8 features and the label.
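+
+For example, with a toy word dictionary (ids made up purely for illustration), the first yielded slot looks like:
+
+```python
+UNK_IDX = 0
+word_dict = {'set': 5, 'record': 9, 'date': 11}  # toy ids, illustration only
+words = "A record date has n't been set .".split()
+word_slot = [word_dict.get(w, UNK_IDX) for w in words]
+print(word_slot)  # [0, 9, 11, 0, 0, 0, 5, 0]
+```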
+
+### Neural Network Config
+`db_lstm.py` is the neural network config file. It loads the dictionaries and defines the data provider module and the network architecture used during training.
+
+Nine `data_layer`s load instances from the data provider. The eight features are transformed into embeddings respectively and combined by a `mixed_layer`. Deep bidirectional LSTM layers then extract features for the output layer, and a `crf_layer` computes the cost over the label sequence.
+
+### Run Training
+The script for training is `train.sh`; simply execute:
+```bash
+ ./train.sh
+```
+The content in `train.sh`:
+```
+paddle train \
+ --config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
+ --save_dir=./output \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
+ --test_all_data_in_one_period=1 \
+2>&1 | tee 'train.log'
+```
+
+- \--config=./db_lstm.py: network config file.
+- \--use_gpu=0: train on CPU; set it to 1 if you have installed the GPU version of PaddlePaddle and want to train on GPU. Note that `crf_layer` does not support GPU yet.
+- \--log_period=5000: print a log every 5000 batches.
+- \--trainer_count=1: set the thread number (or GPU count).
+- \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
+- \--save_dir=./output: output path to save models.
+- \--num_passes=10000: set the number of passes; one pass in PaddlePaddle means training on all samples in the dataset once.
+- \--average_test_period=10000000: test on the averaged parameters every average_test_period batches.
+- \--init_model_path=./data: parameter initialization path.
+- \--load_missing_parameter_strategy=rand: randomly initialize parameters missing from the initial model.
+- \--test_all_data_in_one_period=1: test all data in one period.
+
+
+After training, the models will be saved in the directory `output`. Our training curve is shown below:
+
+
+
+
+### Run testing
+The script for testing is `test.sh`; simply execute:
+```bash
+ ./test.sh
+```
+The main part of `test.sh`:
+```
+paddle train \
+ --config=./db_lstm.py \
+ --model_list=$model_list \
+ --job=test \
+    --config_args=is_test=1 \
+    --test_all_data_in_one_period=1 \
+```
+
+ - \--config=./db_lstm.py: network config file
+ - \--model_list=$model_list: model list file
+ - \--job=test: indicate the test job
+ - \--config_args=is_test=1: flag to indicate testing
+ - \--test_all_data_in_one_period=1: test all data in one period
+
+
+### Run prediction
+The script for prediction is `predict.sh`; simply execute:
+```bash
+  ./predict.sh
+```
+In `predict.sh`, the user should provide the network config file, model path, label file, predicate dictionary file, word dictionary file, input feature file and output file:
+```
+python predict.py \
+ -c $config_file \
+ -w $best_model_path \
+ -l $label_file \
+ -p $predicate_dict_file \
+ -d $dict_file \
+ -i $input_file \
+ -o $output_file
+```
+
+`predict.py` is the main executable Python script, which loads the model, loads the data and runs prediction. The network ends with a CRF decoding layer (`crf_decoding_layer`), so it directly outputs the best label sequence for each sentence instead of a probability matrix.
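+
+A small sketch of mapping decoded label ids back to tag strings (mirroring `load_dict_label` in `predict.py`; the ids below are made up for illustration):
+
+```python
+labels_reverse = {}
+for line_count, line in enumerate(open('./data/targetDict.txt', 'r')):
+    labels_reverse[line_count] = line.strip()
+
+decoded_ids = [0, 3, 7]  # example ids from the CRF decoding layer
+print([labels_reverse[i] for i in decoded_ids])
+```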
+
+After prediction, the result is saved in `predict.res`.
+
+## Reference
+[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005.
+
+[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/demo/sentiment_analysis/sentiment_analysis.md b/doc/demo/sentiment_analysis/sentiment_analysis.md
index 385f49891d..c53952c544 100644
--- a/doc/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/demo/sentiment_analysis/sentiment_analysis.md
@@ -6,7 +6,7 @@ Sentiment analysis is also used to monitor social media based on large amount of
On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products.
-This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the [Internet Movie Database (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
+This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
## Data Preparation
@@ -39,7 +39,7 @@ imdbEr.txt imdb.vocab README test train
* imdbEr.txt: expected rating for each token in imdb.vocab.
* README: data documentation.
-Both train and test set directory contains:
+The files in the train set directory are listed below. The test set contains the same files except `unsup` and `urls_unsup.txt`.
```
labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt
@@ -151,6 +151,7 @@ settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
+ average_window=0.5,
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
@@ -163,17 +164,18 @@ stacked_lstm_net(dict_dim, class_dim=class_dim,
* **Data Definition**:
* get\_config\_arg(): get arguments setted by `--config_args=xx` in commandline argument.
- * Define TrainData and TestData provider, here using Python interface (PyDataProviderWrapper) of PaddlePaddle to load data. For details, you can refer to the document of PyDataProvider.
+   * Define the data provider, here using the Python interface to load data. For details, you can refer to the PyDataProvider2 documentation.
* **Algorithm Configuration**:
- * use sgd algorithm.
- * use adam optimization.
* set batch size of 128.
- * set average sgd window.
* set global learning rate.
+ * use adam optimization.
+ * set average sgd window.
+ * set L2 regularization.
+ * set gradient clipping threshold.
* **Network Configuration**:
- * dict_dim: get dictionary dimension.
- * class_dim: set category number, IMDB has two label, namely positive and negative label.
+ * dict_dim: dictionary dimension.
+  * class_dim: the number of categories; IMDB has two labels, positive and negative.
* `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default.
* `bidirectional_lstm_net`: predefined network as shown in Figure 2.
diff --git a/doc/dev/new_layer/new_layer.rst b/doc/dev/new_layer/new_layer.rst
index 2fa0073048..af8b76a307 100644
--- a/doc/dev/new_layer/new_layer.rst
+++ b/doc/dev/new_layer/new_layer.rst
@@ -60,7 +60,7 @@ Implement C++ Class
The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.
-It needs to derive the base class :code:`paddle::BaseLayer`, and it needs to override the following functions:
+It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions:
- constructor and destructor.
- :code:`init` function. It is used to initialize the parameters and settings.
diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst
index 44ecb34885..667bf1364e 100644
--- a/doc/optimization/gpu_profiling.rst
+++ b/doc/optimization/gpu_profiling.rst
@@ -53,7 +53,7 @@ above profilers.
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
- :lines: 107-121
+ :lines: 111-124
:linenos:
The above code snippet includes two methods, you can use any of them to profile the regions of interest.
@@ -75,12 +75,12 @@ To enable built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_
Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function.
As a simple example, consider the following:
-1. Add :code:`REGISTER_TIMER_INFO` and :code:`printStatus` functions (see the emphasize-lines).
+1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
- :lines: 107-121
- :emphasize-lines: 10-11,14
+ :lines: 111-124
+ :emphasize-lines: 8-10,13
:linenos:
2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
@@ -126,8 +126,8 @@ To use this command line profiler **nvprof**, you can simply issue the following
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
- :lines: 107-121
- :emphasize-lines: 7-8
+ :lines: 111-124
+ :emphasize-lines: 6-7
:linenos:
2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
diff --git a/doc/source/api/api.rst b/doc/source/api.rst
similarity index 90%
rename from doc/source/api/api.rst
rename to doc/source/api.rst
index 6fc450202d..30396c26b6 100644
--- a/doc/source/api/api.rst
+++ b/doc/source/api.rst
@@ -1,5 +1,5 @@
API
-========
+===
.. doxygenfile:: paddle/api/PaddleAPI.h
.. doxygenfile:: paddle/api/Internal.h
diff --git a/doc/source/cuda/cuda/cuda.rst b/doc/source/cuda/cuda/cuda.rst
deleted file mode 100644
index 52f17c2b2e..0000000000
--- a/doc/source/cuda/cuda/cuda.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Cuda
-=============
-
-Dynamic Link Libs
---------------------------
-
-hl_dso_loader.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
-
-GPU Resources
-----------------
-
-hl_cuda.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
-
-hl_cuda.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.h
-
-CUDA Wrapper
---------------
-
-hl_cuda_cublas.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
-
-
-
-
diff --git a/doc/source/cuda/cuda/index.rst b/doc/source/cuda/cuda/index.rst
deleted file mode 100644
index 5fa38ff0fc..0000000000
--- a/doc/source/cuda/cuda/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-CUDA
-====================
-
-.. toctree::
- :maxdepth: 3
-
- cuda.rst
diff --git a/doc/source/cuda/index.rst b/doc/source/cuda/index.rst
new file mode 100644
index 0000000000..b0fed2e7f7
--- /dev/null
+++ b/doc/source/cuda/index.rst
@@ -0,0 +1,9 @@
+CUDA
+====
+
+.. toctree::
+ :maxdepth: 2
+
+ matrix.rst
+ nn.rst
+ utils.rst
diff --git a/doc/source/cuda/matrix/matrix.rst b/doc/source/cuda/matrix.rst
similarity index 76%
rename from doc/source/cuda/matrix/matrix.rst
rename to doc/source/cuda/matrix.rst
index dd4f06599c..b7699c83ed 100644
--- a/doc/source/cuda/matrix/matrix.rst
+++ b/doc/source/cuda/matrix.rst
@@ -1,61 +1,59 @@
Matrix
-=======
+======
-Base Matrix
--------------
+Base
+----
hl_matrix.h
-``````````````````
+```````````
.. doxygenfile:: paddle/cuda/include/hl_matrix.h
hl_matrix_base.h
-``````````````````
+````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_base.cuh
hl_matrix_apply.cuh
-``````````````````````
+```````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_apply.cuh
hl_matrix_ops.cuh
-``````````````````````
+`````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_ops.cuh
hl_matrix_type.cuh
-``````````````````````
+``````````````````
.. doxygenfile:: paddle/cuda/include/hl_matrix_type.cuh
hl_sse_matrix_kernel.cuh
-``````````````````````````
+````````````````````````
.. doxygenfile:: paddle/cuda/include/hl_sse_matrix_kernel.cuh
+Matrix Function
+---------------
+
hl_batch_transpose.h
-``````````````````````````
+````````````````````
.. doxygenfile:: paddle/cuda/include/hl_batch_transpose.h
-Sparse Matrix
---------------
-
-hl_sparse.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.h
-
-hl_sparse.ph
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
-
-Others
----------------
-
hl_aggregate.h
-``````````````````
+``````````````
.. doxygenfile:: paddle/cuda/include/hl_aggregate.h
+hl_top_k.h
+``````````
+.. doxygenfile:: paddle/cuda/include/hl_top_k.h
+
hl_table_apply.h
-``````````````````
+````````````````
.. doxygenfile:: paddle/cuda/include/hl_table_apply.h
-hl_top_k.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_top_k.h
+Sparse Matrix
+-------------
+hl_sparse.h
+```````````
+.. doxygenfile:: paddle/cuda/include/hl_sparse.h
+hl_sparse.ph
+````````````
+.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
diff --git a/doc/source/cuda/matrix/index.rst b/doc/source/cuda/matrix/index.rst
deleted file mode 100644
index 63f95eb466..0000000000
--- a/doc/source/cuda/matrix/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Matrix
-====================
-
-.. toctree::
- :maxdepth: 3
-
- matrix.rst
diff --git a/doc/source/cuda/rnn/rnn.rst b/doc/source/cuda/nn.rst
similarity index 79%
rename from doc/source/cuda/rnn/rnn.rst
rename to doc/source/cuda/nn.rst
index ce8ed96692..5577d01e72 100644
--- a/doc/source/cuda/rnn/rnn.rst
+++ b/doc/source/cuda/nn.rst
@@ -1,36 +1,39 @@
-Neural Networks
-==================
+Neural Network
+==============
Base
--------
+----
+
.. doxygenfile:: paddle/cuda/include/hl_gpu.h
-.. doxygenfile:: paddle/cuda/include/hl_cnn.h
.. doxygenfile:: paddle/cuda/include/hl_functions.h
.. doxygenfile:: paddle/cuda/include/hl_avx_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_functions.cuh
-
-Activation Functions
------------------------
.. doxygenfile:: paddle/cuda/include/hl_activation_functions.h
+
+CNN Related APIs
+----------------
+.. doxygenfile:: paddle/cuda/include/hl_cnn.h
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
+
RNN Related APIs
------------------
+----------------
.. doxygenfile:: paddle/cuda/include/hl_recurrent_apply.cuh
.. doxygenfile:: paddle/cuda/include/hl_sequence.h
LSTM Model
-``````````````
+``````````
+
.. doxygenfile:: paddle/cuda/include/hl_lstm.h
.. doxygenfile:: paddle/cuda/include/hl_cpu_lstm.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_lstm.cuh
.. doxygenfile:: paddle/cuda/include/hl_lstm_ops.cuh
GRU Model
-````````````````
+`````````
+
.. doxygenfile:: paddle/cuda/include/hl_gru_ops.cuh
.. doxygenfile:: paddle/cuda/include/hl_cpu_gru.cuh
.. doxygenfile:: paddle/cuda/include/hl_gpu_gru.cuh
-
-
diff --git a/doc/source/cuda/rnn/index.rst b/doc/source/cuda/rnn/index.rst
deleted file mode 100644
index 4913e47ba1..0000000000
--- a/doc/source/cuda/rnn/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-RNN
-====================
-
-.. toctree::
- :maxdepth: 3
-
- rnn.rst
diff --git a/doc/source/cuda/utils.rst b/doc/source/cuda/utils.rst
new file mode 100644
index 0000000000..850e8bd1c6
--- /dev/null
+++ b/doc/source/cuda/utils.rst
@@ -0,0 +1,37 @@
+Utils
+=====
+
+Dynamic Link Libs
+-----------------
+.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
+
+GPU Resources
+-------------
+
+hl_cuda.ph
+``````````
+.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
+
+hl_cuda.h
+`````````
+.. doxygenfile:: paddle/cuda/include/hl_cuda.h
+
+HPPL Base
+---------
+.. doxygenfile:: paddle/cuda/include/hl_base.h
+
+CUBLAS Wrapper
+--------------
+.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
+
+Timer
+-----
+.. doxygenfile:: paddle/cuda/include/hl_time.h
+
+Thread Resource
+---------------
+.. doxygenfile:: paddle/cuda/include/hl_thread.ph
+
+Device Function
+---------------
+.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
diff --git a/doc/source/cuda/utils/index.rst b/doc/source/cuda/utils/index.rst
deleted file mode 100644
index 7a84cbe27d..0000000000
--- a/doc/source/cuda/utils/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Utils
-====================
-
-.. toctree::
- :maxdepth: 3
-
- utils.rst
diff --git a/doc/source/cuda/utils/utils.rst b/doc/source/cuda/utils/utils.rst
deleted file mode 100644
index 1ea3e5404a..0000000000
--- a/doc/source/cuda/utils/utils.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-Utilities
-===========
-
-HPPL Base
-------------
-
-hl_base.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_base.h
-
-Timer
------------
-
-hl_time.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_time.h
-
-Thread Resource
------------
-
-hl_thread.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_thread.ph
diff --git a/doc/source/gserver/activations/index.rst b/doc/source/gserver/activations.rst
similarity index 83%
rename from doc/source/gserver/activations/index.rst
rename to doc/source/gserver/activations.rst
index ccdae41128..55b9d3be38 100644
--- a/doc/source/gserver/activations/index.rst
+++ b/doc/source/gserver/activations.rst
@@ -1,5 +1,5 @@
Activations
-=============
+===========
.. doxygenclass:: paddle::ActivationFunction
:members:
diff --git a/doc/source/gserver/dataprovider/index.rst b/doc/source/gserver/dataprovider/index.rst
deleted file mode 100644
index 4f6077f122..0000000000
--- a/doc/source/gserver/dataprovider/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Data Providers Documents
-==========================
-
-.. toctree::
- :maxdepth: 3
-
- dataproviders.rst
diff --git a/doc/source/gserver/dataprovider/dataproviders.rst b/doc/source/gserver/dataproviders.rst
similarity index 87%
rename from doc/source/gserver/dataprovider/dataproviders.rst
rename to doc/source/gserver/dataproviders.rst
index e8aa4bc356..c30d9d6a36 100644
--- a/doc/source/gserver/dataprovider/dataproviders.rst
+++ b/doc/source/gserver/dataproviders.rst
@@ -1,23 +1,27 @@
+==============
Data Providers
-================
+==============
-Base DataProvider
-------------------
+DataProviders
+=============
+
+Base
+----
.. doxygenclass:: paddle::DataProvider
:members:
DataProviderGroup
--------------------
+-----------------
.. doxygenclass:: paddle::DataProviderGroup
:members:
MultiDataProvider
--------------------
+-----------------
.. doxygenclass:: paddle::MultiDataProvider
:members:
PyDataProvider
-===================
+==============
IFieldScanner
-------------
@@ -45,7 +49,7 @@ SparseValueScanner
:members:
SequenceScanner
-------------------
+---------------
.. doxygenclass:: paddle::SequenceScanner
:members:
@@ -69,8 +73,8 @@ IPyDataProvider
.. doxygenclass:: paddle::PyDataProvider2
:members:
-Proto Data Provider
-===================
+ProtoDataProvider
+=================
ProtoDataProvider
-----------------
@@ -78,6 +82,6 @@ ProtoDataProvider
:members:
ProtoSequenceDataProvider
-----------------
+-------------------------
.. doxygenclass:: paddle::ProtoSequenceDataProvider
:members:
diff --git a/doc/source/gserver/evaluators/evaluators.rst b/doc/source/gserver/evaluators.rst
similarity index 96%
rename from doc/source/gserver/evaluators/evaluators.rst
rename to doc/source/gserver/evaluators.rst
index 0c5cc85e7d..f5361f76cd 100644
--- a/doc/source/gserver/evaluators/evaluators.rst
+++ b/doc/source/gserver/evaluators.rst
@@ -1,14 +1,15 @@
-Base Evaluator
-==============
+==========
+Evaluators
+==========
+
+Base
+====
-Evaluator
----------
.. doxygenclass:: paddle::Evaluator
:members:
-
-Utils
-=====
+Sum
+===
SumEvaluator
------------
diff --git a/doc/source/gserver/evaluators/index.rst b/doc/source/gserver/evaluators/index.rst
deleted file mode 100644
index 298de3e1a3..0000000000
--- a/doc/source/gserver/evaluators/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Evaluators
-==========
-
-.. toctree::
- :maxdepth: 3
-
- evaluators.rst
diff --git a/doc/source/gserver/gradientmachines/gradientmachines.rst b/doc/source/gserver/gradientmachines.rst
similarity index 54%
rename from doc/source/gserver/gradientmachines/gradientmachines.rst
rename to doc/source/gserver/gradientmachines.rst
index 3607664c85..04c8e91d03 100644
--- a/doc/source/gserver/gradientmachines/gradientmachines.rst
+++ b/doc/source/gserver/gradientmachines.rst
@@ -1,18 +1,18 @@
Gradient Machines
-================
+=================
GradientMachine
----------------------
+---------------
.. doxygenclass:: paddle::GradientMachine
:members:
-GradientMachineModel
---------------------
+GradientMachineMode
+-------------------
.. doxygenclass:: paddle::IGradientMachineMode
:members:
MultiGradientMachine
----------------------
+--------------------
.. doxygenclass:: paddle::MultiGradientMachine
:members:
@@ -21,20 +21,7 @@ TrainerThread
.. doxygenclass:: paddle::TrainerThread
:members:
-Recurrent Gradient Machines
----------------------------
+RecurrentGradientMachine
+------------------------
.. doxygenclass:: paddle::RecurrentGradientMachine
:members:
-
-Networks
-========
-
-NeuralNetwork
--------------
-.. doxygenclass:: paddle::NeuralNetwork
- :members:
-
-ParallelNeuralNetwork
----------------------
-.. doxygenclass:: paddle::ParallelNeuralNetwork
- :members:
diff --git a/doc/source/gserver/gradientmachines/index.rst b/doc/source/gserver/gradientmachines/index.rst
deleted file mode 100644
index 997c29a102..0000000000
--- a/doc/source/gserver/gradientmachines/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Gradient Machines Documents
-=============================
-
-.. toctree::
- :maxdepth: 3
-
- gradientmachines.rst
diff --git a/doc/source/gserver/index.rst b/doc/source/gserver/index.rst
new file mode 100644
index 0000000000..223b00b9a9
--- /dev/null
+++ b/doc/source/gserver/index.rst
@@ -0,0 +1,12 @@
+GServer
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ activations.rst
+ dataproviders.rst
+ evaluators.rst
+ gradientmachines.rst
+ layers.rst
+ networks.rst
diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers.rst
similarity index 95%
rename from doc/source/gserver/layers/layer.rst
rename to doc/source/gserver/layers.rst
index 4b8e149505..191b2bdff2 100644
--- a/doc/source/gserver/layers/layer.rst
+++ b/doc/source/gserver/layers.rst
@@ -1,6 +1,10 @@
-Base
+======
+Layers
======
+Base
+====
+
Layer
-----
.. doxygenclass:: paddle::Layer
@@ -17,7 +21,7 @@ Operator
:members:
Data Layer
-===========
+==========
.. doxygenclass:: paddle::DataLayer
:members:
@@ -58,6 +62,11 @@ CudnnConvLayer
.. doxygenclass:: paddle::CudnnConvLayer
:members:
+ExpandConvBaseLayer
+-------------------
+.. doxygenclass:: paddle::ExpandConvBaseLayer
+ :members:
+
ExpandConvLayer
---------------
.. doxygenclass:: paddle::ExpandConvLayer
@@ -86,6 +95,16 @@ CudnnPoolLayer
.. doxygenclass:: paddle::CudnnPoolLayer
:members:
+SpatialPyramidPoolLayer
+-----------------------
+.. doxygenclass:: paddle::SpatialPyramidPoolLayer
+ :members:
+
+MaxOutLayer
+-----------
+.. doxygenclass:: paddle::MaxOutLayer
+ :members:
+
Norm Layers
===========
@@ -402,6 +421,11 @@ TransLayer
Sampling Layers
===============
+BilinearInterpLayer
+-------------------
+.. doxygenclass:: paddle::BilinearInterpLayer
+ :members:
+
MultinomialSampler
------------------
.. doxygenclass:: paddle::MultinomialSampler
diff --git a/doc/source/gserver/layers/index.rst b/doc/source/gserver/layers/index.rst
deleted file mode 100644
index 559c5436b1..0000000000
--- a/doc/source/gserver/layers/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Layers Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- layer.rst
diff --git a/doc/source/gserver/networks.rst b/doc/source/gserver/networks.rst
new file mode 100644
index 0000000000..73fb60d549
--- /dev/null
+++ b/doc/source/gserver/networks.rst
@@ -0,0 +1,12 @@
+Networks
+========
+
+NeuralNetwork
+-------------
+.. doxygenclass:: paddle::NeuralNetwork
+ :members:
+
+ParallelNeuralNetwork
+---------------------
+.. doxygenclass:: paddle::ParallelNeuralNetwork
+ :members:
diff --git a/doc/source/index.md b/doc/source/index.md
deleted file mode 100644
index 55fcdeb3df..0000000000
--- a/doc/source/index.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Source Code Documents
-
-## cuda
-
-- [CUDA](cuda/cuda/index.rst)
-- [Matrix](cuda/matrix/index.rst)
-- [RNN](cuda/rnn/index.rst)
-- [Utils](cuda/utils/index.rst)
-
-## gserver
-
-- [Activations](gserver/activations/index.rst)
-- [Data Providers](gserver/dataprovider/index.rst)
-- [Evaluators](gserver/evaluators/index.rst)
-- [Gradient Machines](gserver/gradientmachines/index.rst)
-- [Layers](gserver/layers/index.rst)
-
-## math
-
-- [Matrix](math/matrix/index.rst)
-- [Utils](math/utils/index.rst)
-
-## parameter
-
-- [Parameter](parameter/parameter/index.rst)
-- [Update](parameter/update/index.rst)
-- [Optimizer](parameter/optimizer/index.rst)
-
-## pserver
-
-- [Client](pserver/client/index.rst)
-- [Network](pserver/network/index.rst)
-- [Server](pserver/server/index.rst)
-
-## trainer
-
-- [Trainer](trainer/trainer.rst)
-
-## api
-
-- [API](api/api.rst)
-
-## utils
-
-- [CustomStackTrace](utils/customStackTrace.rst)
-- [Enumeration wrapper](utils/enum.rst)
-- [Lock](utils/lock.rst)
-- [Queue](utils/queue.rst)
-- [Thread](utils/thread.rst)
diff --git a/doc/source/index.rst b/doc/source/index.rst
new file mode 100644
index 0000000000..36323c888e
--- /dev/null
+++ b/doc/source/index.rst
@@ -0,0 +1,14 @@
+Source Code Documents
+=====================
+
+.. toctree::
+ :maxdepth: 1
+
+ gserver/index.rst
+ trainer.rst
+ parameter/index.rst
+ pserver/index.rst
+ api.rst
+ cuda/index.rst
+ math/index.rst
+ utils/index.rst
diff --git a/doc/source/math/functions.rst b/doc/source/math/functions.rst
new file mode 100644
index 0000000000..aef12e0f00
--- /dev/null
+++ b/doc/source/math/functions.rst
@@ -0,0 +1,10 @@
+Functions
+=========
+
+MathFunctions
+-------------
+.. doxygenfile:: paddle/math/MathFunctions.h
+
+SIMDFunctions
+-------------
+.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/index.rst b/doc/source/math/index.rst
new file mode 100644
index 0000000000..2ec16f2b44
--- /dev/null
+++ b/doc/source/math/index.rst
@@ -0,0 +1,10 @@
+Math
+====
+
+.. toctree::
+ :maxdepth: 2
+
+ vector.rst
+ matrix.rst
+ functions.rst
+ utils.rst
diff --git a/doc/source/math/matrix.rst b/doc/source/math/matrix.rst
new file mode 100644
index 0000000000..9bb20f618d
--- /dev/null
+++ b/doc/source/math/matrix.rst
@@ -0,0 +1,76 @@
+Matrix
+======
+
+Base
+----
+
+BaseMatrix Template
+```````````````````
+.. doxygenclass:: paddle::BaseMatrixT
+ :members:
+
+Matrix
+``````
+.. doxygenclass:: paddle::Matrix
+ :members:
+
+MatrixOffset
+````````````
+.. doxygenclass:: paddle::MatrixOffset
+ :members:
+
+CpuMatrix
+---------
+
+CpuMatrix
+`````````
+.. doxygenclass:: paddle::CpuMatrix
+ :members:
+
+SharedCpuMatrix
+```````````````
+.. doxygenclass:: paddle::SharedCpuMatrix
+ :members:
+
+GpuMatrix
+---------
+.. doxygenclass:: paddle::GpuMatrix
+ :members:
+
+CpuSparseMatrix
+---------------
+
+CpuSparseMatrix
+```````````````
+.. doxygenclass:: paddle::CpuSparseMatrix
+ :members:
+
+SparseRowCpuMatrix
+``````````````````
+.. doxygenclass:: paddle::SparseRowCpuMatrix
+ :members:
+
+SparseAutoGrowRowCpuMatrix
+``````````````````````````
+.. doxygenclass:: paddle::SparseAutoGrowRowCpuMatrix
+ :members:
+
+SparsePrefetchRowCpuMatrix
+``````````````````````````
+.. doxygenclass:: paddle::SparsePrefetchRowCpuMatrix
+ :members:
+
+SparseRowIdsCpuMatrix
+`````````````````````
+.. doxygenclass:: paddle::SparseRowIdsCpuMatrix
+ :members:
+
+CacheRowCpuMatrix
+`````````````````
+.. doxygenclass:: paddle::CacheRowCpuMatrix
+ :members:
+
+GpuSparseMatrix
+---------------
+.. doxygenclass:: paddle::GpuSparseMatrix
+ :members:
diff --git a/doc/source/math/matrix/index.rst b/doc/source/math/matrix/index.rst
deleted file mode 100644
index 68410f2a27..0000000000
--- a/doc/source/math/matrix/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Matrix Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- matrix.rst
diff --git a/doc/source/math/matrix/matrix.rst b/doc/source/math/matrix/matrix.rst
deleted file mode 100644
index b12e3934f4..0000000000
--- a/doc/source/math/matrix/matrix.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-Matrix
-=======
-
-Base
---------
-.. doxygenfile:: paddle/math/BaseMatrix.h
-
-Sparse Matrix
-----------------
-.. doxygenfile:: paddle/math/Matrix.h
-.. doxygenfile:: paddle/math/Vector.h
-.. doxygenfile:: paddle/math/MathUtils.h
-.. doxygenfile:: paddle/math/SparseMatrix.h
-.. doxygenfile:: paddle/math/SparseRowMatrix.h
-.. doxygenfile:: paddle/math/CpuSparseMatrix.h
-
-Others
-----------
-.. doxygenfile:: paddle/math/MathFunctions.h
-.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/source/math/utils/utils.rst b/doc/source/math/utils.rst
similarity index 62%
rename from doc/source/math/utils/utils.rst
rename to doc/source/math/utils.rst
index 3df721a47b..55d9961a39 100644
--- a/doc/source/math/utils/utils.rst
+++ b/doc/source/math/utils.rst
@@ -1,9 +1,18 @@
-Utils
-=======
+Memory Manager
+==============
Memory Handle
---------------
+-------------
.. doxygenfile:: paddle/math/MemoryHandle.h
+
+Allocator
+---------
.. doxygenfile:: paddle/math/Allocator.h
+
+PoolAllocator
+`````````````
.. doxygenfile:: paddle/math/PoolAllocator.h
+
+Storage
+-------
.. doxygenfile:: paddle/math/Storage.h
diff --git a/doc/source/math/utils/index.rst b/doc/source/math/utils/index.rst
deleted file mode 100644
index e5fe335da2..0000000000
--- a/doc/source/math/utils/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Utils Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- utils.rst
diff --git a/doc/source/math/vector.rst b/doc/source/math/vector.rst
new file mode 100644
index 0000000000..07f7062aba
--- /dev/null
+++ b/doc/source/math/vector.rst
@@ -0,0 +1,37 @@
+Vector
+======
+
+BaseVector
+``````````
+.. doxygenclass:: paddle::BaseVector
+ :members:
+
+Vector Template
+```````````````
+.. doxygenclass:: paddle::VectorT
+ :members:
+
+CpuVector Template
+``````````````````
+.. doxygenclass:: paddle::CpuVectorT
+ :members:
+
+GpuVector Template
+``````````````````
+.. doxygenclass:: paddle::GpuVectorT
+ :members:
+
+ParallelCpuVector Template
+``````````````````````````
+.. doxygenclass:: paddle::ParallelCpuVectorT
+ :members:
+
+ParallelGpuVector Template
+``````````````````````````
+.. doxygenclass:: paddle::ParallelGpuVectorT
+ :members:
+
+CpuGpuVector Template
+`````````````````````
+.. doxygenclass:: paddle::CpuGpuVectorT
+ :members:
diff --git a/doc/source/parameter/index.rst b/doc/source/parameter/index.rst
new file mode 100644
index 0000000000..3bf6948dc3
--- /dev/null
+++ b/doc/source/parameter/index.rst
@@ -0,0 +1,9 @@
+Parameter
+=========
+
+.. toctree::
+ :maxdepth: 2
+
+ parameter.rst
+ optimizer.rst
+ updater.rst
diff --git a/doc/source/parameter/optimizer.rst b/doc/source/parameter/optimizer.rst
new file mode 100644
index 0000000000..b5b8b850b3
--- /dev/null
+++ b/doc/source/parameter/optimizer.rst
@@ -0,0 +1,22 @@
+Optimizer
+=========
+
+ParameterOptimizer
+------------------
+.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
+
+Regularizer
+-----------
+.. doxygenfile:: paddle/parameter/Regularizer.h
+
+FirstOrderOptimizer
+-------------------
+.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
+
+AverageOptimizer
+----------------
+.. doxygenfile:: paddle/parameter/AverageOptimizer.h
+
+OptimizerWithRegularizer
+------------------------
+.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/optimizer/index.rst b/doc/source/parameter/optimizer/index.rst
deleted file mode 100644
index 3338af5608..0000000000
--- a/doc/source/parameter/optimizer/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- optimizer.rst
diff --git a/doc/source/parameter/optimizer/optimizer.rst b/doc/source/parameter/optimizer/optimizer.rst
deleted file mode 100644
index 3d9e49217e..0000000000
--- a/doc/source/parameter/optimizer/optimizer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Optimizer
-============
-
-.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
-.. doxygenfile:: paddle/parameter/AverageOptimizer.h
-.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
-.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/source/parameter/parameter/parameter.rst b/doc/source/parameter/parameter.rst
similarity index 66%
rename from doc/source/parameter/parameter/parameter.rst
rename to doc/source/parameter/parameter.rst
index 2b7afdb409..2daa62d4e6 100644
--- a/doc/source/parameter/parameter/parameter.rst
+++ b/doc/source/parameter/parameter.rst
@@ -1,16 +1,12 @@
Parameter
-=============
-
-Weight
---------
-.. doxygenfile:: paddle/parameter/Weight.h
-
-Regularizer
-------------
-.. doxygenfile:: paddle/parameter/Regularizer.h
+=========
Parameter
--------------
+---------
.. doxygenfile:: paddle/parameter/Argument.h
.. doxygenfile:: paddle/parameter/Parameter.h
.. doxygenfile:: paddle/parameter/ParallelParameter.h
+
+Weight
+------
+.. doxygenfile:: paddle/parameter/Weight.h
diff --git a/doc/source/parameter/parameter/index.rst b/doc/source/parameter/parameter/index.rst
deleted file mode 100644
index e7ed70ec4c..0000000000
--- a/doc/source/parameter/parameter/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- parameter.rst
diff --git a/doc/source/parameter/update/index.rst b/doc/source/parameter/update/index.rst
deleted file mode 100644
index 1bbd733193..0000000000
--- a/doc/source/parameter/update/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Parameter Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- update.rst
diff --git a/doc/source/parameter/update/update.rst b/doc/source/parameter/updater.rst
similarity index 75%
rename from doc/source/parameter/update/update.rst
rename to doc/source/parameter/updater.rst
index c417602f03..dfa22e8e7d 100644
--- a/doc/source/parameter/update/update.rst
+++ b/doc/source/parameter/updater.rst
@@ -1,7 +1,14 @@
-Update
-==========
+Updater
+=======
+Base
+----
.. doxygenfile:: paddle/parameter/ParameterUpdaterBase.h
+
+Hook
+----
.. doxygenfile:: paddle/parameter/ParameterUpdaterHook.h
-.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
+Functions
+---------
+.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
diff --git a/doc/source/pserver/client.rst b/doc/source/pserver/client.rst
new file mode 100644
index 0000000000..e5bba0706a
--- /dev/null
+++ b/doc/source/pserver/client.rst
@@ -0,0 +1,12 @@
+Client
+======
+
+BaseClient
+----------
+.. doxygenclass:: paddle::BaseClient
+ :members:
+
+ParameterClient2
+----------------
+.. doxygenclass:: paddle::ParameterClient2
+ :members:
diff --git a/doc/source/pserver/client/client.rst b/doc/source/pserver/client/client.rst
deleted file mode 100644
index fc7ed90d3d..0000000000
--- a/doc/source/pserver/client/client.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Client
-=========
-
-.. doxygenclass:: paddle::BaseClient
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-.. doxygenclass:: paddle::ParameterClient2
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/pserver/client/index.rst b/doc/source/pserver/client/index.rst
deleted file mode 100644
index dc924c9ca8..0000000000
--- a/doc/source/pserver/client/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Client Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- client.rst
diff --git a/doc/source/pserver/index.rst b/doc/source/pserver/index.rst
new file mode 100644
index 0000000000..0031e9476b
--- /dev/null
+++ b/doc/source/pserver/index.rst
@@ -0,0 +1,10 @@
+PServer
+=======
+
+.. toctree::
+ :maxdepth: 2
+
+ client.rst
+ network.rst
+ server.rst
+ utils.rst
diff --git a/doc/source/pserver/network.rst b/doc/source/pserver/network.rst
new file mode 100644
index 0000000000..7004c9d91f
--- /dev/null
+++ b/doc/source/pserver/network.rst
@@ -0,0 +1,27 @@
+Network
+=======
+
+SocketServer
+------------
+.. doxygenclass:: paddle::SocketServer
+ :members:
+
+SocketWorker
+------------
+.. doxygenclass:: paddle::SocketWorker
+ :members:
+
+SocketClient
+------------
+.. doxygenclass:: paddle::SocketClient
+ :members:
+
+SocketChannel
+-------------
+.. doxygenclass:: paddle::SocketChannel
+ :members:
+
+MessageReader
+-------------
+.. doxygenclass:: paddle::MsgReader
+ :members:
diff --git a/doc/source/pserver/network/index.rst b/doc/source/pserver/network/index.rst
deleted file mode 100644
index 2fdf95e17d..0000000000
--- a/doc/source/pserver/network/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Network Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- network.rst
diff --git a/doc/source/pserver/network/network.rst b/doc/source/pserver/network/network.rst
deleted file mode 100644
index e000ff8dbb..0000000000
--- a/doc/source/pserver/network/network.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-Network
-==========
-
-Socket Server
-----------------
-.. doxygenclass:: paddle::SocketServer
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Worker
-----------------
-.. doxygenclass:: paddle::SocketWorker
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Client
-----------------
-.. doxygenclass:: paddle::SocketClient
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Socket Channel
----------------
-.. doxygenclass:: paddle::SocketChannel
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-Message Reader
----------------
-.. doxygenclass:: paddle::MsgReader
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/pserver/server.rst b/doc/source/pserver/server.rst
new file mode 100644
index 0000000000..35301acf8f
--- /dev/null
+++ b/doc/source/pserver/server.rst
@@ -0,0 +1,12 @@
+Server
+======
+
+ProtoServer
+-----------
+.. doxygenclass:: paddle::ProtoServer
+ :members:
+
+ParameterServer2
+----------------
+.. doxygenclass:: paddle::ParameterServer2
+ :members:
diff --git a/doc/source/pserver/server/index.rst b/doc/source/pserver/server/index.rst
deleted file mode 100644
index 09e3530bfe..0000000000
--- a/doc/source/pserver/server/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Server Documents
-====================
-
-.. toctree::
- :maxdepth: 3
-
- server.rst
diff --git a/doc/source/pserver/server/server.rst b/doc/source/pserver/server/server.rst
deleted file mode 100644
index f3110fdd73..0000000000
--- a/doc/source/pserver/server/server.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Server
-==========
-
-.. doxygenclass:: paddle::ProtoServer
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
-
-.. doxygenclass:: paddle::ParameterServer2
- :members:
- :protected-members:
- :private-members:
- :undoc-members:
diff --git a/doc/source/trainer/trainer.rst b/doc/source/trainer.rst
similarity index 94%
rename from doc/source/trainer/trainer.rst
rename to doc/source/trainer.rst
index 12c24597e7..85f1feb4fc 100644
--- a/doc/source/trainer/trainer.rst
+++ b/doc/source/trainer.rst
@@ -14,7 +14,7 @@ RemoteParameterUpdater
:members:
ConcurrentRemoteParameterUpdater
----------------------------------
+--------------------------------
.. doxygenclass:: paddle::ConcurrentRemoteParameterUpdater
:members:
diff --git a/doc/source/utils/customStackTrace.rst b/doc/source/utils/customStackTrace.rst
index a4e6f05a40..cdc8930739 100644
--- a/doc/source/utils/customStackTrace.rst
+++ b/doc/source/utils/customStackTrace.rst
@@ -1,9 +1,4 @@
CustomStackTrace
================
-
-
-class CustomStackTrace
-----------------------
-
.. doxygenclass:: paddle::CustomStackTrace
:members:
diff --git a/doc/source/utils/enum.rst b/doc/source/utils/enum.rst
index 17166d35f7..e0da75afe1 100644
--- a/doc/source/utils/enum.rst
+++ b/doc/source/utils/enum.rst
@@ -1,9 +1,3 @@
-enumeration_wrapper
+Enumeration wrapper
===================
-
-
-namespace paddle::enumeration_wrapper
--------------------------------------
-
.. doxygennamespace:: paddle::enumeration_wrapper
-
diff --git a/doc/source/utils/index.rst b/doc/source/utils/index.rst
new file mode 100644
index 0000000000..7ddc47d172
--- /dev/null
+++ b/doc/source/utils/index.rst
@@ -0,0 +1,11 @@
+Utils
+=====
+
+.. toctree::
+ :maxdepth: 2
+
+ lock.rst
+ queue.rst
+ thread.rst
+ customStackTrace.rst
+ enum.rst
diff --git a/doc/source/utils/lock.rst b/doc/source/utils/lock.rst
index 0b027e403f..f011acb943 100644
--- a/doc/source/utils/lock.rst
+++ b/doc/source/utils/lock.rst
@@ -1,37 +1,32 @@
-Thread
-======
+Lock
+====
-
-class Thread
-------------
-
-.. doxygenclass:: paddle::Thread
+RWLock
+------
+.. doxygenclass:: paddle::RWLock
:members:
-
-class ThreadWorker
-------------------
-
-.. doxygenclass:: paddle::ThreadWorker
+ReadLockGuard
+-------------
+.. doxygenclass:: paddle::ReadLockGuard
:members:
-
-class SyncThreadPool
---------------------
-
-.. doxygenclass:: paddle::SyncThreadPool
+SpinLock
+--------
+.. doxygenclass:: paddle::SpinLock
:members:
-
-
-class MultiThreadWorker
------------------------
-.. doxygenclass:: paddle::MultiThreadWorker
+Semaphore
+---------
+.. doxygenclass:: paddle::Semaphore
:members:
-
-class AsyncThreadPool
----------------------
+ThreadBarrier
+-------------
+.. doxygenclass:: paddle::ThreadBarrier
+ :members:
-.. doxygenclass:: paddle::AsyncThreadPool
+LockedCondition
+---------------
+.. doxygenclass:: paddle::LockedCondition
:members:
diff --git a/doc/source/utils/queue.rst b/doc/source/utils/queue.rst
index 72a464ca67..98192648e2 100644
--- a/doc/source/utils/queue.rst
+++ b/doc/source/utils/queue.rst
@@ -1,16 +1,12 @@
Queue
=====
-
-class Queue
-------------
-
+Queue
+-----
.. doxygenclass:: paddle::Queue
:members:
-
-class BlockingQueue
--------------------
-
+BlockingQueue
+-------------
.. doxygenclass:: paddle::BlockingQueue
:members:
diff --git a/doc/source/utils/thread.rst b/doc/source/utils/thread.rst
index 2eb67dde6a..23d379a989 100644
--- a/doc/source/utils/thread.rst
+++ b/doc/source/utils/thread.rst
@@ -1,40 +1,27 @@
-Lock
-====
+Thread
+======
-
-class RWLock
-------------
-
-.. doxygenclass:: paddle::RWLock
+Thread
+------
+.. doxygenclass:: paddle::Thread
:members:
-class ReadLockGuard
--------------------
-
-.. doxygenclass:: paddle::ReadLockGuard
+ThreadWorker
+------------
+.. doxygenclass:: paddle::ThreadWorker
:members:
-class SpinLock
+SyncThreadPool
--------------
-
-.. doxygenclass:: paddle::SpinLock
+.. doxygenclass:: paddle::SyncThreadPool
:members:
-
-class Semaphore
----------------
-
-.. doxygenclass:: paddle::Semaphore
- :members:
-
-class ThreadBarrier
--------------------
-
-.. doxygenclass:: paddle::ThreadBarrier
+
+MultiThreadWorker
+-----------------
+.. doxygenclass:: paddle::MultiThreadWorker
:members:
-class LockedCondition
----------------------
-
-.. doxygenclass:: paddle::LockedCondition
+AsyncThreadPool
+---------------
+.. doxygenclass:: paddle::AsyncThreadPool
:members:
-
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc_cn/build_and_install/cmake/cblas_settings.csv
index d804c0a662..a6356baf16 100644
--- a/doc_cn/build_and_install/cmake/cblas_settings.csv
+++ b/doc_cn/build_and_install/cmake/cblas_settings.csv
@@ -1,4 +1,5 @@
-MKL_ROOT,"Path to MKL. ${MKL_ROOT}/include must contain mkl.h, and ${MKL_ROOT}/lib must contain the mkl_core, mkl_sequential and mkl_intel_lp64 libraries."
-ATLAS_ROOT,"Path to the ATLAS library. ${ATLAS_ROOT}/include must contain cblas.h, and ${ATLAS_ROOT}/lib must contain the cblas and atlas libraries."
-OPENBLAS_ROOT,"${OPENBLAS_ROOT}/include must contain cblas.h, and ${OPENBLAS_ROOT}/lib must contain the openblas library."
-REFERENCE_CBLAS_ROOT,"${REFERENCE_CBLAS_ROOT}/include must contain cblas.h, and ${REFERENCE_CBLAS_ROOT}/lib must contain the cblas library."
\ No newline at end of file
+Option,Description,Notes
+MKL_ROOT,Path to MKL,"${MKL_ROOT}/include must contain mkl.h; ${MKL_ROOT}/lib must contain the mkl_core, mkl_sequential and mkl_intel_lp64 libraries."
+ATLAS_ROOT,Path to ATLAS,"${ATLAS_ROOT}/include must contain cblas.h; ${ATLAS_ROOT}/lib must contain the cblas and atlas libraries."
+OPENBLAS_ROOT,Path to OpenBLAS,"${OPENBLAS_ROOT}/include must contain cblas.h; ${OPENBLAS_ROOT}/lib must contain the openblas library."
+REFERENCE_CBLAS_ROOT,Path to REFERENCE BLAS,"${REFERENCE_CBLAS_ROOT}/include must contain cblas.h; ${REFERENCE_CBLAS_ROOT}/lib must contain the cblas library."
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc_cn/build_and_install/cmake/compile_options.csv
index 0b8015aaee..12b45eebb2 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc_cn/build_and_install/cmake/compile_options.csv
@@ -1,15 +1,14 @@
-Option,Description,Default
-WITH_GPU,"Whether to build with GPU support.",Depends on whether the CUDA toolchain is found
-WITH_DOUBLE,"Whether to use double-precision floating point.",No
-WITH_DSO,"Whether to load the CUDA libraries dynamically at runtime instead of linking them statically.",Yes
-WITH_AVX,"Whether to build PaddlePaddle binaries with the AVX instruction set.",Yes
-WITH_PYTHON,"Whether to embed a Python interpreter. Convenient for embedded use.",Yes
-WITH_STYLE_CHECK,"Whether to run code style checks during compilation.",Yes
-WITH_RDMA,"Whether to enable RDMA support.",No
-WITH_GLOG,"Whether to use GLOG. If not, a simplified logging implementation is used. Convenient for embedded use.",Depends on whether GLOG is found
-WITH_GFLAGS,"Whether to use GFLAGS. If not, a simplified command-line parser is used. Convenient for embedded use.",Depends on whether GFLAGS is found
-WITH_TIMER,"Whether to enable timing. Timing makes runs slightly slower and logs more verbose, but helps debugging and benchmarking.",No
-WITH_TESTING,"Whether to enable unit tests.",Depends on whether gtest is found
-WITH_DOC,"Whether to build the English documentation.",No
-WITH_DOC_CN,"Whether to build the Chinese documentation.",No
-WITH_SWIG_PY,"Whether to build the Python SWIG interface, which eases prediction and customized training.",Depends on whether swig is found
+Option,Description,Default
+WITH_GPU,"Whether to support GPU.",Depends on whether the CUDA toolchain is found
+WITH_DOUBLE,"Whether to use double-precision floating point.",No
+WITH_DSO,"Whether to load the CUDA libraries dynamically at runtime instead of linking them statically.",Yes
+WITH_AVX,"Whether to build PaddlePaddle binaries with the AVX instruction set.",Yes
+WITH_PYTHON,"Whether to embed a Python interpreter, which helps future porting to embedded devices.",Yes
+WITH_STYLE_CHECK,"Whether to run code style checks during compilation.",Yes
+WITH_RDMA,"Whether to enable RDMA.",No
+WITH_GLOG,"Whether to enable GLOG. If not, a simplified logging implementation is used, which also helps future porting to embedded devices.",Depends on whether GLOG is found
+WITH_GFLAGS,"Whether to use GFLAGS. If not, a simplified command-line parser is used, which also helps future porting to embedded devices.",Depends on whether GFLAGS is found
+WITH_TIMER,"Whether to enable timing. Timing makes runs slightly slower and logs more verbose, but helps debugging and benchmarking.",No
+WITH_TESTING,"Whether to enable unit tests.",Depends on whether GTEST is found
+WITH_DOC,"Whether to build the English and Chinese documentation.",No
+WITH_SWIG_PY,"Whether to build the Python SWIG interface, which can be used for prediction and customized training.",Depends on whether SWIG is found
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc_cn/build_and_install/cmake/compile_options.rst
index bb5b18a073..f345ead2bf 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc_cn/build_and_install/cmake/compile_options.rst
@@ -1,62 +1,43 @@
-Setting PaddlePaddle's Compile Options
-======================================
-
-PaddlePaddle's compile options can be set when invoking cmake. cmake is a cross-platform build tool; invoking
-cmake turns the cmake project files into makefiles for each platform. For detailed cmake usage, refer to
-`the official cmake documentation `_ .
-
-PaddlePaddle's compile options control whether CPU/GPU binaries are generated, which blas library is linked, and so on. The full
-list of compile options is as follows
-
-PaddlePaddle's compile options
-------------------------------
-
-Bool compile options
-++++++++++++++++++++
-These options can be set on the cmake command line with the -D flag. For example
-:code:`cmake -D WITH_GPU=OFF`
-
-.. csv-table:: PaddlePaddle's bool compile options
- :widths: 1, 7, 2
- :file: compile_options.csv
-
-BLAS-related compile options
-++++++++++++++++++++++++++++
-
-PaddlePaddle can use `MKL `_ ,
-`Atlas `_ ,
-`OpenBlas `_ or
-`reference Blas `_ , any one of these cblas implementations.
-The blas library to link is selected by specifying its path at compile time.
-
-When compiling, cmake first searches the system paths (/usr/lib\:/usr/local/lib) for these blas implementations, and
-also reads the related path variables to search. The path variables are\:
-
-
-.. csv-table:: PaddlePaddle's cblas compile options
- :widths: 1, 9
- :header: "Option", "Description"
- :file: cblas_settings.csv
-
-All of these variables can be specified with the -D flag, e.g. :code:`cmake -D MKL_ROOT=/opt/mkl/`. They can
-also be set as environment variables before invoking cmake. For example
-
-.. code-block:: bash
-
- export MKL_ROOT=/opt/mkl
- cmake
-
-Note that these variables only take effect on the first run of cmake. To reset them
-afterwards, it is recommended to clean ( :code:`rm -rf` ) the build directory and then specify them again.
-
-cuda/cudnn-related compile options
-++++++++++++++++++++++++++++++++++
-
-PaddlePaddle can be compiled and run with any cudnn version from cudnn v2 onwards, but note that the cudnn used for
-compiling and running should be the same version. The latest cudnn v5.1 is recommended.
-
-When configuring cmake, :code:`CUDNN_ROOT` sets the CUDNN installation path. The flag is again
--D, e.g. :code:`cmake -D CUDNN_ROOT=/opt/cudnnv5` .
-
-Note that these variables only take effect on the first run of cmake. To reset them
-afterwards, it is recommended to clean ( :code:`rm -rf` ) the build directory and then specify them again.
+PaddlePaddle's Compile Options
+==============================
+
+PaddlePaddle's compile options include whether to generate CPU/GPU binaries, which BLAS library to link, and so on. Users can set them when invoking cmake; for detailed cmake usage please refer to the `official documentation `_ .
+
+Bool compile options
+--------------------
+Users can set this type of option on the cmake command line with the ``-D`` flag, for example
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool compile options
+ :widths: 1, 7, 2
+ :file: compile_options.csv
+
+BLAS/CUDA/Cudnn compile options
+-------------------------------
+BLAS
++++++
+
+PaddlePaddle supports any one of the following BLAS libraries: `MKL `_ , `ATLAS `_ , `OpenBLAS `_ and `REFERENCE BLAS `_ .
+
+.. csv-table:: BLAS path-related compile options
+ :widths: 1, 2, 7
+ :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle can be compiled and run with any cudnn version from cudnn v2 onwards, but please keep the cudnn used for compiling and running the same version. We recommend the latest cudnn v5.1.
+
+Setting the compile options
++++++++++++++++++++++++++++
+
+PaddlePaddle locates the various BLAS/CUDA/Cudnn libraries through paths specified at compile time. When cmake runs, it first searches the system paths (/usr/lib\:/usr/local/lib) for these libraries, and also reads the related path variables to search. They can be set with the ``-D`` flag, for example
+
+.. code-block:: bash
+
+ cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
+Note: these options only take effect on the first run of cmake. To reset them afterwards, it is recommended to clean the whole build directory (``rm -rf``) and then specify them again.
\ No newline at end of file
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 6f51d55120..b539374cd4 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
}
void Arguments::setSlotSubSequenceStartPositions(
- size_t idx, IVector *vec) throw(RangeError) {
+ size_t idx, IVector* vec) throw(RangeError) {
auto& a = m->getArg(idx);
auto& v = m->cast(vec->getSharedPtr());
a.subSequenceStartPositions = std::make_shared(v);
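Note: the whitespace-only hunks in these API files all follow one mechanical rule: once a declaration or call no longer fits on one line, every argument moves onto its own line. A minimal illustration of that style, assuming a typical 80-column clang-format setup with parameter bin-packing disabled (the actual format configuration is not part of this hunk):

    #include <cstddef>
    #include <vector>

    // Fits within the column limit: left on a single line.
    void shortCall(int a, int b);

    // Exceeds the column limit: each parameter gets its own line, aligned
    // under the first one, which is the shape seen throughout this diff.
    void veryLongFunctionNameForDemonstration(const std::vector<float>& data,
                                              std::size_t height,
                                              std::size_t width,
                                              bool useGpu);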
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 25d94f5a6a..bc40d871d1 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
@@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
return retv;
}
-TrainerConfig* TrainerConfig::createFromProtoString(
- const std::string& str) {
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared(trainerConfigProto);
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index bef499c678..9a4846d809 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types) {
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector realTypes;
@@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr(
}
GradientMachine* GradientMachine::createByConfigProtoStr(
- const std::string& protoStr, GradientMatchineCreateMode mode,
+ const std::string& protoStr,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types) {
paddle::ModelConfig conf;
conf.ParseFromString(protoStr);
@@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
}
GradientMachine* GradientMachine::createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types) {
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
-void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs,
+void GradientMachine::forward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType) {
auto& in =
 m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
@@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) {
}
void GradientMachine::forwardBackward(const Arguments& inArgs,
- Arguments* outArgs, PassType passType,
+ Arguments* outArgs,
+ PassType passType,
const UpdateCallback& callback) {
auto& in =
 m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
@@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
- throw(UnsupportError) {
+ throw(UnsupportError) {
 auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
if (nn) {
auto mat = nn->getLayerOutput(layerName);
@@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
}
SequenceGenerator* GradientMachine::asSequenceGenerator(
- const std::vector<std::string>& dict, size_t begin_id, size_t end_id,
- size_t max_length, size_t beam_size) {
+ const std::vector<std::string>& dict,
+ size_t begin_id,
+ size_t end_id,
+ size_t max_length,
+ size_t beam_size) {
SequenceGenerator* r =
SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
r->setDict(dict);
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index b990f650be..66a13bc603 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "PaddleAPI.h"
@@ -23,7 +22,8 @@ limitations under the License. */
template <typename T2, typename T1>
void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
 dest->resize(src.size());
- std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){
- return static_cast<T2>(t);
- });
+ std::transform(src.begin(),
+ src.end(),
+ dest->begin(),
+ [](T1 t) { return static_cast<T2>(t); });
}
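The helper above behaves identically before and after the reflow. A self-contained sketch of how it might be exercised; the main driver and the int-to-size_t conversion are illustrative, not part of the patch:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    template <typename T2, typename T1>
    void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
      dest->resize(src.size());
      std::transform(src.begin(),
                     src.end(),
                     dest->begin(),
                     [](T1 t) { return static_cast<T2>(t); });
    }

    int main() {
      std::vector<int> ids = {1, 2, 3};
      std::vector<size_t> casted;
      staticCastVector(&casted, ids);  // element-wise static_cast<size_t>(int)
      printf("%zu\n", casted[1]);      // prints 2
      return 0;
    }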
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index e5493a381a..f257ee65aa 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
@@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
return m;
}
-Matrix* Matrix::createDense(const std::vector<float>& data, size_t height,
- size_t width, bool useGpu) {
+Matrix* Matrix::createDense(const std::vector<float>& data,
+ size_t height,
+ size_t width,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::create(height, width, useGpu);
m->m->mat->copyFrom(data.data(), data.size());
return m;
}
-Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy, bool useGpu)
- throw (UnsupportError) {
+Matrix* Matrix::createDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// Gpu mode only supports copy=True
if (!copy) {
@@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
}
}
-Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy) {
auto m = new Matrix();
if (copy) {
@@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
return m;
}
-Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal, bool isTrans, bool useGpu) {
+Matrix* Matrix::createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal,
+ bool isTrans,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::createSparseMatrix(
- height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
- isTrans, useGpu);
+ height,
+ width,
+ nnz,
+ isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+ isTrans,
+ useGpu);
return m;
}
@@ -221,7 +234,8 @@ FloatArray Matrix::getData() const {
}
void Matrix::sparseCopyFrom(
- const std::vector<int>& rows, const std::vector<int>& cols,
+ const std::vector<int>& rows,
+ const std::vector<int>& cols,
 const std::vector<float>& vals) throw(UnsupportError) {
auto cpuSparseMat =
 std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
@@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom(
void* Matrix::getSharedPtr() const { return &m->mat; }
-void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
+void Matrix::toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
 auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
if (cpuMat) {
@@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
throw UnsupportError();
}
}
-void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
+void Matrix::copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
static_assert(sizeof(paddle::real) == sizeof(float),
"Currently PaddleAPI only support for single "
@@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
 } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
auto src = gpuMat->getData();
auto dest = *view_m_data;
- hl_memcpy_device2host(dest, src,
- sizeof(paddle::real) * (*dim1) * (*dim2));
+ hl_memcpy_device2host(
+ dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
} else {
LOG(WARNING) << "Unexpected Situation";
throw UnsupportError();
@@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
}
}
-void Matrix::copyFromNumpyMat(float* data, int dim1,
+void Matrix::copyFromNumpyMat(float* data,
+ int dim1,
int dim2) throw(UnsupportError, RangeError) {
if (isSparse()) {
throw UnsupportError();
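A rough sketch of driving the factory reformatted above from C++. The FloatArray field names buf and length are assumed from the FloatArray constructor that appears later in this patch (Util.cpp); treat this as illustrative only:

    #include "PaddleAPI.h"

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<float> data = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
      // 2x3 dense matrix on the CPU; the values are copied in.
      Matrix* mat = Matrix::createDense(data, /*height=*/2, /*width=*/3,
                                        /*useGpu=*/false);
      FloatArray arr = mat->getData();  // flat view of the matrix contents
      printf("%zu elements, first = %f\n", arr.length, arr.buf[0]);
      return 0;
    }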
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 5688ece44d..c07facdb12 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -61,8 +60,8 @@ class RangeError {};
/// Not support Error, such as access GPU memory directly, etc.
class UnsupportError : public std::runtime_error {
public:
- UnsupportError() : std::runtime_error(" ") {};
- UnsupportError(const std::string& message) : std::runtime_error(message) {};
+ UnsupportError() : std::runtime_error(" "){};
+ UnsupportError(const std::string& message) : std::runtime_error(message){};
};
/// This type will map to python's list of float.
@@ -112,7 +111,8 @@ public:
/**
* Create A Matrix with height,width, which is filled by zero.
*/
- static Matrix* createZero(size_t height, size_t width,
+ static Matrix* createZero(size_t height,
+ size_t width,
bool useGpu = isUsingGpu());
/**
@@ -124,8 +124,11 @@ public:
*
* @note the default sparse type is SPARSE_CSR.
*/
- static Matrix* createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal = true, bool trans = false,
+ static Matrix* createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal = true,
+ bool trans = false,
bool useGpu = isUsingGpu());
/**
@@ -134,13 +137,17 @@ public:
* @param data list of float should be passed in python.
* @note the value will be copy into a new matrix.
*/
- static Matrix* createDense(const std::vector<float>& data, size_t height,
- size_t width, bool useGpu = isUsingGpu());
-
- static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Matrix* createDense(const std::vector<float>& data,
+ size_t height,
+ size_t width,
+ bool useGpu = isUsingGpu());
+
+ static Matrix* createDenseFromNumpy(
+ float* data,
+ int dim1,
+ int dim2,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -151,7 +158,9 @@ public:
* @param copy true if copy into a new matrix, false will create
* matrix inplace.
*/
- static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+ static Matrix* createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy = false);
/// Create Gpu Dense Matrix from numpy matrix, dtype=float32
@@ -171,11 +180,13 @@ public:
* numpy_mat = m.toNumpyMat()
* @endcode
*/
- void toNumpyMatInplace(float** view_data, int* dim1,
+ void toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy To numpy mat.
- void copyToNumpyMat(float** view_m_data, int* dim1,
+ void copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy From Numpy Mat
@@ -248,15 +259,18 @@ public:
 static Vector* create(const std::vector<float>& data,
bool useGpu = isUsingGpu());
- static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Vector* createVectorFromNumpy(
+ float* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Vector from numpy array, which dtype=float32
*
* If copy is false, it will create vector inplace.
*/
- static Vector* createCpuVectorFromNumpy(float* data, int dim,
+ static Vector* createCpuVectorFromNumpy(float* data,
+ int dim,
bool copy = false);
/// Create Gpu Vector from numpy array, which dtype=float32
@@ -312,16 +326,19 @@ public:
 static IVector* create(const std::vector<int>& data,
bool useGpu = isUsingGpu());
- static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static IVector* createVectorFromNumpy(
+ int* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu IVector from numpy array, which dtype=int32
*
* If copy is false, it will create vector inplace
*/
- static IVector* createCpuVectorFromNumpy(int* data, int dim,
+ static IVector* createCpuVectorFromNumpy(int* data,
+ int dim,
bool copy = false);
/**
* Create Gpu IVector from numpy array, which dtype=int32
@@ -605,7 +622,8 @@ class ParameterTraverseCallback {
public:
~ParameterTraverseCallback();
- void apply(const std::vector<Vector*>& vecs, const ParameterConfig& config,
+ void apply(const std::vector<Vector*>& vecs,
+ const ParameterConfig& config,
size_t sparseId);
private:
@@ -638,7 +656,8 @@ public:
void finishBatch();
- void update(const std::vector<Vector*>& vecs, const ParameterConfig& conf,
+ void update(const std::vector<Vector*>& vecs,
+ const ParameterConfig& conf,
size_t sparseId = NO_SPARSE_ID);
 std::vector<int> getParameterTypes() const;
@@ -678,7 +697,8 @@ public:
* model config by TrainerConfig
*/
static GradientMachine* createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
 const std::vector<int>& parameterTypes = defaultParamTypes);
/**
@@ -701,7 +721,8 @@ public:
/**
* Combine forward/backward
*/
- void forwardBackward(const Arguments& inArgs, Arguments* outArgs,
+ void forwardBackward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType,
const UpdateCallback& callback = UpdateCallback());
@@ -722,14 +743,17 @@ public:
*/
SequenceGenerator* asSequenceGenerator(
 const std::vector<std::string>& dict = std::vector<std::string>(),
- size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL,
+ size_t begin_id = 0UL,
+ size_t end_id = 0UL,
+ size_t max_length = 100UL,
size_t beam_size = -1UL);
private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
 const std::vector<int>& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
@@ -751,8 +775,8 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
- static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
- throw(IOError);
+ static Trainer* create(TrainerConfig* optConfig,
+ GradientMachine* gm) throw(IOError);
/// Start training
void startTrain();
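The throw(...) specifiers reflowed in this header document the API's error contract. A sketch of the GPU-only copy rule (the Matrix.cpp hunk earlier shows copy=false together with useGpu=true being rejected):

    #include "PaddleAPI.h"

    #include <iostream>

    int main() {
      float buf[4] = {0.f, 1.f, 2.f, 3.f};
      try {
        // GPU mode only supports copy=true, so this call is expected to throw.
        Matrix* m = Matrix::createDenseFromNumpy(buf, 2, 2, /*copy=*/false,
                                                 /*useGpu=*/true);
        (void)m;
      } catch (const UnsupportError& e) {
        std::cerr << "unsupported: " << e.what() << std::endl;
      }
      return 0;
    }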
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 8b56adc97c..c5876bb1c7 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/parameter/Parameter.h"
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index b13761ab09..21d031e4bc 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
@@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate {
const paddle::ParameterOptimizer::TraverseCallback& callback)
: callback(callback) {}
- void apply(const std::vector<Vector*>& vecs, const ParameterConfig& conf,
+ void apply(const std::vector<Vector*>& vecs,
+ const ParameterConfig& conf,
 size_t sparseId) {
 std::vector<paddle::VectorPtr> real_vecs;
real_vecs.resize(vecs.size());
- std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
- if (v) {
- return *(paddle::VectorPtr*)(v->getSharedPtr());
- } else {
- return paddle::VectorPtr();
- }
- });
+ std::transform(vecs.begin(),
+ vecs.end(),
+ real_vecs.begin(),
+ [](Vector* v) {
+ if (v) {
+ return *(paddle::VectorPtr*)(v->getSharedPtr());
+ } else {
+ return paddle::VectorPtr();
+ }
+ });
paddle::ParameterConfig& real_conf =
 *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
@@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
 void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
- const ParameterConfig& conf, size_t sparseId) {
- ParameterTraverseCallbackPrivate invoker([&](
- const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config,
- size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+ const ParameterConfig& conf,
+ size_t sparseId) {
+ ParameterTraverseCallbackPrivate invoker(
+ [&](const paddle::VectorPtr _vecs[],
+ const paddle::ParameterConfig& config,
+ size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
invoker.apply(vecs, conf, sparseId);
}
@@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
const ParameterConfig& config) const {
- auto& param_config = *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(
- config).getRawPtr();
+ auto& param_config =
+ *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
+ .getRawPtr();
auto callback = m->optimizer->needSpecialTraversal(param_config);
if (callback) {
auto retCallback = new ParameterTraverseCallback();
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 9d353ccc8e..d51be78d45 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
// position
static void findNBest(paddle::GradientMachine* gradMachine,
 std::vector<paddle::Argument>& inArgs,
- std::vector<Path>& finalPaths, size_t bos_id,
- size_t eos_id, size_t max_length) {
+ std::vector<Path>& finalPaths,
+ size_t bos_id,
+ size_t eos_id,
+ size_t max_length) {
 std::vector<Path> paths;
Path emptyPath;
paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
if (id < getSize()) {
Path& p = (*path_)[id];
std::ostringstream sout;
- std::transform(p.ids.begin(), p.ids.end(),
+ std::transform(p.ids.begin(),
+ p.ids.end(),
 std::ostream_iterator<std::string>(sout, split ? " " : ""),
[&](int id) { return (*dict_)[id]; });
return sout.str();
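The reflowed std::transform above joins generated word ids into a sentence string. The same idiom in isolation, with an invented dictionary:

    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> dict = {"<s>", "hello", "world", "</s>"};
      std::vector<int> ids = {0, 1, 2, 3};
      std::ostringstream sout;
      std::transform(ids.begin(),
                     ids.end(),
                     std::ostream_iterator<std::string>(sout, " "),
                     [&](int id) { return dict[id]; });
      std::cout << sout.str() << std::endl;  // "<s> hello world </s> "
      return 0;
    }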
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index b61f36f740..7a6aa69fb6 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
- m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
+ m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
}
-Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
- throw(IOError)
-{
+Trainer* Trainer::create(TrainerConfig* config,
+ GradientMachine* gm) throw(IOError) {
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
@@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
 auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
- this->m->getGradientMachine());
+ this->m->getGradientMachine());
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m);
}
-void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
+void Trainer::forwardOneBatch(size_t batchSize) {
+ m->forwardOneBatch(batchSize);
+}
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
@@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
void TrainerPrivate::forwardOneDataBatch(
const std::vector<paddle::Argument>& inArgs) {
-
std::vector<paddle::Argument>& outArgs = forwardOutput_;
if (config_->getOptConfig().use_sparse_remote_updater()) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index a8932351a6..1bba1df2e1 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l)
IntArray::IntArray(const int* b, const size_t l, bool f)
: buf(b), length(l), needFree(f) {}
-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+ const int* i,
+ size_t l,
bool f)
: valBuf(v), idxBuf(i), length(l), needFree(f) {}
-bool isUsingGpu() {return FLAGS_use_gpu;}
+bool isUsingGpu() { return FLAGS_use_gpu; }
-void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;}
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
bool isGpuVersion() {
#ifdef PADDLE_ONLY_CPU
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index d44cdefc35..cc1c098223 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Vector.h"
@@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
return v;
}
-IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+IVector* IVector::createVectorFromNumpy(int* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=true is supported
if (!copy) {
@@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
} else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(int) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
@@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
}
}
-Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+Vector* Vector::createVectorFromNumpy(float* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=True is supported
if (!copy) {
@@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
} else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(float) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index cdb730bb3c..11dbfb54b2 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -81,5 +81,8 @@ else()
add_library(paddle_cuda ${CUDA_SOURCES})
endif()
-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+ ${CUDA_SOURCES}
+ ${CUDA_HEADERS}
+ ${CUDA_DSO_SOURCES}
+ ${CUDA_CXX_WITH_GPU_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index c8aabc7844..03e15b2223 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
@@ -21,11 +20,8 @@ limitations under the License. */
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
-#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \
- hppl::relu, \
- hppl::tanh, \
- hppl::linear \
- }
+#define HPPL_ACTIVE_FUNCTION \
+ { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
@@ -42,18 +38,18 @@ public:
#ifdef __NVCC__
namespace gpu {
-static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#else
namespace cpu {
-static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#ifdef __AVX__
namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#endif
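
For context on the hunk above: HPPL_ACTIVE_FUNCTION expands to a brace initializer, so a single list of functions seeds each of the gpu/cpu/avx dispatch tables, and the tables are indexed by hl_activation_mode_t from hl_base.h (sigmoid=0, relu=1, tanh=2, linear=3). A simplified self-contained sketch of the pattern (the typedefs are stand-ins for the header's definitions):

#include <cmath>

typedef float real;

template <class T>
struct Active {
  typedef T (*forward)(T);
};

namespace hppl {
real sigmoid(real a) { return 1 / (1 + std::exp(-a)); }
real relu(real a) { return a > 0 ? a : 0; }
real tanh(real a) { return std::tanh(a); }
real linear(real a) { return a; }
}  // namespace hppl

#define HPPL_ACTIVE_FUNCTION \
  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }

static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;

real activate(int mode, real x) {
  return forward[mode](x);  // mode follows the hl_activation_mode_t ordering
}
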
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index db75809f5d..a6d9ff8483 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_H_
#define HL_AGGREGATE_H_
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index cf062dd969..ed339e312a 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include <immintrin.h>
namespace hppl {
- __m256 relu(const __m256 a);
- __m256 sigmoid(const __m256 a);
- __m256 tanh(const __m256 a);
- __m256 linear(const __m256 a);
-
- __m256 relu(const __m256 a, const __m256 b);
- __m256 sigmoid(const __m256 a, const __m256 b);
- __m256 tanh(const __m256 a, const __m256 b);
- __m256 linear(const __m256 a, const __m256 b);
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_
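
The declarations above are the AVX counterparts of the scalar activations in hl_functions.h; their definitions are not part of this patch. A plausible one-liner for the forward relu, shown only as a sketch:

#include <immintrin.h>

__m256 relu_sketch(const __m256 a) {
  return _mm256_max_ps(a, _mm256_setzero_ps());  // elementwise max(a, 0)
}
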
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 9f80898a1f..a076952467 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-
#ifndef HL_BASE_H_
#define HL_BASE_H_
@@ -33,36 +31,36 @@ limitations under the License. */
* HPPL_STREAM_DEFAULT is HPPL default stream.
*/
typedef enum {
- HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
- HPPL_STREAM_1 = 1,
- HPPL_STREAM_2 = 2,
- HPPL_STREAM_3 = 3,
- HPPL_STREAM_4 = 4,
- HPPL_THREAD_STREAM_1 = 5,
- HPPL_THREAD_STREAM_2 = 6,
- HPPL_THREAD_STREAM_3 = 7,
- HPPL_THREAD_STREAM_4 = 8,
- HPPL_STREAM_END
+ HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+ HPPL_STREAM_1 = 1,
+ HPPL_STREAM_2 = 2,
+ HPPL_STREAM_3 = 3,
+ HPPL_STREAM_4 = 4,
+ HPPL_THREAD_STREAM_1 = 5,
+ HPPL_THREAD_STREAM_2 = 6,
+ HPPL_THREAD_STREAM_3 = 7,
+ HPPL_THREAD_STREAM_4 = 8,
+ HPPL_STREAM_END
} hl_stream_t;
/**
* @brief HPPL activation mode.
*/
typedef enum {
- HL_ACTIVATION_SIGMOID = 0,
- HL_ACTIVATION_RELU = 1,
- HL_ACTIVATION_TANH = 2,
- HL_ACTIVATION_LINEAR = 3,
- HL_ACTIVATION_END
+ HL_ACTIVATION_SIGMOID = 0,
+ HL_ACTIVATION_RELU = 1,
+ HL_ACTIVATION_TANH = 2,
+ HL_ACTIVATION_LINEAR = 3,
+ HL_ACTIVATION_END
} hl_activation_mode_t;
/**
* @brief Transpose type.
*/
typedef enum {
- HPPL_OP_N = 0, /* transpose */
- HPPL_OP_T = 1, /* non transpose */
- HPPL_OP_END
+ HPPL_OP_N = 0, /* non transpose */
+ HPPL_OP_T = 1, /* transpose */
+ HPPL_OP_END
} hl_trans_op_t;
/**
@@ -148,23 +146,21 @@ typedef struct {
* @brief Sparse matrix value type.
*/
typedef enum {
- HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
- HL_FLOAT_VALUE = 1,
- HL_VALUE_END
+ HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+ HL_FLOAT_VALUE = 1,
+ HL_VALUE_END
} hl_matrix_value_t;
-
/**
* @brief HPPL matrix format.
*/
typedef enum {
- HL_SPARSE_CSR = 0,
- HL_SPARSE_CSC = 1,
- HL_SPARSE_END
+ HL_SPARSE_CSR = 0,
+ HL_SPARSE_CSC = 1,
+ HL_SPARSE_END
} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;
/**
* @brief HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
* @param nnz nonzero values of sparse matrix.
*/
typedef struct {
- hl_matrix_s matrix;
- hl_matrix_format_t format;
- hl_matrix_value_t type;
- int rows;
- int cols;
- size_t nnz;
+ hl_matrix_s matrix;
+ hl_matrix_format_t format;
+ hl_matrix_value_t type;
+ int rows;
+ int cols;
+ size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
*
* HL_FLOAT_MIN: 1.17549435e-38F
*/
-#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MAX 3.40282347e+38F
/**
* if real == double
*
@@ -203,20 +199,18 @@ typedef struct {
*
* HL_FLOAT_MIN: 2.2250738585072014e-308
*/
-#define HL_FLOAT_MIN 1.17549435e-38F
+#define HL_FLOAT_MIN 1.17549435e-38F
#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
#endif
-
/**
* The maximum input value for exp, used to avoid overflow problem.
*
* Currently only used for tanh function.
*/
-#define EXP_MAX_INPUT 40.0
-
+#define EXP_MAX_INPUT 40.0
/**
* @brief DIVUP(x, y) is similar to ceil(x / y).
@@ -224,7 +218,7 @@ typedef struct {
* the size of blockDim.
*/
#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
#endif
#ifdef __NVCC__
@@ -233,7 +227,7 @@ typedef struct {
#include "hl_cuda.h"
#include "cuda_runtime.h"
-extern __thread bool g_sync_flag;
+extern __thread bool g_sync_flag;
extern __thread cudaStream_t default_stream;
#define STREAM_DEFAULT default_stream
@@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream;
* @brief Check cuda kernel execution.
* @param msg error string
*/
-#define CHECK_SYNC(msg) \
- if (true == g_sync_flag) { \
- hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
- cudaError_t err \
- = (cudaError_t)hl_get_device_last_error(); \
- CHECK_EQ(cudaSuccess, err) << "[" << msg << "] " \
- << "CUDA error: " \
- << hl_get_device_error_string((size_t)err); \
+#define CHECK_SYNC(msg) \
+ if (true == g_sync_flag) { \
+ hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
+ cudaError_t err = (cudaError_t)hl_get_device_last_error(); \
+ CHECK_EQ(cudaSuccess, err) \
+ << "[" << msg << "] " \
+ << "CUDA error: " << hl_get_device_error_string((size_t)err); \
}
-#endif /* __NVCC__ */
+#endif /* __NVCC__ */
-#endif /* HL_BASE_H_ */
+#endif /* HL_BASE_H_ */
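
One note on the DIVUP hunk above: DIVUP(x, y) is ceiling division for positive integers, typically used to size CUDA grids so the final partial block is still launched. A tiny illustration (generic usage, not a specific Paddle call site):

#define DIVUP(x, y) (((x) + (y)-1) / (y))

int numBlocks(int n, int threadsPerBlock) {
  return DIVUP(n, threadsPerBlock);  // DIVUP(1000, 256) == 4, not 3
}
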
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index 414c7996ac..f3630e9762 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_BATCH_TRANSPOSE_H_
#define HL_BATCH_TRANSPOSE_H_
@@ -31,10 +30,7 @@ limitations under the License. */
* order. Each batch has height * width data, which are
* arranged in height-first (or row-first) manner.
*/
-extern void batchTranspose(const real* input,
- real* output,
- int width,
- int height,
- int batchSize);
+extern void batchTranspose(
+ const real* input, real* output, int width, int height, int batchSize);
#endif // HL_BATCH_TRANSPOSE_H_
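
Reference semantics for batchTranspose above, written as a plain CPU loop under the row-first layout the comment describes (a sketch, not the CUDA kernel):

typedef float real;

void batchTransposeRef(const real* input, real* output,
                       int width, int height, int batchSize) {
  for (int b = 0; b < batchSize; ++b) {
    const real* in = input + b * width * height;
    real* out = output + b * width * height;
    for (int r = 0; r < height; ++r) {
      for (int c = 0; c < width; ++c) {
        out[c * height + r] = in[r * width + c];  // transpose one sample
      }
    }
  }
}
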
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 70b5be6fda..cffaac634f 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_H_
#define HL_CNN_H_
@@ -37,15 +36,21 @@ limitations under the License. */
* @param[in] alpha
* @param[in] beta
*/
-extern void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha = 1.0f, real beta = 0.0f);
+extern void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha = 1.0f,
+ real beta = 0.0f);
/**
* @brief Expand feature to column.
@@ -65,14 +70,19 @@ extern void hl_shrink_col2feature(
* @param[out] dataCol expand data.
*
*/
-extern void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol);
+extern void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol);
/**
* @brief Maximum pool forward.
@@ -94,15 +104,21 @@ extern void hl_expand_feature2col(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Maximum pool backward.
@@ -125,20 +141,28 @@ extern void hl_maxpool_forward(
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] targetGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride);
+extern void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride);
/**
* @brief Average pool forward.
@@ -160,15 +184,21 @@ extern void hl_maxpool_backward(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Average pool backward.
@@ -189,19 +219,26 @@ extern void hl_avgpool_forward(
* @param[in] scaleA scale.
* @param[in] scaleB scale.
* @param[out] backGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride);
+extern void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride);
/**
* @brief Cross-map-response normalize forward.
@@ -218,10 +255,16 @@ extern void hl_avgpool_backward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Cross-map-response normalize backward.
@@ -240,11 +283,18 @@ extern void hl_CMRNorm_forward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Bilinear interpolation forward.
@@ -278,24 +328,24 @@ extern void hl_bilinear_forward(const real* inData,
const real ratioH,
const real ratioW);
- /**
- * @brief Bilinear interpolation backward.
- *
- * @param[out] inGrad input gradient.
- * @param[in] inImgH input image height.
- * @param[in] inImgW input image width.
- * @param[in] inputH input batchSize.
- * @param[in] inputW input image data dim.
- * @param[in] outGrad output gradient.
- * @param[in] outImgH output image height.
- * @param[in] outImgW output image width.
- * @param[in] outputH output batchSize.
- * @param[in] outputW output image data dim.
- * @param[in] numChannels number of channels.
- * @param[in] ratioH inImgH / outImgH.
- * @param[in] ratioW inImgW / outImgW.
- *
- */
+/**
+* @brief Bilinear interpolation backward.
+*
+* @param[out] inGrad input gradient.
+* @param[in] inImgH input image height.
+* @param[in] inImgW input image width.
+* @param[in] inputH input batchSize.
+* @param[in] inputW input image data dim.
+* @param[in] outGrad output gradient.
+* @param[in] outImgH output image height.
+* @param[in] outImgW output image width.
+* @param[in] outputH output batchSize.
+* @param[in] outputW output image data dim.
+* @param[in] numChannels number of channels.
+* @param[in] ratioH inImgH / outImgH.
+* @param[in] ratioW inImgW / outImgW.
+*
+*/
extern void hl_bilinear_backward(real* inGrad,
const size_t inImgH,
const size_t inImgW,
@@ -321,9 +371,13 @@ extern void hl_bilinear_backward(real* inGrad,
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
/**
* @brief MaxOut backward.
@@ -336,8 +390,12 @@ extern void hl_maxout_forward(
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
#endif /* HL_CNN_H_ */
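
The pooledH/pooledW arguments threaded through the declarations above are derived from the input shape. Assuming the usual ceil-mode convention (an assumption, not quoted from this header):

int pooledSize(int inSize, int windowSize, int stride, int padding) {
  return (inSize + 2 * padding - windowSize + stride - 1) / stride + 1;
  // e.g. pooledSize(32, 3, 2, 1) == 17
}
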
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index d763658c93..2c7d665101 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -12,18 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_H_
#define HL_CUDA_H_
-#include "hl_base.h"
#include
+#include "hl_base.h"
/**
* @brief HPPL event.
*/
-typedef struct _hl_event_st * hl_event_t;
-
+typedef struct _hl_event_st *hl_event_t;
/**
* @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
* if device is NULL, will start all GPU.
* @param[in] number number of devices.
*/
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);
/**
* @brief Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
*
* @return dest_d pointer to device memory.
*/
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);
/**
* @brief Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
*
* @return dest_h pointer to host memory.
*/
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);
/**
* @brief Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
* @param[in] stream stream id.
*/
extern void hl_memcpy_async(void *dst,
- void *src,
- size_t size,
- hl_stream_t stream);
+ void *src,
+ size_t size,
+ hl_stream_t stream);
/**
* @brief Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
*
* @return time Time between start and end in ms.
*/
-extern float hl_event_elapsed_time(hl_event_t start,
- hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
/**
* @brief Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block();
/**
* @brief Returns the last error string from a cuda runtime call.
*/
-extern const char* hl_get_device_error_string();
+extern const char *hl_get_device_error_string();
/**
* @brief Returns the last error string from a cuda runtime call.
@@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string();
*
* @see hl_get_device_last_error()
*/
-extern const char* hl_get_device_error_string(size_t err);
+extern const char *hl_get_device_error_string(size_t err);
/**
* @brief Returns the last error number.
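
A usage sketch tying the memory helpers above together. hl_free_mem_host is inferred by symmetry with hl_free_mem_device and should be treated as an assumption; the other calls appear in this header or in hl_base.h:

#include <cstddef>
#include "hl_cuda.h"

void roundTrip(size_t size) {
  void *host = hl_malloc_host(size);   // page-locked host memory
  void *dev = hl_malloc_device(size);
  hl_memcpy_async(dev, host, size, HPPL_STREAM_1);  // host -> device copy
  hl_stream_synchronize(HPPL_STREAM_1);             // wait for completion
  hl_free_mem_device(dev);
  hl_free_mem_host(host);  // assumed counterpart of hl_malloc_host
}
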
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h
index d757317eb4..db8c03c2c0 100644
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_H_
#define HL_CUDA_CUBLAS_H_
@@ -29,12 +28,8 @@ limitations under the License. */
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
@@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
/*
* @brief Matrix inverse
@@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] ldc the first dimension of C_d
*
*/
-extern void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d,
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta);
#endif /* HL_CUDA_CUBLAS_H_ */
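
Calling-convention sketch for the second hl_matrix_mul overload above, computing C = alpha * A * B + beta * C for a dimM x dimK by dimK x dimN product (an illustration, assuming device pointers):

#include "hl_cuda_cublas.h"

void gemm(real *A_d, real *B_d, real *C_d, int M, int N, int K) {
  hl_matrix_mul(A_d, HPPL_OP_N, B_d, HPPL_OP_N, C_d,
                M, N, K, /* alpha */ 1.0f, /* beta */ 0.0f);
}
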
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index f256cb54df..3a2f916210 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_H_
#define HL_CUDA_CUDNN_H_
@@ -22,7 +21,7 @@ limitations under the License. */
* hppl pooling mode
*/
typedef enum {
- HL_POOLING_MAX = 0,
+ HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
@@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdFilterAlgo backward filter algorithm.
*/
-extern void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo);
+extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo);
/**
* @brief convolution backward data(calculate input image grad data).
@@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter(
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdDataAlgo backward data algorithm.
*/
-extern void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo);
+extern void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo);
/**
* @brief convolution backward bias(calculate bias grad data).
@@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_forward(real *input,
- real *output,
+extern void hl_softmax_forward(real* input,
+ real* output,
int height,
int width);
@@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_backward(real *output_value,
- real *output_grad,
+extern void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
int width);
@@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value,
*
*/
extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar);
+ real* savedMean,
+ real* savedVar);
/**
* @brief cudnn batch norm forward.
@@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon);
/**
@@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
* @param[in] inGradDesc input tensor descriptor desc.
* @param[in] inGrad input data.
* @param[in] dBnParamDesc tensor descriptor desc.
- * bnScale, bnBias, running mean/var, save_mean/var.
+ * bnScale, bnBias, running mean/var,
+ * save_mean/var.
* @param[in] scale batch normalization scale parameter (in original
* paper scale is referred to as gamma).
* @param[in] scaleGrad batch normalization scale parameter (in original
@@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar);
+ real* savedMean,
+ real* savedInvVar);
#endif // HL_CUDA_CUDNN_H_
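
On the factor argument of hl_batch_norm_forward_training above: cuDNN treats it as an exponential-average coefficient for the running statistics. A sketch of the rule (cuDNN's documented behavior restated, not code from this patch):

typedef float real;

real updatedRunningMean(real runningMean, real batchMean, double factor) {
  return (real)((1.0 - factor) * runningMean + factor * batchMean);
}
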
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index f36c724e2d..1eb9f9ca88 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_DSO_LOADER_H_
#define HL_DSO_LOADER_H_
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h
index 65f366461c..91ce9a0678 100644
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
@@ -21,30 +20,30 @@ limitations under the License. */
/**
* sigmoid threshold minimum
*/
-#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold maximum
*/
-#define SIGMOID_THRESHOLD_MAX 13.0
+#define SIGMOID_THRESHOLD_MAX 13.0
#ifndef __NVCC__
namespace hppl {
- /*
- * forward activation
- */
- real relu(const real a);
- real sigmoid(const real a);
- real tanh(const real a);
- real linear(const real a);
-
- /*
- * backward activation
- */
- real relu(const real a, const real b);
- real sigmoid(const real a, const real b);
- real tanh(const real a, const real b);
- real linear(const real a, const real b);
+/*
+ * forward activation
+ */
+real relu(const real a);
+real sigmoid(const real a);
+real tanh(const real a);
+real linear(const real a);
+
+/*
+ * backward activation
+ */
+real relu(const real a, const real b);
+real sigmoid(const real a, const real b);
+real tanh(const real a, const real b);
+real linear(const real a, const real b);
} // namespace hppl
#ifdef __AVX__
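
The two thresholds above bound sigmoid's input so exp cannot overflow; the activation saturates instead. A self-contained sketch of how they are typically applied (the actual hppl definitions live elsewhere in the tree):

#include <cmath>

typedef float real;
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0

real sigmoidRef(const real a) {
  double x = a;
  if (x < SIGMOID_THRESHOLD_MIN) x = SIGMOID_THRESHOLD_MIN;
  if (x > SIGMOID_THRESHOLD_MAX) x = SIGMOID_THRESHOLD_MAX;
  return (real)(1.0 / (1.0 + std::exp(-x)));
}
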
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index 05039663b6..3be0df3b93 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_GPU_H_
#define HL_GPU_H_
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h
index 1f95e318a1..7e527a7902 100644
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_H_
#define HL_LSTM_H_
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index 6195e30b99..96648661e3 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_H_
#define HL_MATRIX_H_
@@ -30,13 +29,8 @@ limitations under the License. */
* @param[in] beta scalar used for addition.
*
*/
-extern void hl_matrix_add(real* A_d,
- real* B_d,
- real* C_d,
- int dimM,
- int dimN,
- real alpha,
- real beta);
+extern void hl_matrix_add(
+ real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
/**
* @brief Matrix Softmax.
*
@@ -46,7 +40,7 @@ extern void hl_matrix_add(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
+extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix softmax derivative.
@@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN);
+extern void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
/**
* @brief Sequence softmax.
@@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d,
* @param[in] numSequence sequence number.
*
*/
-extern void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+extern void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence);
@@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy.
@@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy back propagation.
@@ -120,11 +105,8 @@ extern void hl_matrix_cross_entropy(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy
@@ -135,11 +117,8 @@ extern void hl_matrix_cross_entropy_bp(real* grad_d,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy backprop
@@ -150,11 +129,8 @@ extern void hl_matrix_multi_binary_cross_entropy(real* output,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix zero memory.
@@ -176,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num);
* @param[in] partial_sum
*/
-extern void hl_param_relu_forward(real* output,
- real* input,
- real* w,
- int width,
- int height,
- int partial_sum);
+extern void hl_param_relu_forward(
+ real* output, real* input, real* w, int width, int height, int partial_sum);
/**
* @brief parameter relu backward w
*
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 46d86b2982..bb5124df44 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_H_
#define HL_SEQUENCE_H_
@@ -32,7 +31,7 @@ limitations under the License. */
extern void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim);
@@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input,
* @param[in] dim input dimension.
*
*/
-extern void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim);
+extern void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
@@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad,
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
* @param[in] isPadding trainable padding.
*
*/
@@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad,
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
@@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+extern void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
@@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+extern void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
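
Reference semantics for hl_sequence2batch_copy above, assuming batchIndex maps each batch row to its source row in the sequence layout (a CPU sketch based on the parameter comments, not the kernel):

typedef float real;

void sequence2batchRef(real* batch, real* sequence, const int* batchIndex,
                       int seqWidth, int batchCount, bool seq2batch) {
  for (int i = 0; i < batchCount; ++i) {
    real* batchRow = batch + i * seqWidth;
    real* seqRow = sequence + batchIndex[i] * seqWidth;
    for (int j = 0; j < seqWidth; ++j) {
      if (seq2batch) {
        batchRow[j] = seqRow[j];  // gather sequence rows into batch order
      } else {
        seqRow[j] = batchRow[j];  // scatter batch rows back to sequence order
      }
    }
  }
}
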
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index 9acdebdebf..c4e0be23e2 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_H_
#define HL_SPARSE_H_
@@ -31,7 +30,7 @@ limitations under the License. */
*/
extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
*/
extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
@@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d,
* @note transb does not support HPPL_OP_T.
*
*/
-extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
* @note transa does not support HPPL_OP_T.
*
*/
-extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+extern void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief Memcpy csc_matrix to host.
@@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream);
-
/**
* @brief A_d[j] += B_d[i,j] for i in range(height)
*
@@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
* @param[in] scale scale of B_d
*
*/
-extern void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_column_sum
*/
-extern void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief A_d[i,j] += B_d[j]
@@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d,
*
*/
extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
*/
extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
@@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
*
*/
extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
*/
extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
* @return return rows pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
/**
* @brief get cols pointer of GpuSparseMatrix
@@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
* @return return cols pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
/**
* @brief get value pointer of GpuSparseMatrix
@@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
* @return return value pointer, which is gpu address
*
*/
-extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
+extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
#endif /* HL_SPARSE_H_ */
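
Reference for the column-sum declared above, A_d[j] += scale * B_d[i][j] summed over rows. For a CSR matrix only the stored values and their column ids are needed (a sketch under standard CSR conventions):

typedef float real;

void csrColumnSumRef(real* A, const real* vals, const int* cols,
                     int nnz, real scale) {
  for (int k = 0; k < nnz; ++k) {
    A[cols[k]] += scale * vals[k];  // every stored entry lands in its column
  }
}
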
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h
index 3c9428e925..b4ac83a66a 100644
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TABLE_APPLY_H_
#define HL_TABLE_APPLY_H_
@@ -31,8 +30,10 @@ limitations under the License. */
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_select_rows(real* output, int ldo,
- real* table, int ldt,
+extern void hl_matrix_select_rows(real* output,
+ int ldo,
+ real* table,
+ int ldt,
int* ids,
int numSamples,
int tableSize,
@@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo,
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_add_to_rows(real* table, int ldt,
- real* input, int ldi,
+extern void hl_matrix_add_to_rows(real* table,
+ int ldt,
+ real* input,
+ int ldi,
int* ids,
int numSamples,
int tableSize,
@@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt,
*
*/
template <class T>
-extern void hl_vector_select_from(T* dst, int sized,
- const T* src, int sizes,
- const int* ids, int sizei);
+extern void hl_vector_select_from(
+ T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
-#endif /* HL_TABLE_APPLY_H_ */
+#endif /* HL_TABLE_APPLY_H_ */
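
hl_matrix_select_rows above is an embedding-style gather: row i of the output is row ids[i] of the table. A CPU reference with the same ldo/ldt leading-dimension convention (a sketch, not the kernel):

typedef float real;

void selectRowsRef(real* output, int ldo, const real* table, int ldt,
                   const int* ids, int numSamples, int dim) {
  for (int i = 0; i < numSamples; ++i) {
    for (int j = 0; j < dim; ++j) {
      output[i * ldo + j] = table[ids[i] * ldt + j];  // gather one row
    }
  }
}
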
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index 4414b0b2d2..b0a88c66a1 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TIME_H_
#define HL_TIME_H_
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index a38d4cf862..e8cfebbf6a 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TOP_K_H_
#define HL_TOP_K_H_
@@ -31,9 +30,11 @@ limitations under the License. */
* @param[in] numSamples height of input value.
*
*/
-extern void hl_matrix_top_k(real* topVal, int ldv,
- int * topIds,
- real* src, int lds,
+extern void hl_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
+ real* src,
+ int lds,
int dim,
int beamSize,
int numSamples);
@@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv,
*
* @note Only support HL_SPARSE_CSR format.
*/
-extern void hl_sparse_matrix_top_k(real* topVal, int ldv,
- int * topIds,
+extern void hl_sparse_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples);
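
What hl_matrix_top_k computes for one row of src: the beamSize largest values and their indices. A standard-library reference for a single row (a sketch; the kernel batches this over numSamples rows):

#include <algorithm>
#include <numeric>
#include <vector>

typedef float real;

void topKRowRef(real* topVal, int* topIds, const real* row,
                int dim, int beamSize) {
  std::vector<int> idx(dim);
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + beamSize, idx.end(),
                    [&](int a, int b) { return row[a] > row[b]; });
  for (int k = 0; k < beamSize; ++k) {
    topIds[k] = idx[k];
    topVal[k] = row[idx[k]];
  }
}
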
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h
index 4c0c68f3c9..bb53fc581e 100644
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
@@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_STUB_H_
#define HL_AGGREGATE_STUB_H_
#include "hl_aggregate.h"
-inline void hl_matrix_row_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c6f32ad337..2f73b9671e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -12,84 +12,134 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_STUB_H_
#define HL_CNN_STUB_H_
#include "hl_cnn.h"
-inline void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha, real beta) {}
-
-inline void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol) {}
-
-inline void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride) {}
-
-inline void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride) {}
-
-inline void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
-
-inline void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
+inline void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha,
+ real beta) {}
+
+inline void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol) {}
+
+inline void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride) {}
+
+inline void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride) {}
+
+inline void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
+
+inline void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
inline void hl_bilinear_forward(const real* inData,
const size_t inImgH,
@@ -106,25 +156,33 @@ inline void hl_bilinear_forward(const real* inData,
const real ratioW) {}
inline void hl_bilinear_backward(real* inGrad,
- const size_t inImgH,
- const size_t inImgW,
- const size_t inputH,
- const size_t inputW,
- const real* outGrad,
- const size_t outImgH,
- const size_t outImgW,
- const size_t outputH,
- const size_t outputW,
- const size_t numChannels,
- const real ratioH,
- const real ratioW) {}
-
-inline void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
-
-inline void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
+ const size_t inImgH,
+ const size_t inImgW,
+ const size_t inputH,
+ const size_t inputW,
+ const real* outGrad,
+ const size_t outImgH,
+ const size_t outImgW,
+ const size_t outputH,
+ const size_t outputW,
+ const size_t numChannels,
+ const real ratioH,
+ const real ratioW) {}
+
+inline void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
+
+inline void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
#endif // HL_CNN_STUB_H_
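These hl_*_stub.h headers give every GPU routine an empty inline body so CPU-only builds of Paddle still compile and link; this commit only reflows their parameter lists. A minimal sketch of the pattern (file and function names here are illustrative, not from the tree):

// Sketch of the stub pattern (hypothetical names):
typedef float real;  // paddle selects float or double at build time

// hl_example.h would declare:   extern void hl_scale(real* data, int n);
// hl_example_stub.h, used for CPU-only builds, supplies a linkable no-op:
inline void hl_scale(real* data, int n) {}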
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
index 903dcbe835..85f7c390c4 100644
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
@@ -12,41 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_STUB_H_
#define HL_CUDA_CUBLAS_STUB_H_
#include "hl_cuda_cublas.h"
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
- real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {}
+inline void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_matrix_inverse(
+ real *A_d, real *C_d, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
+ real *C_d,
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
#endif // HL_CUDA_CUBLAS_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index b96804afd8..3beb0e5b51 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_STUB_H_
#define HL_CUDA_CUDNN_STUB_H_
#include "hl_cuda_cudnn.h"
-inline int hl_get_cudnn_lib_version() {
- return 0;
-}
+inline int hl_get_cudnn_lib_version() { return 0; }
inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
@@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input,
hl_pooling_descriptor pooling) {}
inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
- int input_feature_maps,
- int output_feature_maps,
- int height,
- int width) {}
+ int input_feature_maps,
+ int output_feature_maps,
+ int height,
+ int width) {}
inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
inline void hl_conv_workspace(hl_tensor_descriptor input,
- hl_tensor_descriptor output,
- hl_filter_descriptor filter,
- hl_convolution_descriptor conv,
- int* convFwdAlgo,
- size_t* fwdLimitBytes,
- int* convBwdDataAlgo,
- size_t* bwdDataLimitBytes,
- int* convBwdFilterAlgo,
- size_t* bwdFilterLimitBytes) {}
+ hl_tensor_descriptor output,
+ hl_filter_descriptor filter,
+ hl_convolution_descriptor conv,
+ int* convFwdAlgo,
+ size_t* fwdLimitBytes,
+ int* convBwdDataAlgo,
+ size_t* bwdDataLimitBytes,
+ int* convBwdFilterAlgo,
+ size_t* bwdFilterLimitBytes) {}
inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data,
@@ -116,86 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input,
int convFwdAlgo) {}
inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
- real* bias_data,
- hl_tensor_descriptor output,
- real* output_data) {}
-
-inline void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo) {}
+ real* bias_data,
+ hl_tensor_descriptor output,
+ real* output_data) {}
+
+inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo) {}
+
+inline void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo) {}
inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
- real* bias_grad_data,
- hl_tensor_descriptor output,
- real* output_grad_data) {}
+ real* bias_grad_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data) {}
-inline void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width) {}
-
-inline void hl_softmax_backward(real *output_value,
- real *output_grad,
+inline void hl_softmax_forward(real* input,
+ real* output,
int height,
int width) {}
+inline void hl_softmax_backward(real* output_value,
+ real* output_grad,
+ int height,
+ int width) {}
+
inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {}
+ real* savedMean,
+ real* savedVar) {}
inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon) {}
inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {}
+ real* savedMean,
+ real* savedInvVar) {}
#endif // HL_CUDA_CUDNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index fa7904421d..24923a0d4a 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_STUB_H_
#define HL_CUDA_STUB_H_
@@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {}
inline void hl_init(int device) {}
-inline int hl_get_cuda_lib_version(int device) {
- return 0;
-}
+inline int hl_get_cuda_lib_version(int device) { return 0; }
inline void hl_fini() {}
inline void hl_set_sync_flag(bool flag) {}
-inline bool hl_get_sync_flag() {
- return false;
-}
+inline bool hl_get_sync_flag() { return false; }
-inline int hl_get_device_count() { return 0; }
+inline int hl_get_device_count() { return 0; }
inline void hl_set_device(int device) {}
-inline int hl_get_device() { return 0; }
+inline int hl_get_device() { return 0; }
-inline void* hl_malloc_device(size_t size) { return NULL; }
+inline void *hl_malloc_device(size_t size) { return NULL; }
inline void hl_free_mem_device(void *dest_d) {}
-inline void* hl_malloc_host(size_t size) { return NULL; }
+inline void *hl_malloc_host(size_t size) { return NULL; }
inline void hl_free_mem_host(void *dest_h) {}
@@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {}
inline void hl_srand(unsigned int seed) {}
-inline void hl_memcpy_async(void *dst, void *src, size_t size,
+inline void hl_memcpy_async(void *dst,
+ void *src,
+ size_t size,
hl_stream_t stream) {}
inline void hl_stream_synchronize(hl_stream_t stream) {}
@@ -83,11 +80,11 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
inline void hl_event_synchronize(hl_event_t event) {}
-inline int hl_get_device_last_error() { return 0; }
+inline int hl_get_device_last_error() { return 0; }
-inline const char* hl_get_device_error_string() { return NULL; }
+inline const char *hl_get_device_error_string() { return NULL; }
-inline const char* hl_get_device_error_string(size_t err) { return NULL; }
+inline const char *hl_get_device_error_string(size_t err) { return NULL; }
inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h
index 2700bef02a..7ccda032d2 100644
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_STUB_H_
#define HL_LSTM_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 76cac2e577..1bd78d23fb 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_STUB_H_
#define HL_MATRIX_STUB_H_
@@ -26,48 +25,30 @@ inline void hl_matrix_add(real* A_d,
real alpha,
real beta) {}
-inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {}
+inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
-inline void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+inline void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence) {}
-inline void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
+inline void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
+
+inline void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
inline void hl_matrix_zero_mem(real* data, int num) {}
@@ -101,7 +82,6 @@ inline void hl_cossim(real* output,
int input2_height,
real scale) {}
-
inline void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index aabd956c37..381f0a6f26 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_STUB_H_
#define HL_SEQUENCE_STUB_H_
@@ -21,15 +20,12 @@ limitations under the License. */
inline void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim) {}
-inline void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim) {}
+inline void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
@@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
int contextStart,
int beginPad) {}
-inline void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+inline void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
-inline void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+inline void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h
index 346a1900dd..d47bdd2c47 100644
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_STUB_H_
#define HL_SPARSE_STUB_H_
@@ -20,7 +19,7 @@ limitations under the License. */
inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_matrix_dense_mul_csc(real *A_d,
hl_trans_op_t transa,
@@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d,
real alpha,
real beta) {}
-inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
-inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+inline void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_memcpy_from_csc_matrix(real *csc_val,
size_t val_size,
@@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream) {}
-inline void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-inline void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
-inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
-inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
-inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
return NULL;
}
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h
index 2922d4dc29..2412ed5abc 100644
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
@@ -32,32 +32,35 @@
#include <immintrin.h>
/* yes I know, the top of this file is quite ugly */
-# define ALIGN32_BEG
-# define ALIGN32_END __attribute__((aligned(32)))
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
/* __m128 is ugly to write */
-typedef __m256 v8sf; // vector of 8 float (avx)
-typedef __m256i v8si; // vector of 8 int (avx)
-typedef __m128i v4si; // vector of 8 int (avx)
+typedef __m256 v8sf; // vector of 8 float (avx)
+typedef __m256i v8si; // vector of 8 int (avx)
+typedef __m128i v4si;   // vector of 4 int (sse)
-#define _PI32AVX_CONST(Name, Val) \
- static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+#define _PI32AVX_CONST(Name, Val) \
+ static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
+ Val, Val, Val, Val}
_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
-
/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val) \
- static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PI32_CONST256(Name, Val) \
- static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PS256_CONST_TYPE(Name, Type, Val) \
- static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-
-_PS256_CONST(1 , 1.0f);
+#define _PS256_CONST(Name, Val) \
+ static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val) \
+ static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val) \
+ static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
@@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);
@@ -94,50 +97,51 @@ typedef union imm_xmm_union {
v4si xmm[2];
} imm_xmm_union;
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
- imm_xmm_union u __attribute__((aligned(32))); \
- u.imm = imm_; \
- xmm0_ = u.xmm[0]; \
- xmm1_ = u.xmm[1]; \
-}
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
+ { \
imm_xmm_union u __attribute__((aligned(32))); \
- u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
+ u.imm = imm_; \
+ xmm0_ = u.xmm[0]; \
+ xmm1_ = u.xmm[1]; \
}
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
+ { \
+ imm_xmm_union u __attribute__((aligned(32))); \
+ u.xmm[0] = xmm0_; \
+ u.xmm[1] = xmm1_; \
+ imm_ = u.imm; \
+ }
-#define AVX2_BITOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, int a) \
-{ \
- /* use SSE2 instruction to perform the bitop AVX2 */ \
- v4si x1, x2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- x1 = _mm_##fn(x1,a); \
- x2 = _mm_##fn(x2,a); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_BITOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, int a) { \
+ /* use SSE2 instruction to perform the bitop AVX2 */ \
+ v4si x1, x2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ x1 = _mm_##fn(x1, a); \
+ x2 = _mm_##fn(x2, a); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
-#define AVX2_INTOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, v8si y) \
-{ \
- /* use SSE2 instructions to perform the AVX2 integer operation */ \
- v4si x1, x2; \
- v4si y1, y2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- COPY_IMM_TO_XMM(y, y1, y2); \
- x1 = _mm_##fn(x1,y1); \
- x2 = _mm_##fn(x2,y2); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_INTOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \
+ /* use SSE2 instructions to perform the AVX2 integer operation */ \
+ v4si x1, x2; \
+ v4si y1, y2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ COPY_IMM_TO_XMM(y, y1, y2); \
+ x1 = _mm_##fn(x1, y1); \
+ x2 = _mm_##fn(x2, y2); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
@@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32)
#define avx2_mm256_add_epi32 _mm256_add_epi32
#endif /* __AVX2__ */
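The AVX2_BITOP_USING_SSE2 / AVX2_INTOP_USING_SSE2 macros above emulate 256-bit integer ops on AVX-only CPUs: the imm_xmm_union spills a v8si to memory, the op runs on each 128-bit half with SSE2, and the halves are reassembled. The same split/compute/recombine can also be written with cast/extract intrinsics instead of the union; a standalone sketch:

#include <immintrin.h>

// Shift eight 32-bit lanes left by `a` using only SSE2 shifts, on a CPU
// with AVX but no AVX2 -- the same split/compute/recombine idea as the
// macros, written with cast/extract intrinsics instead of a union.
static inline __m256i shl_epi32_via_sse2(__m256i x, int a) {
  __m128i lo = _mm256_castsi256_si128(x);       // lanes 0..3
  __m128i hi = _mm256_extractf128_si256(x, 1);  // lanes 4..7
  lo = _mm_slli_epi32(lo, a);                   // SSE2 op on each half
  hi = _mm_slli_epi32(hi, a);
  __m256i r = _mm256_castsi128_si256(lo);       // upper half undefined...
  return _mm256_insertf128_si256(r, hi, 1);     // ...until repacked here
}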
-
-/* natural logarithm computed for 8 simultaneous float
+/* natural logarithm computed for 8 simultaneous floats;
return NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+ // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */
+ x = _mm256_max_ps(
+ x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
- x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+ x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
// this is again another AVX2 instruction
- imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
- /* part2:
+ /* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
- //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
- v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+ v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
x = _mm256_add_ps(x, tmp);
- v8sf z = _mm256_mul_ps(x,x);
+ v8sf z = _mm256_mul_ps(x, x);
- v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+ v8sf y = *(v8sf *)_ps256_cephes_log_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
-
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
- y = _mm256_add_ps(y, tmp);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+ y = _mm256_add_ps(y, tmp);
- tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+ tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
- x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+ x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
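For reference, the scalar algorithm that log256_ps vectorizes (a paraphrase of the cephes routine, not code from the tree): split x into mantissa m in [0.5, 1) and exponent e, evaluate a degree-8 polynomial in m - 1, then add e * ln(2) back in two parts (q2 + q1) for precision:

#include <cmath>

// Scalar outline of the cephes log above (illustrative; the SIMD code
// evaluates the same polynomial with Horner's scheme on 8 lanes).
float log_cephes(float x) {
  if (x <= 0.f) return NAN;  // the SIMD version returns NaN via a mask
  int e;
  float m = std::frexp(x, &e);              // x = m * 2^e, m in [0.5, 1)
  if (m < 0.70710678f) { m += m; e -= 1; }  // SQRTHF branch keeps m near 1
  m -= 1.0f;
  // degree-8 minimax polynomial (coefficients _ps256_cephes_log_p0..p8)
  float p = 7.0376836292e-2f;
  const float c[] = {-1.1514610310e-1f, 1.1676998740e-1f, -1.2420140846e-1f,
                     1.4249322787e-1f,  -1.6668057665e-1f, 2.0000714765e-1f,
                     -2.4999993993e-1f, 3.3333331174e-1f};
  for (float k : c) p = p * m + k;
  float y = p * m * (m * m) - 0.5f * (m * m) + m;      // poly + -z/2 term
  return y + e * 0.693359375f + e * -2.12194440e-4f;   // e*ln2 as q2 + q1
}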
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
@@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+ x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+ x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
- fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
- fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+ fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+ fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
/* how to perform a floorf with SSE: just below */
- //imm0 = _mm256_cvttps_epi32(fx);
- //tmp = _mm256_cvtepi32_ps(imm0);
-
+ // imm0 = _mm256_cvttps_epi32(fx);
+ // tmp = _mm256_cvtepi32_ps(imm0);
+
tmp = _mm256_floor_ps(fx);
/* if greater, subtract 1 */
- //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
- v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+ // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+ v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
- v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+ tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+ v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
- z = _mm256_mul_ps(x,x);
-
- v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+ z = _mm256_mul_ps(x, x);
+
+ v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
@@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
- imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
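exp256_ps follows the range reduction spelled out in its comments: write x = g + n*log(2) with n = round(x / ln 2), evaluate a degree-5 polynomial for exp(g) on the small remainder, then multiply by 2^n, built by shifting n into the float exponent field. A scalar sketch of the same steps (p5 is quoted in the hunk above; p0..p4 are the standard cephes values, stated here from memory):

#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar outline of exp256_ps's range reduction (illustrative).
float exp_cephes(float x) {
  x = std::fmin(x, 88.3762626647949f);   // exp_hi / exp_lo clamps
  x = std::fmax(x, -88.3762626647949f);
  float n = std::floor(x * 1.44269504088896341f + 0.5f);  // round(x/ln2)
  // remove n*ln2 in two parts (C1 + C2) so the remainder stays accurate
  float g = x - n * 0.693359375f - n * -2.12194440e-4f;
  // degree-5 polynomial for exp(g), coefficients cephes_exp_p0..p5
  float y = 1.9875691500e-4f;
  const float c[] = {1.3981999507e-3f, 8.3334519073e-3f, 4.1665795894e-2f,
                     1.6666665459e-1f, 5.0000001201e-1f};
  for (float k : c) y = y * g + k;
  y = y * (g * g) + g + 1.0f;
  // build 2^n by placing n + 127 into the exponent bits
  int32_t bits = (static_cast<int32_t>(n) + 0x7f) << 23;
  float pow2n;
  std::memcpy(&pow2n, &bits, sizeof(bits));
  return y * pow2n;
}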
@@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at once using AVX intrinsics
@@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
surprising but correct result.
*/
-v8sf sin256_ps(v8sf x) { // any x
+v8sf sin256_ps(v8sf x) { // any x
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
v8si imm0, imm2;
@@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit = x;
/* take the absolute value */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* extract the sign bit (upper one) */
- sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
-
+ sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
/* scale by 4/Pi */
- y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+ y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
- /*
- Here we start a series of integer operations, which are in the
- realm of AVX2.
- If we don't have AVX, let's perform them using SSE2 directives
- */
+/*
+ Here we start a series of integer operations, which are in the
+ realm of AVX2.
+ If we don't have AVX, let's perform them using SSE2 directives
+*/
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
- imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
- imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+ imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+ imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
- imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+ imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
- /* get the polynom selection mask
+ /* get the polynomial selection mask
there is one polynomial for 0 <= x <= Pi/4
and another one for Pi/4 < x <= Pi/2 */
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
#include <immintrin.h>
#include "hl_functions.h"
namespace hppl {
- extern __m256 exp(__m256 a);
+extern __m256 exp(__m256 a);
- __m256 relu(const __m256 a) {
- __m256 tmp = _mm256_set1_ps(0.0f);
- return _mm256_max_ps(a, tmp);
- }
+__m256 relu(const __m256 a) {
+ __m256 tmp = _mm256_set1_ps(0.0f);
+ return _mm256_max_ps(a, tmp);
+}
- __m256 sigmoid(const __m256 a) {
- __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
- __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
- __m256 tmp = _mm256_max_ps(a, min);
- tmp = _mm256_min_ps(tmp, max);
- tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
- tmp = exp(tmp);
- tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
- tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
- return tmp;
- }
+__m256 sigmoid(const __m256 a) {
+ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+ __m256 tmp = _mm256_max_ps(a, min);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
+ tmp = exp(tmp);
+ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
+ tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
+ return tmp;
+}
- __m256 tanh(const __m256 a) {
- __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
- __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
- tmp = _mm256_min_ps(tmp, max);
- tmp = exp(tmp);
- return _mm256_sub_ps(
- _mm256_div_ps(_mm256_set1_ps(2.0f),
- _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f));
- }
+__m256 tanh(const __m256 a) {
+ __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
+ __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = exp(tmp);
+ return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
+ _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
+ _mm256_set1_ps(1.0f));
+}
- __m256 linear(const __m256 a) {
- return a;
- }
+__m256 linear(const __m256 a) { return a; }
- __m256 relu(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
+__m256 relu(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a,
_mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
- _mm256_set1_ps(1.0f)));
- }
+ _mm256_set1_ps(1.0f)));
+}
- __m256 sigmoid(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(_mm256_mul_ps(a, b),
- _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
- }
+__m256 sigmoid(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(_mm256_mul_ps(a, b),
+ _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
+}
- __m256 tanh(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
- _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
- }
+__m256 tanh(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
+}
- __m256 linear(const __m256 a, const __m256 b) {
- return a;
- }
+__m256 linear(const __m256 a, const __m256 b) { return a; }
} // namespace hppl
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index b8352c2d53..af00f352e5 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -12,46 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <math.h>
#include "hl_functions.h"
namespace hppl {
- real relu(const real a) {
- return a > 0.0f ? a : 0.0f;
- }
-
- real sigmoid(const real a) {
- const real min = SIGMOID_THRESHOLD_MIN;
- const real max = SIGMOID_THRESHOLD_MAX;
- real tmp = (a < min) ? min : ((a > max) ? max : a);
- return 1.0 / (1.0 + exp(-tmp));
- }
-
- real tanh(const real a) {
- real tmp = -2.0 * a;
- tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
- return (2.0 / (1.0 + exp(tmp))) - 1.0;
- }
-
- real linear(const real a) {
- return a;
- }
-
- real relu(const real a, const real b) {
- return a * (b > 0.0f ? 1.0f : 0.0f);
- }
-
- real sigmoid(const real a, const real b) {
- return a * b * (1 - b);
- }
-
- real tanh(const real a, const real b) {
- return a * (1.0f - b * b);
- }
-
- real linear(const real a, const real b) {
- return a;
- }
+real relu(const real a) { return a > 0.0f ? a : 0.0f; }
+
+real sigmoid(const real a) {
+ const real min = SIGMOID_THRESHOLD_MIN;
+ const real max = SIGMOID_THRESHOLD_MAX;
+ real tmp = (a < min) ? min : ((a > max) ? max : a);
+ return 1.0 / (1.0 + exp(-tmp));
+}
+
+real tanh(const real a) {
+ real tmp = -2.0 * a;
+ tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+ return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+real linear(const real a) { return a; }
+
+real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
+
+real sigmoid(const real a, const real b) { return a * b * (1 - b); }
+
+real tanh(const real a, const real b) { return a * (1.0f - b * b); }
+
+real linear(const real a, const real b) { return a; }
} // namespace hppl
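Reading the pairs above: the one-argument overloads are the forward activations, and the two-argument overloads are their backward forms, computing grad_out * f'(x) with the derivative expressed through the saved forward output b (sigmoid(a, b) = a * b * (1 - b), tanh(a, b) = a * (1 - b^2)). A usage sketch under that reading, assuming hl_functions.h declares the hppl overloads as the include above suggests:

#include "hl_functions.h"  // hppl::tanh overloads and the `real` typedef

// Backward pass through one tanh unit: reuse the saved forward output
// so the derivative never recomputes tanh.
real tanh_backward(real grad_out, real saved_out) {
  // grad_out * (1 - saved_out^2) via the two-argument overload
  return hppl::tanh(grad_out, saved_out);
}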
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index f16376ec93..e8ba232d44 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <sys/time.h>
#include <mutex>
#include "hl_cuda.h"
@@ -24,7 +23,7 @@ limitations under the License. */
namespace dynload {
std::once_flag cublas_dso_flag;
-void* cublas_dso_handle = nullptr;
+void *cublas_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -34,31 +33,30 @@ void* cublas_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- typedef cublasStatus_t (*cublasFunc)(Args...); \
- std::call_once(cublas_dso_flag, GetCublasDsoHandle, \
- &cublas_dso_handle); \
- void* p_##__name = dlsym(cublas_dso_handle, #__name); \
- return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ typedef cublasStatus_t (*cublasFunc)(Args...); \
+ std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
+ void *p_##__name = dlsym(cublas_dso_handle, #__name); \
+ return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
+ } \
} __name; // struct DynLoad__##__name
#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
} __name; // struct DynLoad__##__name
#endif
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
- DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
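The DYNAMIC_LOAD_CUBLAS_WRAP machinery reflowed here generates, per cuBLAS symbol, a functor that lazily resolves the function on first call. Stripped of the macro, the generated struct looks roughly like this; the globals come from the top of this file, and the GetCublasDsoHandle signature is assumed from its usage:

#include <cublas_v2.h>
#include <dlfcn.h>
#include <mutex>

// Declared earlier in this file:
extern std::once_flag cublas_dso_flag;
extern void *cublas_dso_handle;
void GetCublasDsoHandle(void **handle);  // loader; signature assumed

// What DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemm) expands to, approximately
// (the macro also declares an instance named cublasSgemm):
struct DynLoad__cublasSgemm {
  template <typename... Args>
  cublasStatus_t operator()(Args... args) {
    typedef cublasStatus_t (*cublasFunc)(Args...);
    // load libcublas exactly once; afterwards the cached handle is reused
    std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle);
    void *p = dlsym(cublas_dso_handle, "cublasSgemm");
    return reinterpret_cast<cublasFunc>(p)(args...);
  }
};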
// include all needed cublas functions in HPPL
+// clang-format off
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSgemv) \
__macro(cublasDgemv) \
@@ -88,41 +86,41 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */
-
+// clang-format on
#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
+#define CUBLAS_GEAM dynload::cublasSgeam
+#define CUBLAS_GEMV dynload::cublasSgemv
+#define CUBLAS_GEMM dynload::cublasSgemm
+#define CUBLAS_GETRF dynload::cublasSgetrfBatched
+#define CUBLAS_GETRI dynload::cublasSgetriBatched
#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
+#define CUBLAS_GEAM dynload::cublasDgeam
+#define CUBLAS_GEMV dynload::cublasDgemv
+#define CUBLAS_GEMM dynload::cublasDgemm
+#define CUBLAS_GETRF dynload::cublasDgetrfBatched
+#define CUBLAS_GETRI dynload::cublasDgetriBatched
#endif
-const char* hl_cublas_get_error_string(cublasStatus_t status) {
- switch(status) {
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "[cublas status]: not initialized";
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "[cublas status]: allocate failed";
- case CUBLAS_STATUS_INVALID_VALUE:
- return "[cublas status]: invalid value";
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "[cublas status]: arch mismatch";
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "[cublas status]: mapping error";
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "[cublas status]: execution failed";
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "[cublas status]: internal error";
- case CUBLAS_STATUS_SUCCESS:
- return "[cublas status]: success";
- default:
- return "[cublas status]: unknown error";
+const char *hl_cublas_get_error_string(cublasStatus_t status) {
+ switch (status) {
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "[cublas status]: not initialized";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "[cublas status]: allocate failed";
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "[cublas status]: invalid value";
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "[cublas status]: arch mismatch";
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "[cublas status]: mapping error";
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "[cublas status]: execution failed";
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "[cublas status]: internal error";
+ case CUBLAS_STATUS_SUCCESS:
+ return "[cublas status]: success";
+ default:
+ return "[cublas status]: unknown error";
}
}
@@ -131,27 +129,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) {
* support the << operator for more detailed error info.
*/
cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func) \
- g_cublasStat = cublas_func; \
- CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
- << "Cublas Error: " \
- << hl_cublas_get_error_string(g_cublasStat) \
- << " "
+#define CHECK_CUBLAS(cublas_func) \
+ g_cublasStat = cublas_func; \
+ CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
+ << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
- << "[cublas init] Cublas create handle faild!";
+ << "[cublas init] Cublas create handle faild!";
CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
- << "[cublas init] Cublas set stream faild!";
+ << "[cublas init] Cublas set stream faild!";
}
-void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {
+void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
real alpha = 1.0;
real beta = 0.0;
@@ -159,11 +151,18 @@ void hl_matrix_transpose(real *A_d,
CHECK_NOTNULL(C_d);
CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
- CUBLAS_OP_T, CUBLAS_OP_N,
- dimM, dimN,
- &alpha, A_d, lda,
- &beta, nullptr, dimM,
- C_d, ldc));
+ CUBLAS_OP_T,
+ CUBLAS_OP_N,
+ dimM,
+ dimN,
+ &alpha,
+ A_d,
+ lda,
+ &beta,
+ nullptr,
+ dimM,
+ C_d,
+ ldc));
CHECK_SYNC("hl_matrix_transpose failed");
}
@@ -181,21 +180,20 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *));
- int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
+ int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of
small-sized matrices. There may be a better way to reconstruct
the API for better performance.
*/
- CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
- dimN, inout_d, lda, pivot_d,
- info_d, 1));
+ CHECK_CUBLAS(
+ CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
- int info_h;
+ int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
}
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
@@ -204,27 +202,40 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
- dimN, (const real **)inout_d, lda, pivot_d,
- out_d, ldc, info_d, 1));
+ dimN,
+ (const real **)inout_d,
+ lda,
+ pivot_d,
+ out_d,
+ ldc,
+ info_d,
+ 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
}
hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d);
-
+
CHECK_SYNC("hl_matrix_inverse failed");
}
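The note inside hl_matrix_inverse is worth spelling out: cuBLAS exposes inversion only through its batched LU interface, so inverting a single N x N matrix means building a device-side array holding one pointer, factorizing with cublasSgetrfBatched, then back-substituting with cublasSgetriBatched at batch size 1. A minimal standalone sketch of that call sequence (single precision, error handling trimmed to asserts):

#include <cassert>
#include <cublas_v2.h>
#include <cuda_runtime.h>

// Invert one n x n matrix A_d (device, column-major) into C_d via the
// batched LU API with batchSize = 1, mirroring hl_matrix_inverse above.
void invert_via_batched_lu(cublasHandle_t handle, float *A_d, float *C_d,
                           int n) {
  float **Aarr_d, **Carr_d;  // device-side arrays of device pointers
  int *pivot_d, *info_d;
  cudaMalloc(&Aarr_d, sizeof(float *));
  cudaMalloc(&Carr_d, sizeof(float *));
  cudaMalloc(&pivot_d, n * sizeof(int));
  cudaMalloc(&info_d, sizeof(int));
  cudaMemcpy(Aarr_d, &A_d, sizeof(float *), cudaMemcpyHostToDevice);
  cudaMemcpy(Carr_d, &C_d, sizeof(float *), cudaMemcpyHostToDevice);

  // Step 1: in-place LU factorization (A_d is overwritten).
  cublasStatus_t st =
      cublasSgetrfBatched(handle, n, Aarr_d, n, pivot_d, info_d, 1);
  assert(st == CUBLAS_STATUS_SUCCESS);
  // Step 2: back-substitute the LU factors; the inverse lands in C_d.
  st = cublasSgetriBatched(handle, n, (const float **)Aarr_d, n, pivot_d,
                           Carr_d, n, info_d, 1);
  assert(st == CUBLAS_STATUS_SUCCESS);

  int info_h;  // nonzero info means a zero pivot, i.e. a singular matrix
  cudaMemcpy(&info_h, info_d, sizeof(int), cudaMemcpyDeviceToHost);
  assert(info_h == 0);

  cudaFree(Aarr_d);
  cudaFree(Carr_d);
  cudaFree(pivot_d);
  cudaFree(info_d);
}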
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -232,8 +243,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
int m = (transa == HPPL_OP_N) ? dimM : dimK;
int n = (transa == HPPL_OP_N) ? dimK : dimM;
- hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n,
- alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul_vector(
+ A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
return;
}
@@ -241,8 +252,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
int m = (transb == HPPL_OP_N) ? dimK : dimN;
int n = (transb == HPPL_OP_N) ? dimN : dimK;
hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
- hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n,
- alpha, beta, ldb, 1, 1);
+ hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
return;
}
@@ -251,26 +261,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -278,24 +309,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_mul failed");
}
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {
int lda = (HPPL_OP_N == transa) ? dimK : dimM;
int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
int ldc = dimN;
- hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN,
- dimK, alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul(A_d,
+ transa,
+ B_d,
+ transb,
+ C_d,
+ dimM,
+ dimN,
+ dimK,
+ alpha,
+ beta,
+ lda,
+ ldb,
+ ldc);
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc) {
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -304,21 +357,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
if (HPPL_OP_N == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_T,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else if (HPPL_OP_T == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_N,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -327,10 +388,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
CHECK_SYNC("hl_matrix_mul_vector");
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta) {
- hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN,
- alpha, beta, dimN, 1, 1);
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta) {
+ hl_matrix_mul_vector(
+ A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
}
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 92b28e4345..9d4ff08a78 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <sys/time.h>
#include <mutex>
#include "hl_cuda_cudnn.h"
@@ -22,9 +21,10 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
- "Specify cuDNN max workspace limit, in units MB, "
- "4096MB=4GB by default.");
+P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+ 4096,
+ "Specify cuDNN max workspace limit, in units MB, "
+ "4096MB=4GB by default.");
namespace dynload {
@@ -41,16 +41,15 @@ void* cudnn_dso_handle = nullptr;
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudnn_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
- &cudnn_dso_handle); \
- void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
- return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudnn_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
+ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
+ return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
+ } \
} __name; /* struct DynLoad__##__name */
#else
@@ -69,6 +68,7 @@ void* cudnn_dso_handle = nullptr;
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
+// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor) \
__macro(cudnnSetTensor4dDescriptorEx) \
@@ -141,58 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#undef CUDNN_DNN_ROUTINE_EACH
-
+// clang-format on
} /* namespace dynload */
/**
* Check built-in cudnn functions using glog and it **does not**
* support the << operator for more detailed error info.
*/
-#define CHECK_CUDNN(cudnnFunc) \
- do { \
- cudnnStatus_t cudnnStat = cudnnFunc; \
- CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
- << "Cudnn Error: " \
- << dynload::cudnnGetErrorString(cudnnStat); \
+#define CHECK_CUDNN(cudnnFunc) \
+ do { \
+ cudnnStatus_t cudnnStat = cudnnFunc; \
+ CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
+ << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
} while (0)
bool g_is_libcudnn_init = false;
int g_cudnn_lib_version = 0;
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc)
-{
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
}
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream)
-{
- size_t cudnn_dso_ver = dynload::cudnnGetVersion();
- size_t cudnn_dso_major = cudnn_dso_ver / 1000;
- size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
- // Compare cudnn header version with that of cudnn.so.
- CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
- (cudnn_cuh_major == cudnn_dso_major))
- << "[cudnn init] libcudnn v" << cudnn_dso_major <<
- " with header v" << cudnn_cuh_major << " unmatched!\n"
- << "PaddlePaddle Requirement: "
- << "(header v[2-3] with libcudnn v[2-3]) Or "
- << "(header v4 with libcudnn v4) Or "
- << "(header v5 with libcudnn v5).";
-
- CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
- << "cudnn v5 requires cuda version >= 7.5";
-
- CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
- CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
- g_is_libcudnn_init = true;
- g_cudnn_lib_version = cudnn_dso_ver;
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
+ size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+ size_t cudnn_dso_major = cudnn_dso_ver / 1000;
+ size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
+
+ // Compare cudnn header version with that of cudnn.so.
+ CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
+ (cudnn_cuh_major == cudnn_dso_major))
+ << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
+ << cudnn_cuh_major << " unmatched!\n"
+ << "PaddlePaddle Requirement: "
+ << "(header v[2-3] with libcudnn v[2-3]) Or "
+ << "(header v4 with libcudnn v4) Or "
+ << "(header v5 with libcudnn v5).";
+
+ CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+ << "cudnn v5 requires cuda version >= 7.5";
+
+ CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
+ CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
+
+ g_is_libcudnn_init = true;
+ g_cudnn_lib_version = cudnn_dso_ver;
}
-int hl_get_cudnn_lib_version() {
- return g_cudnn_lib_version;
-}
+int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
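hl_cudnn_init, reformatted above, derives the major version by integer-dividing cudnnGetVersion() by 1000 (cuDNN encodes 5.0.5 as 5005) and insists that the header and runtime library majors match, except that v2 and v3 are treated as interchangeable. A small self-contained check of that logic, using made-up version numbers:

```cpp
#include <cassert>
#include <cstddef>

// Mirrors the compatibility rule in hl_cudnn_init: majors must match,
// with the v2/v3 pair grandfathered in as mutually acceptable.
bool cudnn_versions_match(size_t dso_ver, size_t header_ver) {
  size_t dso_major = dso_ver / 1000;        // e.g. 5005 -> 5
  size_t header_major = header_ver / 1000;  // e.g. 5000 -> 5
  return (header_major < 4 && dso_major < 4) || (header_major == dso_major);
}

int main() {
  assert(cudnn_versions_match(5005, 5000));   // v5 lib, v5 header
  assert(cudnn_versions_match(2000, 3007));   // v2/v3 mix is allowed
  assert(!cudnn_versions_match(5005, 4007));  // v5 lib, v4 header: rejected
  return 0;
}
```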
void hl_conv_workspace(hl_tensor_descriptor input,
hl_tensor_descriptor output,
@@ -206,94 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t* bwdFilterLimitBytes) {
#if CUDNN_VERSION >= 4000
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
-
- // Specify workspace limit directly
- size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
- // cudnn convolution forward configuration
- cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
- fwdLimitBytes));
-
- // cudnn convolution backward data configuration
- cudnnFilterDescriptor_t bwd_data_filter_desc =
- GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t bwd_data_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bwd_data_grad_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t bwd_data_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
- bwdDataLimitBytes));
-
- // cudnn convolution backward filter configuration
- cudnnTensorDescriptor_t bwd_filter_src_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t bwd_filter_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t bwd_filter_grad_desc =
- GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
- t_resource.cudnn_handle,
- bwd_filter_src_desc,
- bwd_filter_diff_desc,
- bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
- t_resource.cudnn_handle, bwd_filter_src_desc,
- bwd_filter_diff_desc, bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
- bwdFilterLimitBytes));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+
+ // Specify workspace limit directly
+ size_t memoryLimitBytes =
+ (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+
+ // cudnn convolution forward configuration
+ cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
+ fwdLimitBytes));
+
+ // cudnn convolution backward data configuration
+ cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+ bwdDataLimitBytes));
+
+ // cudnn convolution backward filter configuration
+ cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+ bwdFilterLimitBytes));
#endif
}
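The workspace limit above comes from the cudnn_conv_workspace_limit_in_mb flag, converted to bytes with a 64-bit shift so that values like 4096 MB do not overflow a 32-bit int; cuDNN then returns the fastest algorithm whose workspace fits under that byte budget (the *_SPECIFY_WORKSPACE_LIMIT preference). Just the conversion arithmetic, as a sketch:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // As in hl_conv_workspace: megabytes to bytes via (1LL << 20).
  // The 1LL literal forces 64-bit math; 4096 * 2^20 would overflow int32.
  int64_t limit_mb = 4096;
  int64_t limit_bytes = (1LL << 20) * limit_mb;
  printf("%lld\n", (long long)limit_bytes);  // 4294967296, i.e. 4 GiB
  return 0;
}
```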
@@ -302,78 +294,75 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
int batch_size,
int feature_maps,
int height,
- int width)
-{
- CHECK_NOTNULL(image_desc);
+ int width) {
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- hl_desc->desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- batch_size,
- feature_maps,
- height,
- width));
-
- hl_desc->format = CUDNN_TENSOR_NCHW;
- hl_desc->data_type = data_type;
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
-
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width));
+
+ hl_desc->format = CUDNN_TENSOR_NCHW;
+ hl_desc->data_type = data_type;
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
+
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
- CHECK_NOTNULL(image_desc);
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
- hl_desc->data_type = data_type;
+ hl_desc->data_type = data_type;
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int batch_size,
int feature_maps,
int height,
- int width)
-{
- const int stride_w = 1;
- const int stride_h = width * stride_w;
- const int stride_c = height * stride_h;
- const int stride_n = feature_maps * stride_c;
- return hl_tensor_reshape(image_desc,
- batch_size,
- feature_maps,
- height,
- width,
- stride_n,
- stride_c,
- stride_h,
- stride_w);
+ int width) {
+ const int stride_w = 1;
+ const int stride_h = width * stride_w;
+ const int stride_c = height * stride_h;
+ const int stride_n = feature_maps * stride_c;
+ return hl_tensor_reshape(image_desc,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ stride_n,
+ stride_c,
+ stride_h,
+ stride_w);
}
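The four-argument hl_tensor_reshape above computes packed NCHW strides before delegating to the eight-argument overload: w varies fastest, rows advance by width, channels by height*width, and images by feature_maps*height*width. A quick check of that arithmetic:

```cpp
#include <cassert>

int main() {
  // Packed NCHW strides, exactly as derived in hl_tensor_reshape.
  int feature_maps = 3, height = 4, width = 5;
  const int stride_w = 1;
  const int stride_h = width * stride_w;         // 5
  const int stride_c = height * stride_h;        // 20
  const int stride_n = feature_maps * stride_c;  // 60 elements per image
  // Element (n, c, h, w) then lives at n*60 + c*20 + h*5 + w.
  assert(stride_n == feature_maps * height * width);
  return 0;
}
```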
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -384,45 +373,42 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int nStride,
int cStride,
int hStride,
- int wStride)
-{
- CHECK_NOTNULL(image_desc);
-
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
- hl_desc->data_type,
- batch_size,
- feature_maps,
- height,
- width,
- nStride,
- cStride,
- hStride,
- wStride));
-
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
+ int wStride) {
+ CHECK_NOTNULL(image_desc);
+
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
+ hl_desc->data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ nStride,
+ cStride,
+ hStride,
+ wStride));
+
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
}
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc)
-{
- CHECK_NOTNULL(image_desc);
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
- CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
- hl_desc->desc = NULL;
+ hl_desc->desc = NULL;
- free(image_desc);
+ free(image_desc);
}
-
void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
hl_pooling_mode_t mode,
int height,
@@ -430,99 +416,93 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
int height_padding,
int width_padding,
int stride_height,
- int stride_width)
-{
- cudnnPoolingMode_t cudnn_mode;
- switch (mode)
- {
- case HL_POOLING_MAX:
- cudnn_mode = CUDNN_POOLING_MAX;
- break;
- case HL_POOLING_AVERAGE:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
- break;
- case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
- break;
- default:
- LOG(FATAL) << "parameter mode error";
- }
-
- CHECK_NOTNULL(pooling_desc);
-
- cudnn_pooling_descriptor hl_pooling_desc =
- (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
- CHECK_NOTNULL(hl_pooling_desc);
-
- CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(
- hl_pooling_desc->desc,
- cudnn_mode,
+ int stride_width) {
+ cudnnPoolingMode_t cudnn_mode;
+ switch (mode) {
+ case HL_POOLING_MAX:
+ cudnn_mode = CUDNN_POOLING_MAX;
+ break;
+ case HL_POOLING_AVERAGE:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+ break;
+ case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+ break;
+ default:
+ LOG(FATAL) << "parameter mode error";
+ }
+
+ CHECK_NOTNULL(pooling_desc);
+
+ cudnn_pooling_descriptor hl_pooling_desc =
+ (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
+ CHECK_NOTNULL(hl_pooling_desc);
+
+ CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
+ cudnn_mode,
#if CUDNN_VERSION >= 5000
- CUDNN_PROPAGATE_NAN,
+ CUDNN_PROPAGATE_NAN,
#endif
- height,
- width,
- height_padding,
- width_padding,
- stride_height,
- stride_width));
-
- hl_pooling_desc->mode = cudnn_mode;
- hl_pooling_desc->window_height = height;
- hl_pooling_desc->window_width = width;
- hl_pooling_desc->stride_height = stride_height;
- hl_pooling_desc->stride_width = stride_width;
-
- *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
+ height,
+ width,
+ height_padding,
+ width_padding,
+ stride_height,
+ stride_width));
+
+ hl_pooling_desc->mode = cudnn_mode;
+ hl_pooling_desc->window_height = height;
+ hl_pooling_desc->window_width = width;
+ hl_pooling_desc->stride_height = stride_height;
+ hl_pooling_desc->stride_width = stride_width;
+
+ *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
}
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc)
-{
- CHECK_NOTNULL(pooling_desc);
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
+ CHECK_NOTNULL(pooling_desc);
- cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_NOTNULL(hl_pooling->desc);
+ cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
+ CHECK_NOTNULL(hl_pooling->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
- hl_pooling->desc = NULL;
+ hl_pooling->desc = NULL;
- free(pooling_desc);
+ free(pooling_desc);
}
void hl_pooling_forward(hl_tensor_descriptor input,
real* input_image,
hl_tensor_descriptor output,
real* output_image,
- hl_pooling_descriptor pooling)
-{
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(output_image);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingForward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- input_desc,
- input_image,
- &beta,
- output_desc,
- output_image));
- CHECK_SYNC("hl_pooling_forward failed");
+ hl_pooling_descriptor pooling) {
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(output_image);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ input_desc,
+ input_image,
+ &beta,
+ output_desc,
+ output_image));
+ CHECK_SYNC("hl_pooling_forward failed");
}
void hl_pooling_backward(hl_tensor_descriptor input,
@@ -531,94 +511,87 @@ void hl_pooling_backward(hl_tensor_descriptor input,
hl_tensor_descriptor output,
real* output_image,
real* output_image_grad,
- hl_pooling_descriptor pooling)
-{
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(input_image_grad);
- CHECK_NOTNULL(output_image);
- CHECK_NOTNULL(output_image_grad);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingBackward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- output_desc,
- output_image,
- output_desc,
- output_image_grad,
- input_desc,
- input_image,
- &beta,
- input_desc,
- input_image_grad));
+ hl_pooling_descriptor pooling) {
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(input_image_grad);
+ CHECK_NOTNULL(output_image);
+ CHECK_NOTNULL(output_image_grad);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ output_desc,
+ output_image,
+ output_desc,
+ output_image_grad,
+ input_desc,
+ input_image,
+ &beta,
+ input_desc,
+ input_image_grad));
CHECK_SYNC("hl_pooling_backward failed");
}
-
void hl_create_filter_descriptor(hl_filter_descriptor* filter,
int input_feature_maps,
int output_feature_maps,
int height,
- int width)
-{
- CHECK_NOTNULL(filter);
+ int width) {
+ CHECK_NOTNULL(filter);
- cudnn_filter_descriptor hl_filter =
- (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
- CHECK_NOTNULL(hl_filter);
+ cudnn_filter_descriptor hl_filter =
+ (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
+ CHECK_NOTNULL(hl_filter);
- CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
+ CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(
- hl_filter->desc,
- data_type,
+ CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
+ data_type,
#if CUDNN_VERSION >= 5000
- CUDNN_TENSOR_NCHW,
+ CUDNN_TENSOR_NCHW,
#endif
- output_feature_maps,
- input_feature_maps,
- height,
- width));
-
- hl_filter->data_type = data_type;
- hl_filter->output_feature_maps = output_feature_maps;
- hl_filter->input_feature_maps = input_feature_maps;
- hl_filter->filter_height = height;
- hl_filter->filter_width = width;
-
- *filter = (hl_filter_descriptor)hl_filter;
+ output_feature_maps,
+ input_feature_maps,
+ height,
+ width));
+
+ hl_filter->data_type = data_type;
+ hl_filter->output_feature_maps = output_feature_maps;
+ hl_filter->input_feature_maps = input_feature_maps;
+ hl_filter->filter_height = height;
+ hl_filter->filter_width = width;
+
+ *filter = (hl_filter_descriptor)hl_filter;
}
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
+ CHECK_NOTNULL(filter);
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter)
-{
- CHECK_NOTNULL(filter);
+ cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
+ CHECK_NOTNULL(hl_filter->desc);
- cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
- CHECK_NOTNULL(hl_filter->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
- CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
+ hl_filter->desc = NULL;
- hl_filter->desc = NULL;
-
- free(filter);
+ free(filter);
}
void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
@@ -627,38 +600,36 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
- int stride_width)
-{
- CHECK_NOTNULL(conv);
-
- cudnn_convolution_descriptor hl_conv =
- (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor));
- CHECK_NOTNULL(hl_conv);
-
- CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- hl_conv->desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
-
- *conv = (hl_convolution_descriptor)hl_conv;
+ int stride_width) {
+ CHECK_NOTNULL(conv);
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+ sizeof(_cudnn_convolution_descriptor));
+
+ CHECK_NOTNULL(hl_conv);
+ CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
+
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
+
+ *conv = (hl_convolution_descriptor)hl_conv;
}
void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
@@ -667,47 +638,44 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
- int stride_width)
-{
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(image);
- CHECK_NOTNULL(filter);
-
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- conv_desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
+ int stride_width) {
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(image);
+ CHECK_NOTNULL(filter);
+
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
}
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv)
-{
- CHECK_NOTNULL(conv);
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
+ CHECK_NOTNULL(conv);
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- CHECK_NOTNULL(hl_conv->desc);
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ CHECK_NOTNULL(hl_conv->desc);
- CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
- hl_conv->desc = NULL;
+ CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
+ hl_conv->desc = NULL;
- free(conv);
+ free(conv);
}
void hl_convolution_forward(hl_tensor_descriptor input,
@@ -720,87 +688,83 @@ void hl_convolution_forward(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convFwdAlgo) {
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_data);
- CHECK_NOTNULL(filter_data);
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- real alpha = 1.0f;
- real beta = 1.0f;
- CHECK_CUDNN(dynload::cudnnConvolutionForward(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- filter_desc,
- filter_data,
- conv_desc,
- static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
- gpuWorkSpace,
- sizeInBytes,
- &beta,
- dest_desc,
- output_data));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_data);
+ CHECK_NOTNULL(filter_data);
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ CHECK_CUDNN(dynload::cudnnConvolutionForward(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ filter_desc,
+ filter_data,
+ conv_desc,
+ static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
+ &beta,
+ dest_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward failed");
}
void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
real* bias_data,
hl_tensor_descriptor output,
- real* output_data)
-{
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_data);
- CHECK_NOTNULL(output_data);
-
- cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- real alpha = 1.0f;
- real beta = 1.0f;
-
- CHECK_CUDNN(dynload::cudnnAddTensor(
- t_resource.cudnn_handle,
+ real* output_data) {
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_data);
+ CHECK_NOTNULL(output_data);
+
+ cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+
+ CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
#if CUDNN_VERSION < 4000
- CUDNN_ADD_SAME_C,
+ CUDNN_ADD_SAME_C,
#endif
- &alpha,
- bias_desc,
- bias_data,
- &beta,
- output_desc,
- output_data));
+ &alpha,
+ bias_desc,
+ bias_data,
+ &beta,
+ output_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward_add_bias failed");
}
void hl_convolution_backward_bias(hl_tensor_descriptor bias,
real* bias_grad_data,
hl_tensor_descriptor output,
- real* output_grad_data)
-{
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_grad_data);
- CHECK_NOTNULL(output_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(
- t_resource.cudnn_handle,
- &alpha,
- diff_desc,
- output_grad_data,
- &beta,
- bias_desc,
- bias_grad_data));
+ real* output_grad_data) {
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_grad_data);
+ CHECK_NOTNULL(output_grad_data);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
+ &alpha,
+ diff_desc,
+ output_grad_data,
+ &beta,
+ bias_desc,
+ bias_grad_data));
CHECK_SYNC("hl_convolution_backward_bias failed");
}
@@ -814,38 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdFilterAlgo) {
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_grad_data);
+ CHECK_NOTNULL(filter_grad_data);
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_grad_data);
- CHECK_NOTNULL(filter_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- filter_grad_data));
+ &beta,
+ grad_desc,
+ filter_grad_data));
CHECK_SYNC("hl_convolution_backward_filter failed");
}
@@ -859,121 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdDataAlgo) {
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
- t_resource.cudnn_handle,
- &alpha,
- filter_desc,
- filter_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
+ t_resource.cudnn_handle,
+ &alpha,
+ filter_desc,
+ filter_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- input_data_grad));
+ &beta,
+ grad_desc,
+ input_data_grad));
CHECK_SYNC("hl_convolution_backward_data failed");
}
-
-void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width)
-{
+void hl_softmax_forward(real* input, real* output, int height, int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxForward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- input,
- &beta,
- t_resource.cudnn_desc,
- output));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ input,
+ &beta,
+ t_resource.cudnn_desc,
+ output));
CHECK_SYNC("hl_softmax_forward failed");
}
-void hl_softmax_backward(real *output_value,
- real *output_grad,
+void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
- int width)
-{
+ int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxBackward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- output_value,
- t_resource.cudnn_desc,
- output_grad,
- &beta,
- t_resource.cudnn_desc,
- output_grad));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ output_value,
+ t_resource.cudnn_desc,
+ output_grad,
+ &beta,
+ t_resource.cudnn_desc,
+ output_grad));
CHECK_SYNC("hl_softmax_backward failed");
}
void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {
+ real* savedMean,
+ real* savedVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != runningMean && NULL == runningInvVar) ||
(NULL == runningMean && NULL != runningInvVar)) {
LOG(FATAL) << "runningMean and runningInvVar can be NULL "
- << "but only at the same time.";
+ << "but only at the same time.";
}
if ((NULL != savedMean && NULL == savedVar) ||
(NULL == savedMean && NULL != savedVar)) {
@@ -987,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias, factor,
- runningMean, runningInvVar, epsilon, savedMean, savedVar));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ factor,
+ runningMean,
+ runningInvVar,
+ epsilon,
+ savedMean,
+ savedVar));
CHECK_SYNC("hl_batch_norm_forward_training failed");
#else
@@ -1000,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
- hl_tensor_descriptor outputDesc,
- real *output,
- hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedInvVar,
- double epsilon) {
+ real* input,
+ hl_tensor_descriptor outputDesc,
+ real* output,
+ hl_tensor_descriptor bnParamDesc,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedInvVar,
+ double epsilon) {
#if CUDNN_VERSION >= 4007
cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
@@ -1016,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias,
- estimatedMean, estimatedInvVar, epsilon));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ estimatedMean,
+ estimatedInvVar,
+ epsilon));
CHECK_SYNC("hl_batch_norm_forward_inference failed");
#else
@@ -1029,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {
+ real* savedMean,
+ real* savedInvVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != savedMean && NULL == savedInvVar) ||
(NULL == savedMean && NULL != savedInvVar)) {
@@ -1055,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
- t_resource.cudnn_handle, mode, &alpha, &beta,
- &alpha, &beta,
- xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
- bnDesc, scale, scaleGrad, biasGrad, epsilon,
- savedMean, savedInvVar));
+ CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ dyDesc,
+ outGrad,
+ dxDesc,
+ inGrad,
+ bnDesc,
+ scale,
+ scaleGrad,
+ biasGrad,
+ epsilon,
+ savedMean,
+ savedInvVar));
CHECK_SYNC("hl_batch_norm_backward failed");
#else
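A guard that recurs throughout the batch-norm functions above rejects pointer pairs where exactly one side is NULL (runningMean/runningInvVar, savedMean/savedVar, savedMean/savedInvVar). The condition amounts to equality of nullness; a compact equivalent, sketched with a hypothetical helper name:

```cpp
#include <cstdio>

// Hypothetical helper matching the hl_batch_norm_* guards:
// a pointer pair is valid only if both are set or both are NULL.
static bool null_consistent(const void* a, const void* b) {
  return (a == nullptr) == (b == nullptr);
}

int main() {
  float mean = 0.0f, var = 1.0f;
  printf("%d\n", null_consistent(&mean, &var));       // 1: both set
  printf("%d\n", null_consistent(nullptr, nullptr));  // 1: both NULL
  printf("%d\n", null_consistent(&mean, nullptr));    // 0: would LOG(FATAL)
  return 0;
}
```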
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index aa1d184a3e..6b71a53848 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-#include
+#include
#include
-#include
#include
-#include
+#include
+#include
#include
#include "hl_cuda.h"
#include "hl_cuda.ph"
-#include "hl_thread.ph"
#include "hl_dso_loader.h"
+#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
namespace dynload {
std::once_flag curand_dso_flag;
-void* curand_dso_handle = nullptr;
+void *curand_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -38,34 +37,35 @@ void* curand_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- curandStatus_t operator()(Args... args) { \
- typedef curandStatus_t (*curandFunc)(Args...); \
- std::call_once(curand_dso_flag, GetCurandDsoHandle, \
- &curand_dso_handle); \
- void* p_##__name = dlsym(curand_dso_handle, #__name); \
- return reinterpret_cast<curandFunc>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ curandStatus_t operator()(Args... args) { \
+ typedef curandStatus_t (*curandFunc)(Args...); \
+ std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+ void *p_##__name = dlsym(curand_dso_handle, #__name); \
+ return reinterpret_cast<curandFunc>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- curandStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ curandStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed curand functions in HPPL */
+// clang-format off
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator) \
__macro(curandSetStream) \
__macro(curandSetPseudoRandomGeneratorSeed)\
__macro(curandGenerateUniform) \
__macro(curandGenerateUniformDouble)
+// clang-format on
CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
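The // clang-format off and // clang-format on markers added around these macro tables tell the formatter to leave the enclosed lines verbatim; without them it would re-wrap the column-aligned trailing backslashes. The markers guard any region, for example:

```cpp
// clang-format off
#define CURAND_ROUTINE_EACH(__macro)  \
  __macro(curandCreateGenerator)      \
  __macro(curandSetStream)            \
  __macro(curandGenerateUniform)
// clang-format on
```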
@@ -73,7 +73,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef DYNAMIC_LOAD_CURAND_WRAP
std::once_flag cudart_dso_flag;
-void* cudart_dso_handle = nullptr;
+void *cudart_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -83,28 +83,28 @@ void* cudart_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudart_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast<cudart_func>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudart_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast<cudart_func>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
__macro(cudaHostAlloc) \
@@ -137,56 +137,57 @@ void* cudart_dso_handle = nullptr;
__macro(cudaGetErrorString) \
__macro(cudaProfilerStart) \
__macro(cudaProfilerStop)
+// clang-format on
+
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#undef CUDA_ROUTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
-} /* namespace dynload */
+} /* namespace dynload */
/**
* @brief global resource.
*/
-int g_system_device_num = 0; /* system device number */
-int device_num = 0; /* use device number */
-hl_device_prop *g_device; /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
+int g_system_device_num = 0; /* system device number */
+int device_num = 0; /* use device number */
+hl_device_prop *g_device; /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
int g_cuda_lib_version = 0;
/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256*4)
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
/**
* Check built-in cuda functions using glog; note that it **does not**
* support the << operator for more detailed error info.
*/
-#define CHECK_CUDA(cudaFunc) \
- do { \
- cudaError_t cudaStat = cudaFunc; \
- CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
- << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc) \
+ do { \
+ cudaError_t cudaStat = cudaFunc; \
+ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
+ << dynload::cudaGetErrorString(cudaStat); \
} while (0)
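CHECK_CUDA above keeps (as CHECK_CUDNN did earlier) the do { ... } while (0) wrapper through the reformat. The idiom turns a multi-statement macro into a single statement, so it composes safely with unbraced if/else. A minimal illustration:

```cpp
#include <cstdio>

// Without the do/while(0), the second puts() would escape the 'if' and
// the trailing semicolon would break an 'else' that followed.
#define LOG_TWICE(msg) \
  do {                 \
    puts(msg);         \
    puts(msg);         \
  } while (0)

int main() {
  bool verbose = true;
  if (verbose)
    LOG_TWICE("hello");  // expands to one statement; else-safe
  else
    puts("quiet");
  return 0;
}
```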
/**
* @brief thread resource.
*/
-__thread _hl_thread_resource t_resource = {
- {0}, /* stream */
- 0, /* handle */
- 0, /* gen */
- 0, /* cudnn_handle */
- 0, /* cudnn_desc */
- NULL, /* gen_mutex */
- NULL, /* gpu_mem */
- NULL, /* cpu_mem */
- 0, /* event */
- -1, /* device */
- 0, /* major */
- false}; /* is_init */
+__thread _hl_thread_resource t_resource = {{0}, /* stream */
+ 0, /* handle */
+ 0, /* gen */
+ 0, /* cudnn_handle */
+ 0, /* cudnn_desc */
+ NULL, /* gen_mutex */
+ NULL, /* gpu_mem */
+ NULL, /* cpu_mem */
+ 0, /* event */
+ -1, /* device */
+ 0, /* major */
+ false}; /* is_init */
__thread cudaStream_t default_stream = 0;
__thread bool g_sync_flag = true;
@@ -200,18 +201,17 @@ inline pid_t gettid() {
uint64_t tid;
pthread_threadid_np(NULL, &tid);
#else
- #ifndef __NR_gettid
- #define __NR_gettid 224
- #endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
pid_t tid = syscall(__NR_gettid);
#endif
- CHECK_NE(tid, -1);
- return tid;
+ CHECK_NE((int)tid, -1);
+ return tid;
}
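gettid() above has three branches: pthread_threadid_np on macOS, the raw Linux syscall elsewhere, and a fallback definition of __NR_gettid for old headers; the (int) cast added to CHECK_NE keeps the comparison's types consistent across those branches. A Linux-only sketch of the syscall path:

```cpp
#include <sys/syscall.h>
#include <unistd.h>

// glibc of this era exposed no gettid() wrapper, so the thread id is
// fetched with the raw syscall; 224 is the legacy x86 syscall number
// used as a fallback when the headers do not define __NR_gettid.
#ifndef __NR_gettid
#define __NR_gettid 224
#endif

static pid_t my_gettid() { return static_cast<pid_t>(syscall(__NR_gettid)); }
```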
void hl_init(int device) {
- CHECK(hl_start_flag)
- << "[Init failed] hl_start() did not succeed.";
+ CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
/* thread has been initialized */
if (true == t_resource.is_init) {
@@ -222,16 +222,16 @@ void hl_init(int device) {
/* create thread device resources */
char *tmp;
thread_device_resources device_res;
- tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) +
- device_num*sizeof(_thread_device_resources));
+ tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+ device_num * sizeof(_thread_device_resources));
CHECK_NOTNULL(tmp);
- t_device = (thread_device_resources*)tmp;
- device_res = (thread_device_resources)((char*)tmp +
- g_system_device_num*sizeof(thread_device_resources*));
- memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*));
+ t_device = (thread_device_resources *)tmp;
+ device_res = (thread_device_resources)(
+ (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+ memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
- char *tmp_stream = (char *)
- malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
int num = 0;
@@ -241,8 +241,9 @@ void hl_init(int device) {
}
t_device[dev] = &device_res[num];
- t_device[dev]->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ t_device[dev]->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
hl_create_thread_resources(dev, t_device[dev]);
num++;
@@ -268,14 +269,14 @@ void hl_fini() {
t_resource.stream[i] = 0;
}
- char* tmp = (char*)t_device;
- char* tmp_stream = NULL;
+ char *tmp = (char *)t_device;
+ char *tmp_stream = NULL;
for (int dev = 0; dev < g_system_device_num; dev++) {
if (!t_device[dev]) {
continue;
}
if (!tmp_stream) {
- tmp_stream = (char*)t_device[dev]->stream;
+ tmp_stream = (char *)t_device[dev]->stream;
}
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
@@ -292,9 +293,7 @@ void hl_fini() {
t_resource.is_init = false;
}
-int hl_get_device_count() {
- return device_num;
-}
+int hl_get_device_count() { return device_num; }
void hl_set_device(int device) {
if (device == t_resource.device) {
@@ -302,7 +301,7 @@ void hl_set_device(int device) {
}
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device: " << device << " is not specified in startup.";
+ << "Device: " << device << " is not specified in startup.";
CHECK_CUDA(dynload::cudaSetDevice(device));
@@ -314,11 +313,11 @@ void hl_set_device(int device) {
if (true == t_resource.is_init) {
for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
t_resource.stream[i] =
- t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
+ t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
}
t_resource.gpu_mem = t_device[device]->gpu_mem;
t_resource.cpu_mem = t_device[device]->cpu_mem;
- t_resource.event = t_device[device]->mem_event;
+ t_resource.event = t_device[device]->mem_event;
}
t_resource.handle = g_device[device]->device_resources->handle;
@@ -336,11 +335,11 @@ int hl_get_device() {
return device;
}
-void* hl_malloc_device(size_t size) {
+void *hl_malloc_device(size_t size) {
void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size));
+ CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
return dest_d;
}
@@ -350,14 +349,15 @@ void hl_free_mem_device(void *dest_d) {
cudaError_t err = dynload::cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ << hl_get_device_error_string();
}
-void* hl_malloc_host(size_t size) {
+void *hl_malloc_host(size_t size) {
void *dest_h;
CHECK(size) << __func__ << ": the size for host memory is 0, please check.";
- CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault));
+ CHECK_CUDA(
+ dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h;
}
@@ -366,8 +366,8 @@ void hl_free_mem_host(void *dest_h) {
CHECK_NOTNULL(dest_h);
cudaError_t err = dynload::cudaFreeHost(dest_h);
- CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
+ << hl_get_device_error_string();
}
void hl_memcpy(void *dst, void *src, size_t size) {
@@ -389,8 +389,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
}
CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size,
- cudaMemcpyHostToDevice));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
}
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -399,8 +398,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size,
- cudaMemcpyDeviceToHost));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
}
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -409,8 +407,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size,
- cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(
+ dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
}
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -424,8 +422,8 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault,
- cu_stream));
+ CHECK_CUDA(
+ dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
}
void hl_start() {
@@ -436,8 +434,8 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer;
- CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device,
- peerDevice));
+ CHECK_CUDA(
+ dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) {
return true;
@@ -479,32 +477,32 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create curand gen */
CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
- CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand init failed.";
+ CURAND_RNG_PSEUDO_DEFAULT),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand init failed.";
- CHECK_EQ(dynload::curandSetStream(device_res->gen,
- device_res->stream[0]), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand set stream failed!";
+ CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand set stream failed!";
/* create cudnn handle */
hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
int seed = gettid();
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- device_res->gen, seed+device), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
+ seed + device),
+ CURAND_STATUS_SUCCESS);
- device_res->gen_mutex =
- (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
+ device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL);
CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
}
-int hl_get_cuda_version() {
- return g_cuda_lib_version;
-}
+int hl_get_cuda_version() { return g_cuda_lib_version; }
-void hl_create_thread_resources(int device, thread_device_resources device_res) {
+void hl_create_thread_resources(int device,
+ thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device));
/* create thread stream */
@@ -513,15 +511,15 @@ void hl_create_thread_resources(int device, thread_device_resources device_res)
}
/* allocate device memory */
- device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
+ device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
/* allocate host memory */
- device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
+ device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
}
-void hl_specify_devices_start(int* device, int number) {
+void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return;
/* 1. get the number of devices */
@@ -533,20 +531,19 @@ void hl_specify_devices_start(int* device, int number) {
/* 2. check device & create device property table */
CHECK_LE(number, g_system_device_num)
- << "[Start failed] System does not have enough device. "
- << "Device number: " << g_system_device_num
- << "Input number: " << number;
+ << "[Start failed] System does not have enough device. "
+ << "Device number: " << g_system_device_num << "Input number: " << number;
char *tmp;
hl_device_prop device_prop;
- tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) +
- number*sizeof(_hl_device_prop));
+ tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
+ number * sizeof(_hl_device_prop));
CHECK(tmp) << "[Start failed] System memory is not enough.";
- g_device = (hl_device_prop*)tmp;
- device_prop = (hl_device_prop)((char*)tmp +
- g_system_device_num*sizeof(hl_device_prop*));
- memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*));
+ g_device = (hl_device_prop *)tmp;
+ device_prop = (hl_device_prop)(
+ (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+ memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
int num = 0;
for (int i = 0; i < number; i++) {
int dev;
@@ -557,13 +554,13 @@ void hl_specify_devices_start(int* device, int number) {
}
CHECK_LT(dev, g_system_device_num)
- << "[Start failed] The specified device number is "
- << "out of range. Max device number: " << g_system_device_num - 1
- << " Specified devcie number: "<< dev;
+ << "[Start failed] The specified device number is "
+ << "out of range. Max device number: " << g_system_device_num - 1
+ << " Specified devcie number: " << dev;
if (g_device[dev]) {
/* Warning */
- LOG(WARNING) <<"[Warning] Repeat specify device: " << dev;
+ LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
continue;
}
@@ -574,11 +571,11 @@ void hl_specify_devices_start(int* device, int number) {
device_num = num;
/* 3. create global device resources */
- char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources));
+ char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
CHECK_NOTNULL(tmp_res);
- char *tmp_stream =
- (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
num = 0;
@@ -587,10 +584,11 @@ void hl_specify_devices_start(int* device, int number) {
continue;
}
- g_device[i]->device_resources = (global_device_resources)(tmp_res +
- num*sizeof(_global_device_resources));
- g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ g_device[i]->device_resources = (global_device_resources)(
+ tmp_res + num * sizeof(_global_device_resources));
+ g_device[i]->device_resources->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
hl_create_global_resources(g_device[i]);
num++;
@@ -600,9 +598,9 @@ void hl_specify_devices_start(int* device, int number) {
hl_start_flag = true;
/* set default device */
if (device == NULL) {
- hl_set_device(0);
+ hl_set_device(0);
} else {
- hl_set_device(device[0]);
+ hl_set_device(device[0]);
}
}
@@ -610,35 +608,31 @@ void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ(
#ifndef PADDLE_TYPE_DOUBLE
- dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else
- dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
#endif
- CURAND_STATUS_SUCCESS);
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
CHECK_SYNC("hl_rand failed");
}
void hl_srand(unsigned int seed) {
pthread_mutex_lock(t_resource.gen_mutex);
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- t_resource.gen, seed), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
}
-void hl_set_sync_flag(bool flag) {
- g_sync_flag = flag;
-}
+void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-bool hl_get_sync_flag() {
- return g_sync_flag;
-}
+bool hl_get_sync_flag() { return g_sync_flag; }
void hl_stream_synchronize(hl_stream_t stream) {
cudaStream_t cu_stream;
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
@@ -647,8 +641,8 @@ void hl_stream_synchronize(hl_stream_t stream) {
void hl_create_event(hl_event_t *event) {
CHECK_NOTNULL(event);
- struct _hl_event_st* st_event =
- (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st));
+ struct _hl_event_st *st_event =
+ (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
@@ -660,8 +654,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start);
CHECK_NOTNULL(end);
- CHECK_CUDA(dynload::cudaEventElapsedTime(&time,
- start->cu_event, end->cu_event));
+ CHECK_CUDA(
+ dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time;
}
@@ -669,24 +663,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaEventRecord(
- event->cu_event, cu_stream));
+ CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
}
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaStreamWaitEvent(
- cu_stream, event->cu_event, 0));
+ CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
}
void hl_destroy_event(hl_event_t event) {
@@ -705,15 +697,15 @@ void hl_event_synchronize(hl_event_t event) {
void hl_get_device_name(char *name, int len, int device) {
CHECK_NOTNULL(name);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
- strncpy(name, g_device[device]->device_name , len);
+ strncpy(name, g_device[device]->device_name, len);
}
void hl_get_device_memory(size_t *mem_size, int device) {
CHECK_NOTNULL(mem_size);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*mem_size = g_device[device]->device_mem;
}
@@ -722,31 +714,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
CHECK_NOTNULL(major);
CHECK_NOTNULL(minor);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device << ") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*major = g_device[device]->major;
*minor = g_device[device]->minor;
}
-int hl_get_device_last_error() {
- return (int)dynload::cudaGetLastError();
-}
+int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
-const char* hl_get_device_error_string() {
+const char *hl_get_device_error_string() {
cudaError_t err = dynload::cudaGetLastError();
return dynload::cudaGetErrorString(err);
}
-const char* hl_get_device_error_string(size_t err) {
+const char *hl_get_device_error_string(size_t err) {
return dynload::cudaGetErrorString((cudaError_t)err);
}
-void hl_device_synchronize() {
- CHECK_CUDA(dynload::cudaDeviceSynchronize());
-}
+void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
void hl_set_device_flags_block() {
- CHECK_CUDA(dynload::cudaSetDeviceFlags(
- cudaDeviceScheduleBlockingSync));
+ CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
}
bool hl_cuda_event_is_ready(hl_event_t event) {
@@ -759,11 +746,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
return true;
}
-void hl_profiler_start() {
- CHECK_CUDA(dynload::cudaProfilerStart());
-}
-
-void hl_profiler_end() {
- CHECK_CUDA(dynload::cudaProfilerStop());
-}
+void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index 27bbd03bc3..ff6b830b7a 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifdef PADDLE_USE_DSO
#include
@@ -29,26 +28,26 @@ limitations under the License. */
namespace dynload {
extern std::once_flag cudart_dso_flag;
-extern void* cudart_dso_handle;
+extern void *cudart_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cuda routine
* via operator overloading.
**/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- __type operator()(Args... args) { \
- typedef __type (*cudartFunc)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ __type operator()(Args... args) { \
+ typedef __type (*cudartFunc)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
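
The macro above generates, for each listed routine, a functor that resolves the real symbol lazily through dlsym on first use. The following is a minimal standalone sketch of that same pattern, using libm's sinf purely as an illustrative stand-in (the names dso_flag, dso_handle, and dyn_sinf are not part of this patch):

    #include <dlfcn.h>
    #include <mutex>

    static std::once_flag dso_flag;
    static void *dso_handle = nullptr;

    // Mirrors DynLoad__<name>: open the library once, resolve the symbol,
    // then forward the call with the original signature.
    struct DynLoadSinf {
      template <typename... Args>
      float operator()(Args... args) {
        typedef float (*funcType)(Args...);
        std::call_once(dso_flag,
                       [] { dso_handle = dlopen("libm.so.6", RTLD_LAZY); });
        void *p = dlsym(dso_handle, "sinf");
        return reinterpret_cast<funcType>(p)(args...);
      }
    } dyn_sinf;

    // Usage: float y = dyn_sinf(0.5f);  // calls the dlsym-resolved sinf
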
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaLaunch, cudaError_t) \
__macro(cudaSetupArgument, cudaError_t) \
@@ -61,16 +60,17 @@ extern void* cudart_dso_handle;
__macro(__cudaInitModule, char) \
__macro(__cudaRegisterTexture, void) \
__macro(__cudaRegisterSurface, void)
+// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#if CUDART_VERSION >= 7000
- DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
+DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif
#undef CUDA_ROUNTINE_EACH
-} /* namespace dynload */
+} /* namespace dynload */
#if CUDART_VERSION >= 7000
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
@@ -78,131 +78,120 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
dim3 blockDim,
void **args,
size_t sharedMem,
- cudaStream_t stream)
-{
- return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
+ cudaStream_t stream) {
+ return dynload::cudaLaunchKernel(
+ func, gridDim, blockDim, args, sharedMem, stream);
}
#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func)
-{
+__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
return dynload::cudaLaunch(func);
}
__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
size_t size,
- size_t offset)
-{
+ size_t offset) {
return dynload::cudaSetupArgument(arg, size, offset);
}
__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim,
size_t sharedMem,
- cudaStream_t stream)
-{
- return dynload::cudaConfigureCall(gridDim, blockDim,
- sharedMem, stream);
+ cudaStream_t stream) {
+ return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
}
extern "C" {
-void** CUDARTAPI __cudaRegisterFatBinary(
- void *fatCubin
-)
-{
+void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
return dynload::__cudaRegisterFatBinary(fatCubin);
-
}
-void CUDARTAPI __cudaUnregisterFatBinary(
- void **fatCubinHandle
-)
-{
+void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterFunction(
- void **fatCubinHandle,
- const char *hostFun,
- char *deviceFun,
- const char *deviceName,
- int thread_limit,
- uint3 *tid,
- uint3 *bid,
- dim3 *bDim,
- dim3 *gDim,
- int *wSize
-) {
- return dynload::__cudaRegisterFunction(
- fatCubinHandle, hostFun, deviceFun, deviceName,
- thread_limit, tid, bid, bDim, gDim, wSize);
+void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
+ const char *hostFun,
+ char *deviceFun,
+ const char *deviceName,
+ int thread_limit,
+ uint3 *tid,
+ uint3 *bid,
+ dim3 *bDim,
+ dim3 *gDim,
+ int *wSize) {
+ return dynload::__cudaRegisterFunction(fatCubinHandle,
+ hostFun,
+ deviceFun,
+ deviceName,
+ thread_limit,
+ tid,
+ bid,
+ bDim,
+ gDim,
+ wSize);
}
-void CUDARTAPI __cudaRegisterVar(
- void **fatCubinHandle,
- char *hostVar,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterVar(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, ext, size, constant, global);
+void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
+ char *hostVar,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterVar(fatCubinHandle,
+ hostVar,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-
-
-extern void CUDARTAPI __cudaRegisterManagedVar(
- void **fatCubinHandle,
- void **hostVarPtrAddress,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterManagedVar(
- fatCubinHandle, hostVarPtrAddress, deviceAddress,
- deviceName, ext, size, constant, global);
+extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
+ void **hostVarPtrAddress,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterManagedVar(fatCubinHandle,
+ hostVarPtrAddress,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-char CUDARTAPI __cudaInitModule(
- void **fatCubinHandle
-) {
+char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
return dynload::__cudaInitModule(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterTexture(
- void **fatCubinHandle,
- const struct textureReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int norm,
- int ext
-) {
+void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
+ const struct textureReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int norm,
+ int ext) {
return dynload::__cudaRegisterTexture(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, norm, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
}
-void CUDARTAPI __cudaRegisterSurface(
- void **fatCubinHandle,
- const struct surfaceReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int ext
-) {
+void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
+ const struct surfaceReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int ext) {
return dynload::__cudaRegisterSurface(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
}
} /* extern "C" */
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index b564b96903..1a3ce08619 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -12,27 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
-P_DEFINE_string(cudnn_dir, "",
+P_DEFINE_string(cudnn_dir,
+ "",
"Specify path for loading libcudnn.so. For instance, "
- "/usr/local/cudnn/lib64. If empty [default], dlopen will search "
- "cudnn from LD_LIBRARY_PATH");
+ "/usr/local/cudnn/lib. If empty [default], dlopen "
+ "will search cudnn from LD_LIBRARY_PATH");
-P_DEFINE_string(cuda_dir, "",
+P_DEFINE_string(cuda_dir,
+ "",
"Specify path for loading cuda library, such as libcublas, "
- "libcurand. For instance, /usr/local/cuda/lib64. "
- "(Note: libcudart can not be specified by cuda_dir, since some "
+ "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+ "libcudart can not be specified by cuda_dir, since some "
"build-in function in cudart already ran before main entry). "
- "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH");
+ "If default, dlopen will search cuda from LD_LIBRARY_PATH");
-static inline std::string join(const std::string& part1, const std::string& part2) {
+static inline std::string join(const std::string& part1,
+ const std::string& part2) {
// directory separator
const char sep = '/';
-
if (!part2.empty() && part2.front() == sep) {
return part2;
}
@@ -46,100 +47,115 @@ static inline std::string join(const std::string& part1, const std::string& part
return ret;
}
-static inline void GetDsoHandleFromDefaultPath(
- std::string& dso_path, void** dso_handle, int dynload_flags) {
- VLOG(3) << "Try to find cuda library: " << dso_path
- << " from default system path.";
- // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+ void** dso_handle,
+ int dynload_flags) {
+ VLOG(3) << "Try to find cuda library: " << dso_path
+ << " from default system path.";
+ // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+ *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring System Integrity Projection (SIP), if dso_handle
+// is null, search from default package path in Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+ if (nullptr == *dso_handle) {
+ dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
- // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
- // bring System Integrity Projection (SIP), if dso_handle
- // is null, search from default package path in Mac OS.
- #if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
- dso_path = join("/usr/local/cuda/lib/", dso_path);
- *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
- if (nullptr == *dso_handle) {
- if (dso_path == "libcudnn.dylib") {
- LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
- << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
- << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
- << "/usr/local/cuda/lib/libcudnn*";
- }
- }
- }
- #endif
+ if (dso_path == "libcudnn.dylib") {
+ LOG(FATAL)
+ << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
+ << "For instance, sudo tar -xzf "
+ "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
+ << "/usr/local \n sudo chmod a+r "
+ "/usr/local/cuda/include/cudnn.h " // NOLINT
+ << "/usr/local/cuda/lib/libcudnn*";
+ }
+ }
+ }
+#endif
}
-static inline void GetDsoHandleFromSearchPath(
- const std::string& search_root,
- const std::string& dso_name,
- void** dso_handle) {
- int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
- *dso_handle = nullptr;
-
- std::string dlPath = dso_name;
- if (search_root.empty()) {
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- } else {
- // search xxx.so from custom path
- dlPath = join(search_root, dso_name);
- *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
- // if not found, search from default path
- if (nullptr == dso_handle) {
- LOG(WARNING) << "Failed to find cuda library: " << dlPath;
- dlPath = dso_name;
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- }
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+ const std::string& dso_name,
+ void** dso_handle) {
+ int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+ *dso_handle = nullptr;
+
+ std::string dlPath = dso_name;
+ if (search_root.empty()) {
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+ } else {
+ // search xxx.so from custom path
+ dlPath = join(search_root, dso_name);
+ *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+ // if not found, search from default path
+ if (nullptr == *dso_handle) {
+ LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+ dlPath = dso_name;
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
+ }
- CHECK(nullptr != *dso_handle)
- << "Failed to find cuda library: " << dlPath << std::endl
- << "Please specify its path correctly using one of the following ideas: \n"
-
- << "Idea 1. set cuda and cudnn lib path at runtime. "
- << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
- << "For instance, issue command: paddle train --use_gpu=1 "
- << "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
-
- << "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
- << "DYLD_LIBRARY_PATH on Mac OS. \n"
- << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
-
- << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
- << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
- << "always work well.";
+ CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
+ << std::endl
+ << "Please specify its path correctly using "
+ "one of the following ways: \n" // NOLINT
+
+ << "Method 1. set cuda and cudnn lib path at "
+ "runtime. "
+ << "http://www.paddlepaddle.org/doc/ui/"
+ "cmd_argument/"
+ "argument_outline.html \n" // NOLINT
+ << "For instance, issue command: paddle train "
+ "--use_gpu=1 "
+ << "--cuda_dir=/usr/local/cuda/lib64 "
+ "--cudnn_dir=/usr/local/cudnn/lib "
+ "...\n" // NOLINT
+
+ << "Method 2. set environment variable "
+ "LD_LIBRARY_PATH on Linux or "
+ << "DYLD_LIBRARY_PATH on Mac OS. \n"
+ << "For instance, issue command: export "
+ "LD_LIBRARY_PATH=... \n"
+
+ << "Note: After Mac OS 10.11, using the "
+ "DYLD_LIBRARY_PATH is impossible "
+ << "unless System Integrity Protection (SIP) "
+ "is disabled. However, "
+ "method 1 " // NOLINT
+ << "always work well.";
}
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
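
The control flow these helpers implement is: try the user-specified directory first, then fall back to the dynamic loader's default search (LD_LIBRARY_PATH and friends). A condensed sketch of that logic under POSIX dlopen, with the logging and fatal checks omitted (openWithFallback is an illustrative name, not the patch's API):

    #include <dlfcn.h>
    #include <string>

    void *openWithFallback(const std::string &root, const std::string &name) {
      int flags = RTLD_LAZY | RTLD_LOCAL;
      if (!root.empty()) {
        std::string path = root + "/" + name;  // custom directory first
        if (void *h = dlopen(path.c_str(), flags)) return h;
      }
      return dlopen(name.c_str(), flags);  // then default search paths
    }
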
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc
index 76d48c4a9b..f4bf888bab 100644
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
@@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "avx_mathfun.h"
namespace hppl {
-__m256 exp(__m256 a) {
- return exp256_ps(a);
-}
+__m256 exp(__m256 a) { return exp256_ps(a); }
-__m256 log(__m256 a) {
- return log256_ps(a);
-}
+__m256 log(__m256 a) { return log256_ps(a); }
-__m256 sin(__m256 a) {
- return sin256_ps(a);
-}
+__m256 sin(__m256 a) { return sin256_ps(a); }
-__m256 cos(__m256 a) {
- return cos256_ps(a);
-}
+__m256 cos(__m256 a) { return cos256_ps(a); }
} // namespace hppl
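
These one-liners just give the AVX routines from avx_mathfun.h ordinary names inside namespace hppl. A usage sketch, assuming an AVX-capable build (e.g. -mavx) and that the hppl declarations above are visible:

    #include <immintrin.h>

    // Eight packed floats in, lane-wise exp out via exp256_ps.
    __m256 demoExp() {
      __m256 x = _mm256_set1_ps(1.0f);
      return hppl::exp(x);
    }
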
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index adc88d60dd..d52b2a1df0 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -21,8 +20,7 @@ limitations under the License. */
using std::chrono::high_resolution_clock;
int64_t getCurrentTimeStick() {
- high_resolution_clock::time_point tp = high_resolution_clock::now();
- high_resolution_clock::duration dtn = tp.time_since_epoch();
- return dtn.count();
+ high_resolution_clock::time_point tp = high_resolution_clock::now();
+ high_resolution_clock::duration dtn = tp.time_since_epoch();
+ return dtn.count();
}
-
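
Note that getCurrentTimeStick() returns raw high_resolution_clock ticks since the epoch, so differences are only meaningful against the same clock. A small sketch converting a tick delta back to microseconds (elapsedUs is illustrative, not part of the patch):

    #include <chrono>
    #include <cstdint>

    int64_t elapsedUs(int64_t startTick, int64_t endTick) {
      using clock = std::chrono::high_resolution_clock;
      clock::duration d(endTick - startTick);  // re-wrap raw ticks
      return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
    }
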
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 27eed75d4d..f1bb94216c 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -51,12 +51,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
* @brief Macro for registering a derived activation class
*/
#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \
- }; \
+ } \
+ ; \
const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
#ACTIVATION_NAME; \
static InitFunction __reg_activation__##ACTIVATION_NAME([] { \
- gActivationRegistrar.registerClass< \
- ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \
+ gActivationRegistrar \
+ .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>( \
+ #ACTIVATION_NAME); \
});
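
END_DEFINE_ACTIVATION closes the class, defines its static name, and registers a factory at static-initialization time. A simplified sketch of that self-registration idiom (the registry map, InitFn, and the Activation types here are illustrative stand-ins, not Paddle's ClassRegistrar/InitFunction):

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct Activation { virtual ~Activation() {} };

    // Name -> factory map standing in for ClassRegistrar.
    static std::map<std::string, std::function<Activation *()>> &registry() {
      static std::map<std::string, std::function<Activation *()>> r;
      return r;
    }

    // Runs its argument during static initialization, like InitFunction.
    struct InitFn {
      explicit InitFn(std::function<void()> f) { f(); }
    };

    struct TanhActivation : Activation {};
    static InitFn regTanh(
        [] { registry()["tanh"] = [] { return new TanhActivation(); }; });
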
/**
@@ -111,14 +113,22 @@ void backward(Argument& act) {
outputG->softmaxBackward(*outputV);
} else {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(),
+ Matrix::resizeOrCreate(sftMaxDot_,
+ outputG->getHeight(),
outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
- Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1,
- /* trans */ false, useGpu(act.deviceId));
+ /* trans */ false,
+ useGpu(act.deviceId));
+ Matrix::resizeOrCreate(sftMaxSum_,
+ outputG->getHeight(),
+ 1,
+ /* trans */ false,
+ useGpu(act.deviceId));
if (!one_ || one_->getWidth() != outputG->getWidth()) {
- Matrix::resizeOrCreate(one_, 1, outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(one_,
+ 1,
+ outputG->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
one_->one();
}
@@ -130,7 +140,6 @@ void backward(Argument& act) {
}
END_DEFINE_ACTIVATION(softmax)
-
/**
* @brief Sequence_softmax Activation
* @note Softmax on all frames of one sequence.
@@ -146,10 +155,16 @@ void forward(Argument& act) {
CHECK_EQ(act.value->getWidth(), 1UL);
if (!argument_.value) {
- argument_.value = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
- argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
+ argument_.value = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
+ argument_.grad = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
}
auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
@@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu)
BEGIN_DEFINE_ACTIVATION(abs)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->abs(*act.value);
@@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs)
BEGIN_DEFINE_ACTIVATION(square)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->square(*act.value);
@@ -317,8 +338,11 @@ END_DEFINE_ACTIVATION(exponential)
BEGIN_DEFINE_ACTIVATION(log)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
@@ -333,11 +357,9 @@ ActivationFunction* ActivationFunction::create(const std::string& type) {
std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
std::vector<std::string> types;
- gActivationRegistrar.forEachType([&](const std::string& type) {
- types.push_back(type);
- });
+ gActivationRegistrar.forEachType(
+ [&](const std::string& type) { types.push_back(type); });
return types;
}
-
} // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index c483372256..e9ed5c619a 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include <string>
#include <vector>
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 2cfb5a3a18..e6cc4a246a 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "DataProvider.h"
#include "paddle/utils/Util.h"
@@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
}
}
-DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
bool useGpu,
int64_t batchSize) {
batchSize_ = batchSize;
@@ -155,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() {
}
ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-DataProvider::registrar_;
+ DataProvider::registrar_;
DataProvider* DataProvider::create(const DataConfig& config,
const ModelConfig& modelConfig,
@@ -182,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
for (int i = 0; i < config_.constant_slots_size(); ++i) {
MemoryHandlePtr handle =
constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
- Matrix::resizeOrCreate(constantSlots[i], batchSize,
+ Matrix::resizeOrCreate(constantSlots[i],
+ batchSize,
1, // = width
false, // = trans
useGpu_); // = useGpu
@@ -216,7 +216,8 @@ void DataProvider::initAsyncLoader() {
}
SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
- bool useGpu, bool withInfo)
+ bool useGpu,
+ bool withInfo)
: DataProvider(config, useGpu) {
/* initialize the size of a sample, and the buffer */
sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
@@ -337,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() {
sampleNumInBuf_ =
n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
hInputLabelBuf_->getData() + n,
- hInputInfoBuf_->getData() + n, bufferCapacity_ - n);
+ hInputInfoBuf_->getData() + n,
+ bufferCapacity_ - n);
/* for stachastic gradient training */
if (!skipShuffle_) {
@@ -357,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
SimpleDataProvider::~SimpleDataProvider() {}
-int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info,
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) {
(void)info;
int64_t n = std::min(labels_.size() - currentSampleIndex_, size);
- memcpy(data, &data_[currentSampleIndex_ * sampleDim_],
+ memcpy(data,
+ &data_[currentSampleIndex_ * sampleDim_],
n * sampleDim_ * sizeof(real));
memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
currentSampleIndex_ += n;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 112e45de1c..8b7fb27f82 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -44,15 +43,15 @@ namespace paddle {
* @brief Macro for registering a data provider. The class type should contain
* a consturctor with parameter (DataConfig, bool).
*/
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
- static InitFunction __reg_type_##__type_name([]() {\
- DataProvider::registrar_.registerClass(\
- #__type_name, \
- [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
- DataProvider* dp = new __class_name (conf, useGpu);\
- return dp;\
- });\
-})
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \
+ static InitFunction __reg_type_##__type_name([]() { \
+ DataProvider::registrar_.registerClass( \
+ #__type_name, \
+ [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+ DataProvider* dp = new __class_name(conf, useGpu); \
+ return dp; \
+ }); \
+ })
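
Substituting the arguments makes the reformatted macro easier to read; REGISTER_DATA_PROVIDER(simple, SimpleDataProvider), for example, expands to roughly:

    static InitFunction __reg_type_simple([]() {
      DataProvider::registrar_.registerClass(
          "simple",
          [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider * {
            DataProvider *dp = new SimpleDataProvider(conf, useGpu);
            return dp;
          });
    });
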
/**
* @def REGISTER_DATA_PROVIDER_EX
@@ -61,8 +60,8 @@ namespace paddle {
*/
#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \
static InitFunction __reg_type_##__type_name([] { \
- DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-})
+ DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+ })
class DataBatch;
class BufferBatch;
@@ -181,7 +180,8 @@ public:
* @param[in] size DataBatch.getSize()
* @param[in] dataId sub dataprovider id (in MultiDataProvider)
*/
- void appendArguments(const std::vector<Argument>& argus, int size,
+ void appendArguments(const std::vector<Argument>& argus,
+ int size,
int dataId) {
size_ += size;
for (const auto& argu : argus) {
@@ -259,9 +259,7 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
class DoubleBuffer {
public:
- DoubleBuffer(DataProvider* dataPool,
- bool useGpu,
- int64_t batchSize = 0);
+ DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch);
@@ -310,7 +308,7 @@ public:
/**
* @brief create only used for unittest.
*/
- inline static DataProvider* create(const DataConfig &config,
+ inline static DataProvider* create(const DataConfig& config,
bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu);
}
@@ -462,7 +460,9 @@ protected:
*
* label[n] is the label for the n-th sample.
*/
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) = 0;
};
@@ -475,7 +475,9 @@ public:
protected:
void loadData(const std::string& fileName);
void loadDataFile(const std::string& fileName);
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size);
protected:
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 0689f90f3e..6c178e29ee 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
@@ -65,8 +64,8 @@ void DataProviderGroup::reset() {
provider_ = nullptr;
// shuffle file list
- std::shuffle(fileList_.begin(), fileList_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(
+ fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
startLoader();
DataProvider::reset();
@@ -113,8 +112,9 @@ void DataProviderGroup::startLoader() {
size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
std::vector<std::string> fileVec(fileList_.begin() + startPos,
fileList_.begin() + endPos);
- loader_->addJob([this, fileVec]()
- -> ProviderPtrType { return this->loadFile(fileVec); });
+ loader_->addJob([this, fileVec]() -> ProviderPtrType {
+ return this->loadFile(fileVec);
+ });
}
loader_->stopAddJob();
}
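
The reflowed lambda captures fileVec by value so the batch of file names outlives the loop iteration that created it. A generic sketch of the same hand-off with std::async standing in for the loader's job queue (illustrative only, not Paddle's MultiThreadWorker API):

    #include <future>
    #include <string>
    #include <vector>

    std::future<size_t> schedule(std::vector<std::string> files) {
      // 'files' is copied into the task, like fileVec in addJob above.
      return std::async(std::launch::async,
                        [files]() -> size_t { return files.size(); });
    }
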
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 8e4f53978a..51fb1f2666 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "paddle/utils/Util.h"
#include "MultiDataProvider.h"
#include "paddle/utils/Logging.h"
@@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
"MultiDataProvider";
subConfig.set_async_load_data(false);
}
- subDataProviders_[i] =
- std::unique_ptr<DataProvider>(DataProvider::create(subConfig,
- modelConfig,
- useGpu_));
+ subDataProviders_[i] = std::unique_ptr<DataProvider>(
+ DataProvider::create(subConfig, modelConfig, useGpu_));
}
}
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index b498ba6516..876467c04f 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 344644755f..0a7ff80246 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ProtoDataProvider.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/StringUtil.h"
@@ -23,7 +22,8 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "DataProviderGroup.h"
-P_DEFINE_double(memory_threshold_on_load_data, 1.0,
+P_DEFINE_double(memory_threshold_on_load_data,
+ 1.0,
"stop loading data when memory is not sufficient");
namespace paddle {
@@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
REGISTER_DATA_PROVIDER(proto_sequence_group,
DataProviderGroup<ProtoSequenceDataProvider>);
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu,
+ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
if (loadDataAll) {
@@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
}
slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
- memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids,
+ memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
+ ids,
sizeof(*ids) * slotSize);
slot.indices.push_back(slot.indices.back() + slotSize);
if (subSlotSize) {
@@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
slot.varDenseData[oldSize].data.resize(varDim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
- std::copy(values, values + varDim,
- slot.varDenseData[oldSize].data.data());
+ std::copy(
+ values, values + varDim, slot.varDenseData[oldSize].data.data());
#else
- memcpy(slot.varDenseData[oldSize].data.data(), values,
+ memcpy(slot.varDenseData[oldSize].data.data(),
+ values,
sizeof(real) * varDim);
#endif
slot.varDenseData[oldSize].dims.resize(
@@ -374,8 +377,9 @@ void ProtoDataProvider::reset() {
}
void ProtoDataProvider::shuffle() {
- std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(shuffledSequenceIds_.begin(),
+ shuffledSequenceIds_.end(),
+ ThreadLocalRandomEngine::get());
}
/*
@@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (!iidData()) {
ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
- numSequences + 1, /* useGpu= */ false);
+ numSequences + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
int pos = 0;
int i = 0;
@@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
switch (slotType) {
case SlotDef::VECTOR_DENSE: {
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR,
- false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
- std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
- dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1);
+ std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseFloatValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
break;
}
case SlotDef::INDEX: {
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
@@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (oldWidth < height) {
totalDim = width * height * depth;
}
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ totalDim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
}
} else {
- memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(),
+ memcpy(buf,
+ slots_[slot].varDenseData[dataPos[0]].data.data(),
sizeof(real) * totalDim);
}
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VAR_MDIM_INDEX: {
CHECK_EQ(size, 1);
size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
- IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ totalDim,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
- memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(),
+ memcpy(buf,
+ slots_[slot].varIndices[dataPos[0]].data(),
sizeof(int) * totalDim);
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
gpuArguments[i].sequenceStartPositions =
cpuArguments[i].sequenceStartPositions;
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
@@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
sampleLoop(op, size);
// current slot: sequenceStartPositions
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1,
+ /* useGpu= */ false);
switch (slotType) {
case SlotDef::VECTOR_SPARSE_VALUE:
@@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
};
int subSize = subSampleLoop(op, size, slot);
ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].subSequenceStartPositions, subSize + 1,
- false);
+ cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
int* currPosOfArgumentSubSeqStart =
- cpuArguments[slot].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[slot].subSequenceStartPositions->getMutableData(
+ false);
int64_t* subSeqs = dataSubPos.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int allSubSequenceLength = 0;
@@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::INDEX: {
// label slot
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
// fill labels
int* buf = cpuArguments[slot].ids->getData();
@@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VECTOR_DENSE: {
// copy values
size_t dim = header_.slot_defs(slot).dim();
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < cpuArguments.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 846dd7673a..ffdcc8fdc9 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -48,7 +47,8 @@ namespace paddle {
*/
class ProtoDataProvider : public DataProvider {
public:
- ProtoDataProvider(const DataConfig& config, bool useGpu,
+ ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -161,14 +161,16 @@ protected:
};
/**
- * @brief Special use for Proto data: instances should contain sparse-non-value slots
+ * @brief Special use for Proto data: instances should contain sparse-non-value
+ * slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
- ProtoSequenceDataProvider(const DataConfig& config, bool useGpu,
+ ProtoSequenceDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 3b1eb7e9ef..b8fca3cd7f 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -138,7 +137,8 @@ protected:
*
* @note this code depends on protobuf 2.4.0. There is nothing like
* CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
- * bytes has the object readed so far. Therefore, we calculated bytes ourselves.
+ * bytes has the object readed so far. Therefore, we calculated bytes
+ * ourselves.
*/
int approximateReadedBytes_;
};
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 1332c0ab63..bee6ca14a2 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PyDataProvider.h"
#include "paddle/utils/PythonUtil.h"
#include
#include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"
-
namespace paddle {
#ifndef PADDLE_NO_PYTHON
REGISTER_DATA_PROVIDER(py, PyDataProvider);
#endif
-PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
+PyDataProvider::PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), batchSize_(0) {
PyGuard guard;
@@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector& fileList) {
classInstance_ =
createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
CHECK(classInstance_) << "Create class instance failed.";
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("getHeader"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("getHeader"), NULL));
CHECK_PY(obj) << "Call function getHeader failed.";
std::string headerInfo =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() {
}
}
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
unsigned int dim = slot.dim;
slot.sampleNum = readT<unsigned int>(data, dataEnd);
@@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
float* dat = reinterpret_cast<float*>(data);
std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
#else
- memcpyWithCheck(slot.denseData.data(), data,
- sizeof(real) * dim * slot.sampleNum, dataEnd);
+ memcpyWithCheck(slot.denseData.data(),
+ data,
+ sizeof(real) * dim * slot.sampleNum,
+ dataEnd);
#endif
// PyDataProvider always provide data in float
data += sizeof(float) * dim * slot.sampleNum;
}
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
length = readT<unsigned int>(data, dataEnd);
slot.indices.push_back(length);
slot.sparseNonValueData.resize(length);
- memcpyWithCheck(slot.sparseNonValueData.data(), data,
- sizeof(unsigned int) * length, dataEnd);
+ memcpyWithCheck(slot.sparseNonValueData.data(),
+ data,
+ sizeof(unsigned int) * length,
+ dataEnd);
data += sizeof(unsigned int) * length;
}
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
}
}
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
@@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
data += sizeof(unsigned int) * slot.sampleNum;
}
-void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillStringSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT<unsigned int>(data, dataEnd);
for (unsigned int i = 0; i < slot.sampleNum; ++i) {
@@ -225,9 +233,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
}
for (size_t i = 0; i < sequenceNum; ++i) {
size_t begin = slot.sequenceStartPositions[i];
- size_t end = (i < sequenceNum - 1)
- ? slot.sequenceStartPositions[i + 1]
- : slot.sampleNum;
+ size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
+ : slot.sampleNum;
for (size_t ii = begin; ii < end; ++ii) {
slot.sampleSequenceIdVec.push_back(ii);
}
@@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
void PyDataProvider::reset() {
{ // Invoke PyDataProvider Reset
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("reset"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("reset"), NULL));
CHECK_PY(obj) << "Call function reset failed.";
}
@@ -270,15 +277,18 @@ void PyDataProvider::reset() {
void PyDataProvider::shuffle() {
// py shuffle
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
- const_cast("shuffle"), NULL));
+ PyObjectPtr obj(PyObject_CallMethod(
+ classInstance_.get(), const_cast("shuffle"), NULL));
CHECK_PY(obj) << "Call function shuffle failed.";
}
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
unsigned int dim = slot.dim;
- Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim,
+ Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
+ slot.sampleNum,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slotIndex].value->getData();
@@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
- FLOAT_VALUE, SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseFloatValueData.data(),
+ HPPL_STREAM_DEFAULT);
} else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
}
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments) {
- IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum,
+ IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
+ slot.sampleNum,
/*useGpu_*/ false);
int* buf = cpuArguments[slotIndex].ids->getData();
for (size_t i = 0; i < slot.sampleNum; ++i) {
@@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
}
}
-void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments) {
if (cpuArguments[slotIndex].strs) {
cpuArguments[slotIndex].strs->resize(slot.sampleNum);
@@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
PyGuard guard;
PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
const_cast("getNextBatch"),
- const_cast("i"), size));
+ const_cast("i"),
+ size));
CHECK_PY(obj) << "Call function getNextBatch failed.";
const std::string& samples =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
if (!iidData()) {
for (size_t j = 0; j < slotNum_; ++j) {
auto& slot = slots_[j];
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].sequenceStartPositions,
- slot.sequenceNum + 1, /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
+ slot.sequenceNum + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
std::copy(slot.sequenceStartPositions.begin(),
- slot.sequenceStartPositions.end(), buf);
+ slot.sequenceStartPositions.end(),
+ buf);
buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
if (slot.subSequenceStartPositions.size()) {
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].subSequenceStartPositions,
- slot.subSequenceNum + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
+ slot.subSequenceNum + 1,
+ /* useGpu= */ false);
int* buf =
- cpuArguments[j].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[j].subSequenceStartPositions->getMutableData(false);
std::copy(slot.subSequenceStartPositions.begin(),
- slot.subSequenceStartPositions.end(), buf);
+ slot.subSequenceStartPositions.end(),
+ buf);
buf[slot.subSequenceNum] = slot.sampleNum;
// check subSequenceStartPositions and sequenceStartPositions
cpuArguments[j].checkSubset();
@@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
cpuArguments[i].subSequenceStartPositions;
}
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
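
The fill*Slot bodies reformatted above all funnel raw sample bytes through memcpyWithCheck together with a dataEnd sentinel. For readers without the full file, here is a minimal sketch of such a bounds-checked copy helper, assuming it merely validates the source range before delegating to memcpy (the real helper is defined earlier in PyDataProvider.cpp and may differ in exact signature):

#include <cstddef>
#include <cstring>

#include "paddle/utils/Logging.h"  // assumed source of CHECK_LE

// Sketch only: copy `size` bytes from `src` to `dest`, failing fast when the
// source range would run past `dataEnd`, i.e. when the serialized buffer is
// shorter than its header claims.
static void memcpyWithCheck(void* dest,
                            const void* src,
                            size_t size,
                            const void* dataEnd) {
  CHECK_LE(reinterpret_cast<const char*>(src) + size,
           reinterpret_cast<const char*>(dataEnd))
      << "PyDataProvider sample data is truncated.";
  std::memcpy(dest, src, size);
}
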
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index 939d9cf725..6bb7c831fd 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -25,7 +24,8 @@ namespace paddle {
class PyDataProvider : public DataProvider {
public:
- PyDataProvider(const DataConfig& config, bool useGpu,
+ PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -48,21 +48,27 @@ protected:
void parseHeaderData(const std::string& headerData);
void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
- void fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+ void fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd);
void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillSlotsByStr(const std::string& samples);
- void handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseNonValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
- void handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector<Argument>& cpuArguments);
void resetSlots();
void loadData(const std::vector<std::string>& fileList);
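
All of the fill*Slot declarations above share a cursor-plus-sentinel calling convention (char*& data, const char* dataEnd), and the .cpp bodies read fixed-size headers through readT. A minimal sketch of that helper, assuming it reads one value, advances the cursor, and checks bounds the same way memcpyWithCheck does:

#include <cstring>

// Sketch only: read a single trivially-copyable T from the byte stream and
// advance the cursor. The CHECK_LE mirrors the bounds check visible in
// fillIndexSlot above.
template <typename T>
static T readT(char*& data, const char* dataEnd) {
  CHECK_LE(data + sizeof(T), dataEnd) << "Proto data is truncated.";
  T value;
  std::memcpy(&value, data, sizeof(T));  // memcpy avoids unaligned loads
  data += sizeof(T);
  return value;
}

// Usage, matching the call sites above:
//   slot.sampleNum = readT<unsigned int>(data, dataEnd);
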
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 90391a7c30..967fc9026a 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -34,7 +34,7 @@ namespace paddle {
namespace unittest {
static std::unique_ptr<std::function<void(size_t)>>
- OnPoolFilled;
+ OnPoolFilled;
namespace pydp2 {
@@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
*OnPoolFilled = callback;
}
-void clearOnPoolFilledHook() {
- OnPoolFilled.reset();
-}
+void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
} // namespace pydp2
} // namespace unittest
-
-
/**
* Slot type
*/
@@ -65,17 +61,13 @@ enum SlotType {
/**
* Sequence type
*/
-enum SeqType {
- SQT_NONE = 0,
- SQT_SEQ,
- SQT_SUBSEQ
-};
+enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
/**
* Cache Type.
*/
enum CacheType {
- NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
+ NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2,
// then cache all data in memory. Load data from
// memory in rest passes.
@@ -87,8 +79,8 @@ struct SlotHeader { // Slot Header will parse from python object's slots field.
SeqType seqType;
};
-inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) {
- os <<"Dim = " << header.dim << " Type = " << header.slotType
+inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
+ os << "Dim = " << header.dim << " Type = " << header.slotType
<< " SeqType = " << header.seqType;
return os;
}
@@ -158,7 +150,6 @@ protected:
SlotHeader* headerPtr_;
};
-
/**
* Py Data Provider Cache Interface.
*/
@@ -209,17 +200,13 @@ public:
PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu)
- :DataProvider(config, useGpu),
- callingContextCreated_(2) {
- if (PyArray_API == NULL)
- import_array();
+ : DataProvider(config, useGpu), callingContextCreated_(2) {
+ if (PyArray_API == NULL) import_array();
auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) {
kwargs = callPythonFuncRetPyObj(
- "paddle.trainer.PyDataProvider2",
- "deserialize_args",
- {args});
+ "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
}
py::DictHelper kwargsDict(kwargs);
@@ -245,40 +232,38 @@ public:
* Dtor
* @note will stop loading thread when destructing
*/
- virtual ~PyDataProvider2() {
- resetImpl(false);
- }
+ virtual ~PyDataProvider2() { resetImpl(false); }
private:
void createPyDataObj(const std::string& model,
const std::string& className,
const std::string& fileListName,
- PyObjectPtr && kwargs) {
- LOG(INFO) << "loading dataprovider " << model <<"::" << className;
+ PyObjectPtr&& kwargs // NOLINT
+ ) {
+ LOG(INFO) << "loading dataprovider " << model << "::" << className;
PyObjectPtr module = py::import(model);
PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
- PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
- className.c_str()));
+ PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
CHECK_PY(cls) << "load class " << className.c_str() << "error";
// If there are multiple python instance share same module, the PyObjectPtr
// only for instance will make python reference-count error.
//
// So here, we increase reference count manually.
- if (gModuleClsPtrs_.find((uintptr_t) module.get())
- != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
+ gModuleClsPtrs_.end()) {
// Multi instance use same module
Py_XINCREF(module.get());
Py_XINCREF(moduleDict.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) module.get());
+ gModuleClsPtrs_.insert((uintptr_t)module.get());
}
- if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
Py_XINCREF(cls.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) cls.get());
+ gModuleClsPtrs_.insert((uintptr_t)cls.get());
}
PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
@@ -294,8 +279,8 @@ private:
py::ObjectHelper self(this->instance_);
bool ok;
- this->skipShuffle_ = !self.getBoolAttr("should_shuffle",
- &ok /*isBoolType*/);
+ this->skipShuffle_ =
+ !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
if (!ok) {
this->skipShuffle_ = testing; // shuffle when is training, skip shuffle
// when is testing.
@@ -335,12 +320,12 @@ private:
PyObjectPtr headerPtrWrap(hdPtr);
py::ObjectHelper hd(headerPtrWrap);
header.dim = hd.getIntAttrWithError("dim");
- header.seqType = (SeqType) hd.getIntAttrWithError("seq_type");
- header.slotType = (SlotType) hd.getIntAttrWithError("type");
+ header.seqType = (SeqType)hd.getIntAttrWithError("seq_type");
+ header.slotType = (SlotType)hd.getIntAttrWithError("type");
}
DBG << "Data header size " << headers_.size();
- for (auto & header : headers_) {
+ for (auto& header : headers_) {
DBG << header;
}
cache_.reset(IPyDataProviderCache::create(
@@ -351,8 +336,7 @@ private:
loadFileList(fileListName, fileLists_);
PyObject* lst = PyList_New(fileLists_.size());
for (size_t i = 0; i < fileLists_.size(); ++i) {
- PyList_SET_ITEM(lst, i,
- PyString_FromString(fileLists_[i].c_str()));
+ PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
}
return PyObjectPtr(lst);
}
@@ -414,11 +398,12 @@ private:
CHECK(ok) << "CalcBatchSize must return int or long";
}
- if (this->loadThread_){ // wait poolActualSize < poolSize;
+ if (this->loadThread_) { // wait poolActualSize < poolSize;
std::unique_lock<std::mutex> l(mtx_);
- pushCV_.wait(l, [this, additionalBatchSize] {
- return this->poolActualSize_ < poolSize_;
- });
+ pushCV_.wait(l,
+ [this, additionalBatchSize] {
+ return this->poolActualSize_ < poolSize_;
+ });
}
{
@@ -487,14 +472,14 @@ private:
std::vector<std::string> fileLists_;
std::vector<SlotHeader> headers_;
static PyObjectPtr zeroTuple_;
- static std::unordered_set<uintptr_t > gModuleClsPtrs_;
+ static std::unordered_set<uintptr_t> gModuleClsPtrs_;
class PositionRandom {
public:
- inline explicit PositionRandom(bool skipRand):
- eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
+ inline explicit PositionRandom(bool skipRand)
+ : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
- inline size_t operator() (size_t len) {
+ inline size_t operator()(size_t len) {
if (!skipRand_) {
if (!dist_ || dist_->b() != len - 1) {
dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
@@ -525,32 +510,31 @@ public:
* Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
* select data from datapool.
*/
- void shuffle() {
- }
+ void shuffle() {}
/**
* Not limited size.
*/
- int64_t getSize() {
- return -1;
- }
+ int64_t getSize() { return -1; }
/**
* Loading a batch of data.
*/
- int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+ int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
std::lock_guard<std::mutex> guard(mutexForReset_);
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
- size_t size = (size_t) size_;
+ size_t size = (size_t)size_;
if (loadThread_) { // loading from thread should wait for data pool ready.
// but, loading from cache, cache object should ensure
// data pool ready.
std::unique_lock<std::mutex> l(mtx_);
- pullCV_.wait(l, [this, &size] {
- return this->poolActualSize_ >= std::max(size, this->minPoolSize_)
- || callingContexts_.empty();
- });
+ pullCV_.wait(l,
+ [this, &size] {
+ return this->poolActualSize_ >=
+ std::max(size, this->minPoolSize_) ||
+ callingContexts_.empty();
+ });
if (unittest::OnPoolFilled) {
(*unittest::OnPoolFilled)(this->poolActualSize_);
@@ -633,35 +617,35 @@ public:
cpuBatch.setSize(bsize);
auto& inArgs = cpuBatch.getStreams();
inArgs.resize(headers_.size());
- std::vector<std::unique_ptr<IFieldScanner> > scanners;
+ std::vector<std::unique_ptr<IFieldScanner>> scanners;
scanners.reserve(headers_.size());
for (auto& header : headers_) {
scanners.emplace_back(IFieldScanner::create(&header));
}
DBG << "Scanner created.";
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startPrepare(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->prepare(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishPrepare(inArgs[i]);
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startFill(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->fill(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishFill(inArgs[i]);
}
@@ -679,8 +663,8 @@ public:
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < headers_.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
} else {
@@ -690,31 +674,28 @@ public:
}
};
-std::unordered_set<uintptr_t > PyDataProvider2::gModuleClsPtrs_;
+std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_;
PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
/**
* Scanner for dense slot.
*/
-class DenseScanner: public IFieldScanner {
+class DenseScanner : public IFieldScanner {
public:
- explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {}
+ explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
/**
* Prepare.
* @param argument target argument
* @param obj each timestep of a sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++height_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim,
- false, false);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreate(
+ argument.value, height_, headerPtr_->dim, false, false);
height_ = 0;
}
@@ -723,24 +704,23 @@ public:
* @param argument
* @param obj
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim;
if (PyArray_Check(obj)) {
- auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
- if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
- real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
- auto sz = PyArray_SIZE((PyArrayObject*)obj);
- std::copy(data, data + sz, dat);
- } else {
- LOG(FATAL) << "You should yield float" << sizeof(real) * 8
- << " array";
- }
- } else {
- py::SequenceHelper s(obj);
- // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
- for (size_t i=0; i < headerPtr_->dim; ++i) {
- dat[i] = (real) s.getDouble(i);
- }
+ auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+ if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+ real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
+ auto sz = PyArray_SIZE((PyArrayObject*)obj);
+ std::copy(data, data + sz, dat);
+ } else {
+ LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
+ }
+ } else {
+ py::SequenceHelper s(obj);
+ // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+ for (size_t i = 0; i < headerPtr_->dim; ++i) {
+ dat[i] = (real)s.getDouble(i);
+ }
}
++height_;
}
@@ -752,20 +732,18 @@ private:
/**
* Scanner for index slot
*/
-class IndexScanner: public IFieldScanner {
+class IndexScanner : public IFieldScanner {
public:
- explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {}
+ explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
/**
* Prepare memory space.
*
* @note obj is a single timestep of sample
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++cnt_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
IVector::resizeOrCreate(argument.ids, cnt_, false);
cnt_ = 0;
}
@@ -773,9 +751,9 @@ public:
/**
* Fill one index to argument.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
bool ok;
- argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
+ argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
CHECK(ok) << "Cannot cast int " << py::repr(obj);
}
@@ -785,27 +763,25 @@ private:
class SparseNonValueScanner : public IFieldScanner {
public:
- explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr),
- nnz_(0),
- height_(0) {}
+ explicit SparseNonValueScanner(SlotHeader* ptr)
+ : IFieldScanner(ptr), nnz_(0), height_(0) {}
/**
* Prepare memory space
* @note obj is a timestep of one sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
++height_;
nnz_ += py::SequenceHelper(obj).size();
}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, NO_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
}
- virtual void startFill(Argument & argument) {
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ virtual void startFill(Argument& argument) {
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
smat->getRows()[0] = 0;
nnz_ = 0;
height_ = 1;
@@ -818,14 +794,14 @@ public:
virtual void fill(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
auto sz = s.size();
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
int* row = smat->getRows();
int* col = smat->getCols();
real* dat = smat->getData();
- row[height_] = row[height_-1] + (int)sz;
+ row[height_] = row[height_ - 1] + (int)sz;
for (decltype(sz) i = 0; i < sz; ++i) {
- setData(col+nnz_, dat+nnz_, s[i]);
+ setData(col + nnz_, dat + nnz_, s[i]);
++nnz_;
}
++height_;
@@ -839,7 +815,7 @@ protected:
* @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
* For sparse_value is a Tuple (int, float).
*/
- virtual void setData(int* col, real * dat, PyObject* obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
bool ok;
*col = py::castInt(obj, &ok);
CHECK(ok);
@@ -851,26 +827,25 @@ protected:
class SparseValueScanner : public SparseNonValueScanner {
public:
- explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {}
+ explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, FLOAT_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
}
protected:
- virtual void setData(int *col, real *dat, PyObject *obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
py::SequenceHelper s(obj);
SparseNonValueScanner::setData(col, dat, s[0]);
- *dat = (real) s.getDouble(1);
+ *dat = (real)s.getDouble(1);
}
};
/**
* Sequence Scanner. Scanner for sequence or sub-sequence.
*/
-class SequenceScanner: public IFieldScanner {
+class SequenceScanner : public IFieldScanner {
public:
/**
* Ctor
@@ -879,15 +854,18 @@ public:
* return a sequence start position or a sub-sequence
* start position.
*/
- SequenceScanner(std::unique_ptr<IFieldScanner>&& innerScanner,
- const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
- : IFieldScanner(nullptr), inner_(std::move(innerScanner)),
- cnt_(0), getSeqStartPos_(getSeqStartPos) {}
+ SequenceScanner(
+ std::unique_ptr<IFieldScanner>&& innerScanner,
+ const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
+ : IFieldScanner(nullptr),
+ inner_(std::move(innerScanner)),
+ cnt_(0),
+ getSeqStartPos_(getSeqStartPos) {}
/**
* Start prepare. Invoke inner->startPrepare too.
*/
- virtual void startPrepare(Argument &argument) {
+ virtual void startPrepare(Argument& argument) {
inner_->startPrepare(argument);
}
@@ -895,10 +873,10 @@ public:
* Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
* element of sequence obj.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->prepare(argument, s[i]);
}
}
@@ -906,7 +884,7 @@ public:
/**
* Finish prepare. invoke inner_->finishPrepare too.
*/
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
inner_->finishPrepare(argument);
}
@@ -914,7 +892,7 @@ public:
/**
* Start fill. invoke inner->startFill too.
*/
- virtual void startFill(Argument &argument) {
+ virtual void startFill(Argument& argument) {
getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
cnt_ = 1;
inner_->startFill(argument);
@@ -925,13 +903,13 @@ public:
* sequence obj. And set seqStartPos at same time. The seqStartPos will be
* calculated by getSeqStartPos callback passed in ctor.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
- getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
- (int)getSize(obj);
+ getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
+ (int)getSize(obj);
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->fill(argument, s[i]);
}
}
@@ -939,9 +917,7 @@ public:
/**
* Finish fill. will invoke inner->finishFill too.
*/
- virtual void finishFill(Argument &argument) {
- inner_->finishFill(argument);
- }
+ virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
protected:
size_t getSize(PyObject* obj) {
@@ -949,7 +925,7 @@ protected:
auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
if (sc) {
size_t sum = 0;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
sum += sc->getSize(s[i]);
}
return sum;
@@ -964,8 +940,7 @@ private:
std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
};
-
-IFieldScanner* IFieldScanner::create(SlotHeader *header) {
+IFieldScanner* IFieldScanner::create(SlotHeader* header) {
IFieldScanner* retv = nullptr;
switch (header->slotType) {
case ST_DENSE:
@@ -989,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
break;
case SQT_SUBSEQ:
retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.subSequenceStartPositions;
- });
- // fall through, not break;
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.subSequenceStartPositions;
+ });
+ // fall through, not break;
case SQT_SEQ:
retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.sequenceStartPositions;
- });
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.sequenceStartPositions;
+ });
break;
default:
LOG(FATAL) << "Not implemented";
@@ -1010,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
* No Cache Strategy. Will destruct old data immediately and load data from
* python every pass.
*/
-class NoCacheStrategy: public IPyDataProviderCache {
+class NoCacheStrategy : public IPyDataProviderCache {
public:
- virtual bool reset() {
- return true;
- }
+ virtual bool reset() { return true; }
- virtual void drop(std::deque<PyObjectPtr> *data) {
- data->clear();
- }
+ virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
- virtual std::deque<PyObjectPtr>* load() {
- return nullptr;
- }
+ virtual std::deque<PyObjectPtr>* load() { return nullptr; }
};
/**
@@ -1033,9 +1002,9 @@ public:
*/
class CacheOnePassInMemory : public IPyDataProviderCache {
public:
- CacheOnePassInMemory() : objPool_(new std::deque<PyObjectPtr>()),
- droppedPool_(new std::deque<PyObjectPtr>())
- {}
+ CacheOnePassInMemory()
+ : objPool_(new std::deque<PyObjectPtr>()),
+ droppedPool_(new std::deque<PyObjectPtr>()) {}
virtual bool reset() {
if (objPool_->empty() && droppedPool_->empty()) {
@@ -1048,25 +1017,22 @@ public:
}
}
- virtual void drop(std::deque<PyObjectPtr> *data) {
+ virtual void drop(std::deque<PyObjectPtr>* data) {
size_t orgSize = droppedPool_->size();
droppedPool_->resize(orgSize + data->size());
- for (size_t i=0; i < data->size(); ++i) {
+ for (size_t i = 0; i < data->size(); ++i) {
std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
}
data->clear();
}
- virtual std::deque<PyObjectPtr>* load() {
- return objPool_.get();
- }
+ virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
private:
- std::unique_ptr<std::deque<PyObjectPtr> > objPool_;
- std::unique_ptr<std::deque<PyObjectPtr> > droppedPool_;
+ std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
+ std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
};
-
IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
switch (ct) {
case NO_CACHE:
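
The scanner rewrites in PyDataProvider2.cpp keep the two-pass field protocol intact: a prepare pass measures the batch so buffers can be allocated once, then a fill pass copies the data. A hedged sketch of driving a single scanner over one field's column of Python objects (scanOneField is a hypothetical helper; the real loop above interleaves all fields of each sample):

#include <vector>

// Sketch only, mirroring the prepare/fill loop structure visible above.
void scanOneField(IFieldScanner* scanner,
                  Argument& arg,
                  const std::vector<PyObjectPtr>& rows) {
  scanner->startPrepare(arg);
  for (auto& row : rows) scanner->prepare(arg, row.get());  // pass 1: count
  scanner->finishPrepare(arg);  // allocate matrices / vectors once
  scanner->startFill(arg);
  for (auto& row : rows) scanner->fill(arg, row.get());  // pass 2: copy data
  scanner->finishFill(arg);
}
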
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index e397c71c87..8f7d2fb80e 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "Evaluator.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
@@ -33,7 +32,8 @@ private:
str.clear();
int prevLabel = -1;
for (std::vector