From 82801f24e5e314579b963ace057d80c949379b23 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Mon, 10 Jul 2017 10:06:50 +0800
Subject: [PATCH 01/19] save log probability for every generated word.

---
 .../RecurrentGradientMachine.cpp | 26 ++++++++++++++++++
 .../RecurrentGradientMachine.h   | 12 +++++++--
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 9a972466d6..41e0929959 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() {
   size_t numSequences = getGenBatchSize();

   resizeBootFrame(numSequences);
-  // We create only two sub-network in generation for alternate use.
-  // Thus, we can reduce total memory of output_ in layer forward.
+  // We create only two sub-networks in generation: one stores the states of
+  // all layers at the previous time step, and the other stores the states at
+  // the current time step.
   resizeOrCreateFrames(2);

   // outFrameLines_.size() > 1UL
@@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() {
   // init outArg
   size_t resultNum = generator_.config.num_results_per_sample();
-  IVector::resizeOrCreate(
-      generator_.outArg.ids,
-      generator_.config.max_num_frames() * numSequences * resultNum,
-      false);
+  size_t maxGenWordCount =
+      generator_.config.max_num_frames() * numSequences * resultNum;
+  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
   if (resultNum > 1) {
     CHECK_LE(resultNum, static_cast(generator_.config.beam_size()));
     Matrix::resizeOrCreate(generator_.outArg.in,
@@ -1012,6 +1012,11 @@
                            /* width */ resultNum,
                            false,
                            /* useGpu */ false);
+    Matrix::resizeOrCreate(generator_.outArg.value,
+                           /* height */ maxGenWordCount,
+                           /* width */ 1,
+                           false,
+                           /* useGpu */ false);
   }
   ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                 numSequences + 1,
@@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
   starts[0] = 0;
   if (numResults > 1) {
     real* probs = generator_.outArg.in->getData();
+    real* idsProb = generator_.outArg.value->getData();
+    size_t curPos = 0;
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
         Path& path = finalPaths_[i][j];
-        generator_.ids.push_back(path.ids.size());  // sequence size
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
         generator_.ids.insert(
             generator_.ids.end(), path.ids.begin(), path.ids.end());
         generator_.ids.push_back(-1);  // end of sequence
+
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;

         probs[i * numResults + j] = path.logProb;

         if (!j && dataArgsSize_) {
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index f245620cf6..fb3fc5877a 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -189,6 +189,11 @@ public:
      */
     std::vector ids;

+    /**
+     * @brief idsProb, log probability of each generated word.
+     */
+    std::vector idsProb;
+
     /**
      * @brief logProb, current probability of path.
*/ @@ -228,11 +233,13 @@ public: */ Path(Path& old, int newId, real logProb, int machineId, int topIndex) : ids(old.ids), + idsProb(old.idsProb), logProb(old.logProb + logProb), machineId(machineId), topIndex(topIndex), seqId(old.seqId) { ids.push_back(newId); + idsProb.push_back(logProb); if (!old.probHistory.empty()) { this->probHistory = old.probHistory; // probHistory store current prob, not sum @@ -411,8 +418,9 @@ protected: struct Generator { GeneratorConfig config; - std::vector ids; // store generated sequences - Argument outArg; // final output argument + std::vector ids; // store generated sequences + std::vector idsProb; // log probability of each generated word + Argument outArg; // final output argument }; bool generating_; Generator generator_; From 7c42aad412e634c7e8853d170c3f516fc3e6b2bf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 2 Aug 2017 16:07:51 +0800 Subject: [PATCH 02/19] Initialize Gradient Checker Add get_numeric_gradient API and its unittest. --- paddle/pybind/pybind.cc | 10 ++- .../paddle/v2/framework/tests/CMakeLists.txt | 3 +- .../v2/framework/tests/gradient_checker.py | 69 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 python/paddle/v2/framework/tests/gradient_checker.py diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index ee5f675e25..e79ad49b6d 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -77,8 +77,14 @@ PYBIND11_PLUGIN(core) { }) .def("set", paddle::pybind::PyTensorSetFromArray) .def("set", paddle::pybind::PyTensorSetFromArray) - .def("shape", - [](pd::Tensor& self) { return pd::vectorize(self.dims()); }); + .def("shape", [](pd::Tensor& self) { return pd::vectorize(self.dims()); }) + .def("set_float_element", + [](pd::Tensor& self, size_t offset, float f) { + self.data()[offset] = f; + }) + .def("get_float_element", [](pd::Tensor& self, size_t offset) -> float { + return self.data()[offset]; + }); py::class_(m, "Variable", R"DOC(Variable Class. 
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index cdaaa60674..494c517a9b 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,4 +13,5 @@ add_python_test(test_framework
     test_sigmoid_op.py
     test_softmax_op.py
     test_rowwise_add_op.py
-    test_network.py)
+    test_network.py
+    gradient_checker.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
new file mode 100644
index 0000000000..d7e5de8252
--- /dev/null
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -0,0 +1,69 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.create_op_creation_methods import op_creations
+import numpy
+import unittest
+
+
+def get_numeric_gradient(op,
+                         input_values,
+                         output_name,
+                         input_to_check,
+                         delta=1e-5,
+                         local_scope=None):
+    if local_scope is None:
+        local_scope = core.Scope()
+    for var_name in input_values:
+        var = local_scope.new_var(var_name)
+        tensor = var.get_tensor()
+        tensor.set_dims(input_values[var_name].shape)
+        tensor.alloc_float()
+        tensor.set(input_values[var_name])
+
+    for output in op.outputs():
+        local_scope.new_var(output).get_tensor()
+
+    op.infer_shape(local_scope)
+
+    for output in op.outputs():
+        local_scope.find_var(output).get_tensor().alloc_float()
+
+    cpu_ctx = core.DeviceContext.cpu_context()
+
+    def get_output():
+        op.run(local_scope, cpu_ctx)
+        return numpy.array(local_scope.find_var(output_name).get_tensor()).sum()
+
+    def product(dim):
+        return reduce(lambda a, b: a * b, dim, 1)
+
+    tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
+    tensor_size = product(tensor_to_check.get_dims())
+    gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32')
+    for i in xrange(tensor_size):
+        origin = tensor_to_check.get_float_element(i)
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        tensor_to_check.set_float_element(i, origin)  # restore old value
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+
+
+if __name__ == '__main__':
+
+    class GetNumericGradientTest(unittest.TestCase):
+        def test_add_op(self):
+            add_op = op_creations.add_two(X="X", Y="Y", Out="Z")
+            x = numpy.random.random((10, 1)).astype("float32")
+            y = numpy.random.random((10, 1)).astype("float32")
+
+            arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
+
+            self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)
+
+    unittest.main()

From fcc28ccea220ab2be166ea824dca3504dd3fc2c6 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 2 Aug 2017 16:18:59 +0800
Subject: [PATCH 03/19] Add comments

---
 .../v2/framework/tests/gradient_checker.py | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index d7e5de8252..e7fca05d6f 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -10,8 +10,24 @@ def get_numeric_gradient(op,
                          input_to_check,
                          delta=1e-5,
                          local_scope=None):
+    """
+    Get Numeric Gradient for an operator's input.
+
+    :param op: C++ operator instance, could be a network
+    :param input_values: The input variables. Should be a dictionary whose
+        key is a variable name and whose value is a numpy array.
+    :param output_name: The final output variable name.
+    :param input_to_check: The input variable to compute the gradient for.
+    :param delta: The perturbation value for the numeric gradient method. The
+        smaller delta is, the more accurate the result will be, but if delta
+        is too small, numerical stability problems can occur.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
     if local_scope is None:
         local_scope = core.Scope()
+
+    # Create all input variables in local_scope
     for var_name in input_values:
         var = local_scope.new_var(var_name)
         tensor = var.get_tensor()
@@ -19,14 +35,18 @@ def get_numeric_gradient(op,
         tensor.alloc_float()
         tensor.set(input_values[var_name])

+    # Create all output variables in local_scope
     for output in op.outputs():
-        local_scope.new_var(output).get_tensor()
+        if local_scope.find_var(output) is None:
+            local_scope.new_var(output).get_tensor()

     op.infer_shape(local_scope)

+    # allocate output memory
     for output in op.outputs():
         local_scope.find_var(output).get_tensor().alloc_float()

+    # TODO(yuyang18): Only CPU is supported now.
     cpu_ctx = core.DeviceContext.cpu_context()

     def get_output():
         op.run(local_scope, cpu_ctx)
         return numpy.array(local_scope.find_var(output_name).get_tensor()).sum()

From a0590004ceae3fb769a87b55e03833523451a34a Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 2 Aug 2017 16:28:41 +0800
Subject: [PATCH 04/19] Add __all__ in `gradient_checker.py`

---
 python/paddle/v2/framework/tests/gradient_checker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index e7fca05d6f..0ee7e8fb49 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -3,6 +3,8 @@ from paddle.v2.framework.create_op_creation_methods import op_creations
 import numpy
 import unittest

+__all__ = ['get_numeric_gradient']
+

 def get_numeric_gradient(op,

From a404d9abb4e57b9c903ad98becca45b43d7d92d6 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Wed, 2 Aug 2017 17:43:17 +0800
Subject: [PATCH 05/19] Add todo comments

---
 paddle/pybind/pybind.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index fa76da67f2..dc6f29d026 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -81,9 +81,11 @@ PYBIND11_PLUGIN(core) {
       .def("shape", [](pd::Tensor& self) { return pd::vectorize(self.dims()); })
       .def("set_float_element",
            [](pd::Tensor& self, size_t offset, float f) {
+             // TODO(yuyang18): Only support CPU now.
              self.data()[offset] = f;
            })
       .def("get_float_element", [](pd::Tensor& self, size_t offset) -> float {
+        // TODO(yuyang18): Only support CPU now.
         return self.data()[offset];
       });

From a560aee3180ea7ecea8af24c5bf9632b52023dd5 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Thu, 3 Aug 2017 12:55:19 +0800
Subject: [PATCH 06/19] Fix code style

---
 paddle/math/BaseMatrix.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 6db5965789..344cad496a 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -442,7 +442,8 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
 template
 void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); }

-DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, a = b < p1 ? 0 : (b > p2 ?
0 : 1)); +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); template void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ClipDerivative(p1, p2), b); From 8544bdbb4986081e392c831ea784b3134a86bac1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 3 Aug 2017 12:56:26 +0800 Subject: [PATCH 07/19] `clang-format` for BaseMatrix.cu --- paddle/math/BaseMatrix.cu | 985 ++++++++++++++++++++++++-------------- 1 file changed, 619 insertions(+), 366 deletions(-) diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 344cad496a..5435808fb7 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include +#include +#include #include "BaseMatrix.h" -#include "hl_matrix_ops.cuh" -#include "hl_matrix_base.cuh" -#include "hl_matrix_apply.cuh" -#include "SIMDFunctions.h" #include "MathFunctions.h" +#include "SIMDFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_ops.cuh" namespace paddle { const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; -template +template template int BaseMatrixT::applyUnary(Op op) { MatrixOffset offset(0, 0); @@ -34,9 +34,11 @@ int BaseMatrixT::applyUnary(Op op) { return 0; } -template +template template -int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, +int BaseMatrixT::applyUnary(Op op, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; int dimM = numRows; @@ -56,7 +58,7 @@ int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { CHECK(height_ == b.height_ && width_ == b.width_) @@ -67,18 +69,23 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset) { +int BaseMatrixT::applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset, bAsRowVector, bAsColVector) { +int BaseMatrixT::applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; @@ -91,8 +98,8 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, T* A = data_; T* B = b.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); if (!bAsRowVector::value && !bAsColVector::value) { @@ -115,7 +122,7 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, return 0; } -template +template template int BaseMatrixT::applyTernary(Op op, BaseMatrixT& 
b, BaseMatrixT& c) { CHECK_EQ(height_, b.height_); @@ -129,21 +136,29 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, MatrixOffset& offset) { applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, MatrixOffset& offset, - cAsRowVector, cAsColVector) { +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -160,10 +175,10 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* B = b.data_; T* C = c.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -180,21 +195,21 @@ int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, } if (true == useGpu_) { - hl_gpu_apply_ternary_op - ( + hl_gpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } else { - hl_cpu_apply_ternary_op - ( + hl_cpu_apply_ternary_op( op, A, B, C, dimM, dimN, lda, ldb, ldc); } return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, BaseMatrixT& d) { CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); @@ -209,10 +224,14 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, - BaseMatrixT& d, int numRows, int numCols, +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -234,12 +253,12 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, T* C = c.data_; T* D = d.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); - CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, - offset.dRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS( + D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -250,22 
+269,29 @@ int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimN + offset.dCol_, d.width_); if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, - ldc, ldd); + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); } return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - int numRows, int numCols, MatrixOffset& offset, - aAsRowVector, aAsColVector) { +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); int ld = stride_; @@ -273,10 +299,10 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { @@ -297,12 +323,21 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, - BaseMatrixT& c, int numRows, int numCols, - MatrixOffset& offset, aAsRowVector, +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, c.useGpu_); @@ -314,28 +349,28 @@ int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, T* dst = data_; T* B = b.data_; T* C = c.data_; - CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, - offset.aRow_); - CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, - offset.bRow_); - CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, - offset.cRow_); + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_gpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } else { - hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, - ldb, C, ldc); + hl_cpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); } } else if (!aAsRowVector::value && aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_gpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } else { - hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, - ldb, C, ldc); + hl_cpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); } } else { LOG(FATAL) << "not supported"; @@ -350,15 +385,19 @@ int BaseMatrixT::aggregate(Agg agg, 
Op op, Saver sv, BaseMatrixT& b, */ DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { applyUnary(unary::Neg()); } +template +void BaseMatrixT::neg() { + applyUnary(unary::Neg()); +} DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template<> -void BaseMatrixT::exp2() { applyUnary(unary::Exp()); } +template <> +void BaseMatrixT::exp2() { + applyUnary(unary::Exp()); +} DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template<> +template <> void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); @@ -368,30 +407,42 @@ void BaseMatrixT::log2() { } DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template<> -void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } +template <> +void BaseMatrixT::sqrt2() { + applyUnary(unary::Sqrt()); +} DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { applyUnary(unary::Square()); } +template +void BaseMatrixT::square2() { + applyUnary(unary::Square()); +} DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } +template +void BaseMatrixT::reciprocal2() { + applyUnary(unary::Reciprocal()); +} DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); -template -void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } +template +void BaseMatrixT::abs2() { + applyUnary(unary::Abs()); +} DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } +template +void BaseMatrixT::sign2() { + applyUnary(unary::Sign()); +} DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { applyUnary(unary::Zero()); } +template +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} -template +template void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { int numRows = height_; int numCols = numColumns; @@ -400,11 +451,13 @@ void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { } DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { applyUnary(unary::One()); } +template +void BaseMatrixT::one() { + applyUnary(unary::One()); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template<> +template <> void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); @@ -414,51 +467,67 @@ void BaseMatrixT::pow2(real p) { } DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); } +template +void BaseMatrixT::subScalar(T p) { + applyUnary(unary::SubScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); } +template +void BaseMatrixT::mulScalar(T p) { + applyUnary(unary::MulScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); } +template +void BaseMatrixT::divScalar(T p) { + applyUnary(unary::DivScalar(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); } +template +void BaseMatrixT::assign(T p) { + applyUnary(unary::Assign(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); } +template +void BaseMatrixT::add(T p) { + applyUnary(unary::Add(p)); +} DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template 
-void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); } +template +void BaseMatrixT::add(T p1, T p2) { + applyUnary(unary::Add2(p1, p2)); +} -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, + TWO_PARAMETER, a = a < p1 ? p1 : (a > p2 ? p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); } +template +void BaseMatrixT::clip(T p1, T p2) { + applyUnary(unary::Clip(p1, p2)); +} -DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, - a = b < p1 ? 0 : (b > p2 ? 0 : 1)); -template +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, + TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +template void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ClipDerivative(p1, p2), b); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, + ONE_PARAMETER, a = a > p ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThanScalar(T p) { applyUnary(unary::BiggerThanScalar(p)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, - a = a > p ? a : p); -template +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); +template void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); } @@ -469,12 +538,12 @@ void BaseMatrixT::downClip(T p) { */ DEFINE_MATRIX_BINARY_OP(Add, a += b); -template +template void BaseMatrixT::add(BaseMatrixT& b) { applyBinary(binary::Add(), b); } -template<> +template <> void BaseMatrixT::add(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Add(), b); @@ -485,7 +554,7 @@ void BaseMatrixT::add(BaseMatrixT& b) { } } -template +template void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -504,43 +573,53 @@ void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { } } -template +template void BaseMatrixT::addP2P(BaseMatrixT& b) { T* A = data_; T* B = b.data_; int dimM = height_; int dimN = width_; - hl_gpu_apply_binary_op, 0, 0> - (binary::Add(), A, B, dimM, dimN, dimN, dimN); + hl_gpu_apply_binary_op, 0, 0>( + binary::Add(), A, B, dimM, dimN, dimN, dimN); } -template +template void BaseMatrixT::addColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(), + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::addRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template<> +template <> void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); @@ -550,36 +629,45 @@ void BaseMatrixT::pow2(BaseMatrixT& b, real p) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { 
applyBinary(binary::Add2(p1, p2), b); } -template +template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add1(scale), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::Add1(scale), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } +template +void BaseMatrixT::sub(BaseMatrixT& b) { + applyBinary(binary::Sub(), b); +} DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } +template +void BaseMatrixT::relu(BaseMatrixT& b) { + applyBinary(binary::Relu(), b); +} DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template +template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } @@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); -template<> +template <> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } @@ -599,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP( a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); -template<> +template <> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? b : p2); -template +template void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, + TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template +template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template +template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template +template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template<> +DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template <> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template +template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, - b = p1 * - (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template<> +DEFINE_MATRIX_BINARY_PARAMETER_OP( + ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template <> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, + TWO_PARAMETER, a *= p2 * (p1 - b * b)); -template +template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template +template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } +template +void BaseMatrixT::abs2(BaseMatrixT& b) { + applyBinary(binary::Abs(), b); +} DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template +template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } -DEFINE_MATRIX_BINARY_OP( - Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template<> +DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template <> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); @@ -723,31 +814,31 @@ void BaseMatrixT::sigmoid(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template +template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template +template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 
1.0f : -1.0f); -template +template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template<> +template <> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template<> +template <> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); @@ -757,13 +848,13 @@ void BaseMatrixT::log2(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template<> +template <> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template<> +template <> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); @@ -775,37 +866,37 @@ void BaseMatrixT::invSqrt(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template +template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template +template void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template +template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template +template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template +template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template +template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } @@ -817,20 +908,20 @@ void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); -template<> +template <> void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template +template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template<> +template <> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { @@ -858,70 +949,73 @@ void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? 
-1.0 / b : 1.0 / (1.0 - b)); -template +template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template +template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, c = p2 * c - p1 * d * (b + p3 * a); a += c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } @@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } -template<> +template <> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { - simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, + simd::decayL1(this->data_, + this->data_, + lr.data_, + learningRate * decayRate, height_ * width_); } } @@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? 
(a + lambda) : 0); -template +template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } -template<> +template <> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { - simd::decayL1(this->data_, this->data_, learningRate * decayRate, - height_ * width_); + simd::decayL1( + this->data_, this->data_, learningRate * decayRate, height_ * width_); } } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, + ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); -template +template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); @@ -980,32 +1078,33 @@ void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { } } -template +template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, + TWO_PARAMETER, a = (b + p1) / (c + p2)); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } @@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); -template<> +template <> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); a = (a / (1 + a) - d)); -template<> + a = exp(a); + a = (a / (1 + a) - d)); +template <> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ? -THRESHOLD : b; a = log(1 + exp(x)) - c * x); -template<> +template <> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } @@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; - x = exp(x); a = x / (1 + x) - c); -template<> + x = exp(x); + a = x / (1 + x) - c); +template <> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 
1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1073,25 +1174,34 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); -template +template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, + ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, - BaseMatrixT& c, T p) { +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, - numCols, offset, false_type(), true_type() /*cAsColVector*/); + b.applyTernary(ternary::BinaryClassificationError(p), + c, + *this, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template<> +template <> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, @@ -1099,127 +1209,148 @@ void BaseMatrixT::binaryClassificationError(size_t destCol, MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::classificationError(p), - base::binary::add(), b, c, numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::classificationError(p), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, - T p2, T p3) { +template +void BaseMatrixT::add3( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template +template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template +template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, + THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); -template -void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, - T p1, T p2, T p3) { +template +void BaseMatrixT::addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template +template void 
BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, + TWO_PARAMETER, a = p1 * a + p2 * b * b); -template +template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, + TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, + THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, - T p3) { +template +void BaseMatrixT::reciprocalSum( + BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, + TWO_PARAMETER, a = 1 / (p1 * b + p2)); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, + BaseMatrixT& c, + T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, + TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); -template +template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, + TWO_PARAMETER, a *= p1 * b + p2 * c); -template +template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template +template void BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, + TWO_PARAMETER, a = p1 * a + p2 * b * c); -template +template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template +template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); @@ -1230,7 +1361,7 @@ void BaseMatrixT::assign(BaseMatrixT& b) { } } -template +template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -1250,24 +1381,31 @@ void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { } DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template +template void 
BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); + applyBinary(binary::DeepSwap(), b); } -template<> +template <> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, false_type(), + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), true_type() /*aAsColVector*/); } -template +template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { @@ -1290,17 +1428,24 @@ void BaseMatrixT::rowDotMul2(size_t destCol, } } -template<> +template <> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, - numRows, numCols, offset, true_type() /*aAsRowVector*/, + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1321,16 +1466,22 @@ void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template +template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - true_type() /*cAsRowVector*/, false_type()); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /*cAsRowVector*/, + false_type()); } -template +template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1350,16 +1501,22 @@ void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } -template +template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1379,52 +1536,82 @@ void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, - true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - 
true_type() /* cAsRowVector */, false_type() /* cAsColVector */); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template +template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowAdd(p), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template<> +template <> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, - false_type(), true_type() /*cAsColVector*/); + applyTernary(ternary::RowPow(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; @@ -1441,44 +1628,64 @@ void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template +template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - true_type() /* bAsRowVector */, false_type()); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); } -template +template void BaseMatrixT::mulColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template +template void BaseMatrixT::divColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), b, numRows, numCols, offset, - false_type(), true_type() /* bAsColVector */); + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1486,13 +1693,20 @@ int BaseMatrixT::applyRow(Agg agg, 
BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1500,16 +1714,25 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyRow(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1521,10 +1744,10 @@ int BaseMatrixT::applyRow( return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow( + Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); size_t numRows = b.height_; size_t numCols = b.width_; @@ -1532,16 +1755,27 @@ int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, CHECK_EQ(width_, 1UL); CHECK_EQ(c.height_, numRows); CHECK_EQ(c.width_, numCols); - aggregate(agg, op, sv, - b, c, numRows, numCols, offset, - false_type(), true_type() /*aAsColVector*/); + aggregate(agg, + op, + sv, + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); return 0; } -template<> +template <> template -int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, - BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c) { if (scaleDest != 0) { applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); } else { @@ -1553,7 +1787,7 @@ int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1561,13 +1795,20 @@ int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, - numCols, offset, true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1575,16 +1816,25 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, - true_type() /*aAsRowVector*/, false_type()); + aggregate(agg, + 
base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); return 0; } -template<> +template <> template -int BaseMatrixT::applyCol( - Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { +int BaseMatrixT::applyCol(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { if (scaleDest != 0) { applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1596,48 +1846,51 @@ int BaseMatrixT::applyCol( return 0; } -template<> +template <> void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { applyRow(aggregate::sum(), scaleDest, scaleSum, b); } -template<> +template <> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } -template<> +template <> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } -template<> +template <> void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { applyCol(aggregate::sum(), scaleDest, scaleSum, b); } -template<> -void BaseMatrixT::sumOfSquaredDiffs( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::squaredDiff(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow( + aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); } -template<> -void BaseMatrixT::sumOfProducts( - BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), - scaleDest, scaleSum, b, c); +template <> +void BaseMatrixT::sumOfProducts(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); } template class BaseMatrixT; From 133541ee41624e8b25b885fb1a2f11cbdd17299e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 3 Aug 2017 15:00:21 +0800 Subject: [PATCH 08/19] Merge codes --- python/paddle/v2/framework/tests/gradient_checker.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py index 0ee7e8fb49..4022de1c40 100644 --- a/python/paddle/v2/framework/tests/gradient_checker.py +++ b/python/paddle/v2/framework/tests/gradient_checker.py @@ -10,7 +10,7 @@ def get_numeric_gradient(op, input_values, output_name, input_to_check, - delta=1e-5, + delta=1e-2, local_scope=None): """ Get Numeric Gradient for an operator's input. @@ -34,8 +34,8 @@ def get_numeric_gradient(op, var = local_scope.new_var(var_name) tensor = var.get_tensor() tensor.set_dims(input_values[var_name].shape) - tensor.alloc_float() - tensor.set(input_values[var_name]) + tensor.alloc_float(core.CPUPlace()) + tensor.set(input_values[var_name], core.CPUPlace()) # Create all output variable in local_scope for output in op.outputs(): @@ -46,10 +46,10 @@ def get_numeric_gradient(op, # allocate output memory for output in op.outputs(): - local_scope.find_var(output).get_tensor().alloc_float() + local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace()) # TODO(yuyang18): Only CPU is support now. 
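    # A numeric gradient like the one computed here typically follows the
    # central-difference rule: nudge one input element at a time and watch
    # how the summed output moves. A minimal sketch of that rule, assuming
    # a scalar-valued callable `f` and a flat numpy array `x` (hypothetical
    # names, not identifiers from this file):
    #
    #   def central_difference(f, x, delta=1e-2):
    #       grad = numpy.zeros_like(x)
    #       for i in range(x.size):
    #           orig = x.flat[i]
    #           x.flat[i] = orig + delta
    #           pos = f(x)                  # f(x + delta * e_i)
    #           x.flat[i] = orig - delta
    #           neg = f(x)                  # f(x - delta * e_i)
    #           x.flat[i] = orig            # restore before the next element
    #           grad.flat[i] = (pos - neg) / (2 * delta)
    #       return grad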
- cpu_ctx = core.DeviceContext.cpu_context() + cpu_ctx = core.DeviceContext.create(core.CPUPlace()) def get_output(): op.run(local_scope, cpu_ctx) @@ -85,7 +85,6 @@ if __name__ == '__main__': y = numpy.random.random((10, 1)).astype("float32") arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X') - self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2) unittest.main() From b58725bd5181fa9c5ada0fb94e553258dc1b25b0 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 4 Aug 2017 11:07:47 +0800 Subject: [PATCH 09/19] Add cpplint for *.h and cuda *.cu --- cmake/generic.cmake | 16 +++++++++++++++- paddle/framework/ddim.h | 9 +++------ paddle/framework/grad_op_builder.h | 20 +++++++++++++++++--- paddle/framework/op_registry.h | 6 +++--- paddle/framework/operator.h | 2 +- paddle/math/BaseMatrix.cu | 3 ++- paddle/memory/CMakeLists.txt | 4 ++-- paddle/memory/detail/buddy_allocator.h | 2 +- paddle/memory/detail/meta_cache.h | 8 ++++---- paddle/memory/memory.h | 2 +- paddle/operators/add_op.cu | 14 ++++++++++++++ paddle/operators/cross_entropy_op.cu | 16 +++++++++++++++- paddle/operators/fill_zeros_like_op.cu | 16 +++++++++++++++- paddle/operators/mean_op.cu | 16 +++++++++++++++- paddle/operators/mean_op.h | 2 +- paddle/operators/mul_op.cu | 2 +- paddle/operators/recurrent_op.h | 20 ++++++++------------ paddle/operators/rowwise_add_op.cu | 14 ++++++++++++++ paddle/operators/sgd_op.cu | 16 +++++++++++++++- paddle/operators/sigmoid_op.cu | 14 ++++++++++++++ paddle/operators/softmax_op.cc | 1 + paddle/operators/softmax_op.cu | 14 ++++++++++++++ paddle/platform/device_context.h | 8 ++++---- paddle/platform/dynload/cublas.cc | 14 ++++++++++++++ paddle/platform/dynload/cudnn.cc | 16 +++++++++++++++- paddle/platform/dynload/curand.cc | 21 ++++++++++++++++++--- paddle/platform/place.h | 2 +- paddle/string/piece.h | 4 ++-- 28 files changed, 230 insertions(+), 52 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 534be0abe2..41b9b59289 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -187,7 +187,13 @@ function(cc_library TARGET_NAME) endif() # cpplint code style - add_style_check_target(${TARGET_NAME} ${cc_library_SRCS}) + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) else(cc_library_SRCS) if (cc_library_DEPS) @@ -239,6 +245,14 @@ function(nv_library TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) else(nv_library_SRCS) if (nv_library_DEPS) merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 9fcc657edc..5aa5af0c19 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -25,18 +25,15 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -namespace { -typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, - Dim<8>, Dim<9>> - DDimVar; -} - /** * \brief A dynamically sized dimension. * * The number of dimensions must be between [1, 9]. */ struct DDim { + typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, + Dim<8>, Dim<9>> + DDimVar; DDimVar var; DDim() : var(Dim<1>()) {} diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/grad_op_builder.h index cc7a76f372..973c12658c 100644 --- a/paddle/framework/grad_op_builder.h +++ b/paddle/framework/grad_op_builder.h @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once #include "paddle/framework/op_proto.pb.h" @@ -10,8 +24,8 @@ class OpRegistry; enum InOutType { IN, OUT }; struct OpInOutArg { - OpInOutArg(const std::string& proto_name, const InOutType& type, - bool needed_in_grad, size_t begin_idx, size_t end_idx) + explicit OpInOutArg(const std::string& proto_name, const InOutType& type, + bool needed_in_grad, size_t begin_idx, size_t end_idx) : proto_name_(proto_name), type_(type), needed_in_grad_(needed_in_grad), @@ -29,7 +43,7 @@ class GradOpBuilder { using VarIndexMap = std::unordered_map; public: - GradOpBuilder(const OperatorBase& op) : op_(op) {} + explicit GradOpBuilder(const OperatorBase& op) : op_(op) {} OperatorBase* Build(); private: diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 3e72e39126..228943d819 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -315,7 +315,7 @@ class OpRegistry { static std::unordered_map& protos() { static std::unordered_map protos_; return protos_; - }; + } static std::unordered_map& grad_ops() { static std::unordered_map grad_ops_; @@ -337,7 +337,7 @@ class OpRegistry { static std::unordered_map& op_checkers() { static std::unordered_map op_checkers_; return op_checkers_; - }; + } static void GenerateTempVariableName(OperatorBase* op) { static std::atomic gUniqId(0UL); @@ -354,7 +354,7 @@ class OpRegistry { template class OpRegisterHelper { public: - OpRegisterHelper(const char* op_type) { + explicit OpRegisterHelper(const char* op_type) { OpRegistry::RegisterOp(op_type); } }; diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 5543510348..09a116ba75 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -280,7 +280,7 @@ class OperatorWithKernel : public OperatorBase { platform::Place place_; OpKernelKey() = default; - OpKernelKey(const platform::DeviceContext& dev_ctx) { + explicit OpKernelKey(const platform::DeviceContext& dev_ctx) { place_ = dev_ctx.GetPlace(); } diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 6db5965789..f60d9cc5c4 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -442,7 +442,8 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, template void BaseMatrixT::clip(T p1, T p2) { 
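  // Element-wise clamp into [p1, p2]: a = a < p1 ? p1 : (a > p2 ? p2 : a),
  // per the Clip op named in the hunk header above. The reformatted
  // ClipDerivative op just below stores the indicator of p1 <= b <= p2
  // into a, i.e. the (sub)derivative of this clamp with respect to its
  // input.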
applyUnary(unary::Clip(p1, p2)); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); template void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ClipDerivative(p1, p2), b); diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 8035d93bfe..eb2f5cb66a 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) -cc_library(memory SRCS memory.cc) -cc_library(memcpy SRCS memcpy.cc DEPS device_context) +cc_library(memory SRCS memory.h memory.cc) +cc_library(memcpy SRCS memcpy.h memcpy.cc DEPS device_context) cc_library(paddle_memory DEPS diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 4fa3fb0ee5..9c41378483 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -39,7 +39,7 @@ class BuddyAllocator { public: void* Alloc(size_t unaligned_size); - void Free(void*); + void Free(void* ptr); size_t Used(); public: diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h index ca0789779e..cf58156442 100644 --- a/paddle/memory/detail/meta_cache.h +++ b/paddle/memory/detail/meta_cache.h @@ -33,17 +33,17 @@ namespace detail { */ class MetadataCache { public: - MetadataCache(bool uses_gpu); + explicit MetadataCache(bool uses_gpu); public: /*! \brief Load the associated metadata for the specified memory block. */ - Metadata load(const MemoryBlock*); + Metadata load(const MemoryBlock* memory_block); /*! \brief Store the associated metadata for the specified memory block. */ - void store(MemoryBlock*, const Metadata&); + void store(MemoryBlock* memory_block, const Metadata& meta_data); /*! \brief Indicate that the specified metadata will no longer be used. */ - void invalidate(MemoryBlock*); + void invalidate(MemoryBlock* memory_block); public: MetadataCache(const MetadataCache&) = delete; diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 44f567caf9..72351b9dfa 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -68,7 +68,7 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - PODDeleter(Place place) : place_(place) {} + explicit PODDeleter(Place place) : place_(place) {} void operator()(T* ptr) { Free(place_, static_cast(ptr)); } private: diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index f961b37565..9bd08634da 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/add_op.h" diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 926a0c616b..2f453f8379 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,5 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #define EIGEN_USE_GPU #include "paddle/operators/cross_entropy_op.h" REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - ops::OnehotCrossEntropyOpKernel); \ No newline at end of file + ops::OnehotCrossEntropyOpKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu index 55ad58f4f1..ed1068219c 100644 --- a/paddle/operators/fill_zeros_like_op.cu +++ b/paddle/operators/fill_zeros_like_op.cu @@ -1,6 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include "paddle/framework/op_registry.h" #include "paddle/operators/fill_zeros_like_op.h" REGISTER_OP_GPU_KERNEL( fill_zeros_like, - paddle::operators::FillZerosLikeKernel); \ No newline at end of file + paddle::operators::FillZerosLikeKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu index e15de2fd0d..8b97b0154c 100644 --- a/paddle/operators/mean_op.cu +++ b/paddle/operators/mean_op.cu @@ -1,6 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #define EIGEN_USE_GPU #include "paddle/operators/mean_op.h" REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel); -REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index a89cb422f9..9234d4dff8 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -47,7 +47,7 @@ public: T ig_size = (T)framework::product(IG->dims()); - EigenVector::Flatten(*IG).device(*(context.GetEigenDevice())) = + EigenVector::Flatten(*IG).device((context.GetEigenDevice())) = EigenScalar::From(*OG) / ig_size; } }; diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index dc92367016..1dc04c4297 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -15,4 +15,4 @@ #define EIGEN_USE_GPU #include "paddle/operators/mul_op.h" -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h index 2a0964fff3..35e6d9d50d 100644 --- a/paddle/operators/recurrent_op.h +++ b/paddle/operators/recurrent_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -using namespace paddle::framework; +using namespace paddle::framework; // NOLINT namespace rnn { @@ -94,7 +94,7 @@ void InitArgument(const ArgumentName& name, Argument* arg); }; // namespace rnn // The sequence format in RecurrentOp is Tensor now. -// TODO: +// TODO(Yan Chunwei): // 1. No-padding computing for sequences with indifinite length in one batch. // 2. Hierarchical RNN for sequence with sub-sequence. // 3. Internal Memory. @@ -172,12 +172,10 @@ public: /** * InferShape must be called before Run. */ - virtual void InferShape(const Scope& scope) const override { - alg_.InferShape(scope); - } + void InferShape(const Scope& scope) const override { alg_.InferShape(scope); } - virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const override { alg_.Run(scope, dev_ctx); } @@ -194,12 +192,10 @@ public: /** * InferShape must be called before Run. */ - virtual void InferShape(const Scope& scope) const override { - alg_.InferShape(scope); - } + void InferShape(const Scope& scope) const override { alg_.InferShape(scope); } - virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, + const platform::DeviceContext& dev_ctx) const override { alg_.Run(scope, dev_ctx); } diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu index 82338ceccc..f76faa0a3a 100644 --- a/paddle/operators/rowwise_add_op.cu +++ b/paddle/operators/rowwise_add_op.cu @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #define EIGEN_USE_GPU #include "paddle/operators/rowwise_add_op.h" diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index d79258cbf1..72629ccfbb 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,4 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #define EIGEN_USE_GPU #include "paddle/operators/sgd_op.h" -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index c9d11a2e1f..2123b17e4b 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #define EIGEN_USE_GPU #include "paddle/operators/sigmoid_op.h" diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 5b59fad7d5..70ac1b4c1a 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -11,6 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "paddle/operators/softmax_op.h" namespace paddle { diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index ddf8f6e913..d209eb82a4 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + #define EIGEN_USE_GPU #include "paddle/framework/op_registry.h" #include "paddle/operators/softmax_op.h" diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 2038fafe2e..48b9f5dcb5 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -40,7 +40,7 @@ class DeviceContext { class CPUDeviceContext : public DeviceContext { public: CPUDeviceContext(); - CPUDeviceContext(CPUPlace); + explicit CPUDeviceContext(CPUPlace); virtual ~CPUDeviceContext() {} Eigen::DefaultDevice* eigen_device() const; @@ -55,7 +55,7 @@ class CPUDeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext { public: - explicit CUDADeviceContext(GPUPlace); + CUDADeviceContext(GPUPlace); // NOLINT virtual ~CUDADeviceContext(); /*! \brief Wait for all operations completion in the stream. */ @@ -69,10 +69,10 @@ class CUDADeviceContext : public DeviceContext { // clang-format off /*! \brief Return cublas handle in the device context. */ - cublasHandle_t cublas_handle (); + cublasHandle_t cublas_handle(); /*! \brief Return cudnn handle in the device context. */ - cudnnHandle_t cudnn_handle (); + cudnnHandle_t cudnn_handle(); /*! \brief Return curand handle in the device context. */ curandGenerator_t curand_generator(); diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc index 4e3dfdaefb..9cd2a1f565 100644 --- a/paddle/platform/dynload/cublas.cc +++ b/paddle/platform/dynload/cublas.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 8b5e15b5ef..d3e4cb567d 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -25,4 +39,4 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); } // namespace dynload } // namespace platform -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc index 5c1fab992c..d05dd88126 100644 --- a/paddle/platform/dynload/curand.cc +++ b/paddle/platform/dynload/curand.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include namespace paddle { @@ -10,6 +24,7 @@ void *curand_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); -} -} -} \ No newline at end of file + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 7cead18388..a37ad38a8f 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -32,7 +32,7 @@ struct CPUPlace { struct GPUPlace { GPUPlace() : GPUPlace(0) {} - GPUPlace(int d) : device(d) {} + GPUPlace(int d) : device(d) {} // NOLINT // needed for variant equality comparison inline bool operator==(const GPUPlace &o) const { return device == o.device; } diff --git a/paddle/string/piece.h b/paddle/string/piece.h index 0272529d1c..3b887490b5 100644 --- a/paddle/string/piece.h +++ b/paddle/string/piece.h @@ -39,8 +39,8 @@ public: // size_ is 0. Piece(); Piece(const char* d, size_t n); - Piece(const char* d); - Piece(const std::string& s); + Piece(const char* d); // NOLINT + Piece(const std::string& s); // NOLINT const char* data() const { return data_; } size_t len() const { return size_; } From 051fe1724f94397fc58df5f913c9b427fb63ebdb Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 4 Aug 2017 11:12:48 +0800 Subject: [PATCH 10/19] remove duplicate cpplint --- paddle/memory/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index eb2f5cb66a..8035d93bfe 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) -cc_library(memory SRCS memory.h memory.cc) -cc_library(memcpy SRCS memcpy.h memcpy.cc DEPS device_context) +cc_library(memory SRCS memory.cc) +cc_library(memcpy SRCS memcpy.cc DEPS device_context) cc_library(paddle_memory DEPS From c6186120c32c98fb91609a1bffdc98c7df4f0aae Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 4 Aug 2017 11:17:35 +0800 Subject: [PATCH 11/19] fix softmax_op code line > 80 --- paddle/operators/softmax_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu index d1115a7047..b79228580a 100644 --- a/paddle/operators/softmax_op.cu +++ b/paddle/operators/softmax_op.cu @@ -17,4 +17,5 @@ #include "paddle/operators/softmax_op.h" REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL(softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_GPU_KERNEL(softmax_grad, + ops::SoftmaxGradKernel); From 5485caf7f7d557ec856c3ea8e95ae9f21f2f9ca8 Mon Sep 17 00:00:00 2001 From: Superjom Date: Fri, 4 Aug 2017 11:57:05 +0800 Subject: [PATCH 12/19] add EQ --- paddle/platform/enforce.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 60a42c777d..7e03bf4425 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -162,5 +162,10 @@ inline void throw_on_error(T e) { } \ } while 
(0) +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1) \ + PADDLE_ENFORCE((__VAL0) == (__VAL1), "enforce %s == %s failed, %s != %s", \ + #__VAL0, #__VAL1, std::to_string(__VAL0), \ + std::to_string(__VAL1)); + } // namespace platform } // namespace paddle From 1a34becf4231c47d5074156a434629afc198d200 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 4 Aug 2017 12:21:26 +0800 Subject: [PATCH 13/19] Reset develop BaseMatrix.cu --- paddle/math/BaseMatrix.cu | 985 ++++++++++++++------------------------ 1 file changed, 366 insertions(+), 619 deletions(-) diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 5435808fb7..ba2b47d6cc 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include +#include +#include #include "BaseMatrix.h" -#include "MathFunctions.h" -#include "SIMDFunctions.h" -#include "hl_matrix_apply.cuh" -#include "hl_matrix_base.cuh" #include "hl_matrix_ops.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_apply.cuh" +#include "SIMDFunctions.h" +#include "MathFunctions.h" namespace paddle { const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; -template +template template int BaseMatrixT::applyUnary(Op op) { MatrixOffset offset(0, 0); @@ -34,11 +34,9 @@ int BaseMatrixT::applyUnary(Op op) { return 0; } -template +template template -int BaseMatrixT::applyUnary(Op op, - int numRows, - int numCols, +int BaseMatrixT::applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; int dimM = numRows; @@ -58,7 +56,7 @@ int BaseMatrixT::applyUnary(Op op, return 0; } -template +template template int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { CHECK(height_ == b.height_ && width_ == b.width_) @@ -69,23 +67,18 @@ int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { return 0; } -template +template template -int BaseMatrixT::applyBinary( - Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { +int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, + MatrixOffset& offset) { applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyBinary(Op op, - BaseMatrixT& b, - int numRows, - int numCols, - MatrixOffset& offset, - bAsRowVector, - bAsColVector) { +int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, + MatrixOffset& offset, bAsRowVector, bAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; @@ -98,8 +91,8 @@ int BaseMatrixT::applyBinary(Op op, T* A = data_; T* B = b.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, + offset.bRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); if (!bAsRowVector::value && !bAsColVector::value) { @@ -122,7 +115,7 @@ int BaseMatrixT::applyBinary(Op op, return 0; } -template +template template int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { CHECK_EQ(height_, b.height_); @@ -136,29 +129,21 @@ int 
BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, +int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, + int numRows, int numCols, MatrixOffset& offset) { applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); return 0; } -template +template template -int BaseMatrixT::applyTernary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset, - cAsRowVector, - cAsColVector) { +int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, + int numRows, int numCols, MatrixOffset& offset, + cAsRowVector, cAsColVector) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -175,10 +160,10 @@ int BaseMatrixT::applyTernary(Op op, T* B = b.data_; T* C = c.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CAL_MATRIX_START_ADDRESS( - C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, + offset.bRow_); + CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, + offset.cRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -195,21 +180,21 @@ int BaseMatrixT::applyTernary(Op op, } if (true == useGpu_) { - hl_gpu_apply_ternary_op( + hl_gpu_apply_ternary_op + ( op, A, B, C, dimM, dimN, lda, ldb, ldc); } else { - hl_cpu_apply_ternary_op( + hl_cpu_apply_ternary_op + ( op, A, B, C, dimM, dimN, lda, ldb, ldc); } return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, +int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); @@ -224,14 +209,10 @@ int BaseMatrixT::applyQuaternary(Op op, return 0; } -template +template template -int BaseMatrixT::applyQuaternary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d, - int numRows, - int numCols, +int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, + BaseMatrixT& d, int numRows, int numCols, MatrixOffset& offset) { CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; @@ -253,12 +234,12 @@ int BaseMatrixT::applyQuaternary(Op op, T* C = c.data_; T* D = d.data_; CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CAL_MATRIX_START_ADDRESS( - C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); - CAL_MATRIX_START_ADDRESS( - D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); + CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, + offset.bRow_); + CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, + offset.cRow_); + CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_, + offset.dRow_); CHECK_LE(dimM + offset.aRow_, this->height_); CHECK_LE(dimN + offset.aCol_, this->width_); @@ -269,29 +250,22 @@ int BaseMatrixT::applyQuaternary(Op op, CHECK_LE(dimM + offset.dRow_, d.height_); CHECK_LE(dimN + offset.dCol_, d.width_); if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, 
dimN, lda, ldb, ldc, ldd); + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, + ldc, ldd); } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, + ldc, ldd); } return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, - Op op, - Saver sv, - BaseMatrixT& b, - int numRows, - int numCols, - MatrixOffset& offset, - aAsRowVector, - aAsColVector) { +int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, + int numRows, int numCols, MatrixOffset& offset, + aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); int ld = stride_; @@ -299,10 +273,10 @@ int BaseMatrixT::aggregate(Agg agg, T* dst = data_; T* B = b.data_; - CAL_MATRIX_START_ADDRESS( - dst, height_, width_, ld, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, + offset.aRow_); + CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, + offset.bRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { @@ -323,21 +297,12 @@ int BaseMatrixT::aggregate(Agg agg, return 0; } -template -template +template -int BaseMatrixT::aggregate(Agg agg, - Op op, - Saver sv, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset, - aAsRowVector, +int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, + BaseMatrixT& c, int numRows, int numCols, + MatrixOffset& offset, aAsRowVector, aAsColVector) { CHECK_EQ(useGpu_, b.useGpu_); CHECK_EQ(useGpu_, c.useGpu_); @@ -349,28 +314,28 @@ int BaseMatrixT::aggregate(Agg agg, T* dst = data_; T* B = b.data_; T* C = c.data_; - CAL_MATRIX_START_ADDRESS( - dst, height_, width_, ld, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CAL_MATRIX_START_ADDRESS( - C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_, + offset.aRow_); + CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_, + offset.bRow_); + CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_, + offset.cRow_); if (aAsRowVector::value && !aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_column_op( - agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); + hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, + ldb, C, ldc); } else { - hl_cpu_matrix_column_op( - agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); + hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, + ldb, C, ldc); } } else if (!aAsRowVector::value && aAsColVector::value) { if (useGpu_) { - hl_gpu_matrix_row_op( - agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); + hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, + ldb, C, ldc); } else { - hl_cpu_matrix_row_op( - agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); + hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, + ldb, C, ldc); } } else { LOG(FATAL) << "not supported"; @@ -385,19 +350,15 @@ int BaseMatrixT::aggregate(Agg agg, */ DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { - applyUnary(unary::Neg()); -} +template +void BaseMatrixT::neg() { applyUnary(unary::Neg()); } DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template <> -void BaseMatrixT::exp2() { - applyUnary(unary::Exp()); -} +template<> +void BaseMatrixT::exp2() { 
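  // Element-wise exponential: applyUnary maps unary::Exp over every entry,
  // with the hl_gpu/hl_cpu apply dispatch defined earlier in this file
  // picking the device path from useGpu_.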
applyUnary(unary::Exp()); } DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template <> +template<> void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); @@ -407,42 +368,30 @@ void BaseMatrixT::log2() { } DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template <> -void BaseMatrixT::sqrt2() { - applyUnary(unary::Sqrt()); -} +template<> +void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { - applyUnary(unary::Square()); -} +template +void BaseMatrixT::square2() { applyUnary(unary::Square()); } DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { - applyUnary(unary::Reciprocal()); -} +template +void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); -template -void BaseMatrixT::abs2() { - applyUnary(unary::Abs()); -} +template +void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { - applyUnary(unary::Sign()); -} +template +void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { - applyUnary(unary::Zero()); -} +template +void BaseMatrixT::zero() { applyUnary(unary::Zero()); } -template +template void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { int numRows = height_; int numCols = numColumns; @@ -451,13 +400,11 @@ void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { } DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { - applyUnary(unary::One()); -} +template +void BaseMatrixT::one() { applyUnary(unary::One()); } DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template <> +template<> void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); @@ -467,67 +414,51 @@ void BaseMatrixT::pow2(real p) { } DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { - applyUnary(unary::SubScalar(p)); -} +template +void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); } DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { - applyUnary(unary::MulScalar(p)); -} +template +void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); } DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { - applyUnary(unary::DivScalar(p)); -} +template +void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); } DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { - applyUnary(unary::Assign(p)); -} +template +void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); } DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { - applyUnary(unary::Add(p)); -} +template +void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); } DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template -void BaseMatrixT::add(T p1, T p2) { - applyUnary(unary::Add2(p1, p2)); -} +template +void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, - TWO_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER, a = a < p1 ? p1 : (a > p2 ? 
p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { - applyUnary(unary::Clip(p1, p2)); -} +template +void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, - TWO_PARAMETER, - a = b < p1 ? 0 : (b > p2 ? 0 : 1)); -template +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +template void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ClipDerivative(p1, p2), b); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, - ONE_PARAMETER, +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER, a = a > p ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThanScalar(T p) { applyUnary(unary::BiggerThanScalar(p)); } -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); -template +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, + a = a > p ? a : p); +template void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); } @@ -538,12 +469,12 @@ void BaseMatrixT::downClip(T p) { */ DEFINE_MATRIX_BINARY_OP(Add, a += b); -template +template void BaseMatrixT::add(BaseMatrixT& b) { applyBinary(binary::Add(), b); } -template <> +template<> void BaseMatrixT::add(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Add(), b); @@ -554,7 +485,7 @@ void BaseMatrixT::add(BaseMatrixT& b) { } } -template +template void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -573,53 +504,43 @@ void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { } } -template +template void BaseMatrixT::addP2P(BaseMatrixT& b) { T* A = data_; T* B = b.data_; int dimM = height_; int dimN = width_; - hl_gpu_apply_binary_op, 0, 0>( - binary::Add(), A, B, dimM, dimN, dimN, dimN); + hl_gpu_apply_binary_op, 0, 0> + (binary::Add(), A, B, dimM, dimN, dimN, dimN); } -template +template void BaseMatrixT::addColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), - b, - numRows, - numCols, - offset, - false_type(), + applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::addRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add(), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); + applyBinary(binary::Add(), b, numRows, numCols, offset, + true_type() /* bAsRowVector */, false_type()); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template <> +template<> void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); @@ -629,45 +550,36 @@ void BaseMatrixT::pow2(BaseMatrixT& b, real p) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Add2(p1, p2), b); } -template +template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::Add1(scale), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); + 
applyBinary(binary::Add1(scale), b, numRows, numCols, offset, + true_type() /* bAsRowVector */, false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { - applyBinary(binary::Sub(), b); -} +template +void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { - applyBinary(binary::Relu(), b); -} +template +void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template +template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } @@ -677,7 +589,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); -template <> +template<> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } @@ -687,100 +599,97 @@ DEFINE_MATRIX_BINARY_OP( a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); -template <> +template<> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? b : p2); -template +template void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, - TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template +template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template +template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template +template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template <> +DEFINE_MATRIX_BINARY_OP(Tanh, + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template<> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template +template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP( - ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template <> +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, + b = p1 * + (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template<> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, - TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, a *= p2 * (p1 - b * b)); -template +template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template +template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { - applyBinary(binary::Abs(), b); -} +template +void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template +template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; - const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) - ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template <> +DEFINE_MATRIX_BINARY_OP( + Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template<> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); @@ -814,31 +723,31 @@ void BaseMatrixT::sigmoid(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template +template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template +template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 
1.0f : -1.0f); -template +template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template <> +template<> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template <> +template<> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); @@ -848,13 +757,13 @@ void BaseMatrixT::log2(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template <> +template<> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template <> +template<> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); @@ -866,37 +775,37 @@ void BaseMatrixT::invSqrt(BaseMatrixT& b) { } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template +template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template +template void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template +template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template +template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template +template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template +template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } @@ -908,20 +817,20 @@ void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); -template <> +template<> void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template +template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template <> +template<> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { @@ -949,73 +858,70 @@ void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? 
-1.0 / b : 1.0 / (1.0 - b)); -template +template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template +template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template +template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template +template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, - THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); -template +template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, - THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, - THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, c = p2 * c - p1 * d * (b + p3 * a); a += c); -template +template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } @@ -1023,22 +929,19 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); -template +template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } -template <> +template<> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { - simd::decayL1(this->data_, - this->data_, - lr.data_, - learningRate * decayRate, + simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, height_ * width_); } } @@ -1047,25 +950,24 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? 
(a + lambda) : 0); -template +template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } -template <> +template<> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { - simd::decayL1( - this->data_, this->data_, learningRate * decayRate, height_ * width_); + simd::decayL1(this->data_, this->data_, learningRate * decayRate, + height_ * width_); } } -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, - ONE_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); -template +template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); @@ -1078,33 +980,32 @@ void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { } } -template +template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template +template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, - TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, a = (b + p1) / (c + p2)); -template +template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } @@ -1114,7 +1015,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); -template <> +template<> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1125,9 +1026,8 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); - a = (a / (1 + a) - d)); -template <> + a = exp(a); a = (a / (1 + a) - d)); +template<> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1140,7 +1040,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; ? -THRESHOLD : b; a = log(1 + exp(x)) - c * x); -template <> +template<> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } @@ -1150,23 +1050,22 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; - x = exp(x); - a = x / (1 + x) - c); -template <> + x = exp(x); a = x / (1 + x) - c); +template<> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 
1.0f : 0.0f); -template +template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { @@ -1174,34 +1073,25 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); -template +template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, - ONE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c, - T p) { +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, + BaseMatrixT& c, T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), - c, - *this, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); + b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, + numCols, offset, false_type(), true_type() /*cAsColVector*/); } -template <> +template<> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, @@ -1209,148 +1099,127 @@ void BaseMatrixT::binaryClassificationError(size_t destCol, MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), - base::binary::classificationError(p), - base::binary::add(), - b, - c, - numRows, - numCols, - offset, - false_type(), + aggregate(aggregate::sum(), base::binary::classificationError(p), + base::binary::add(), b, c, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, - THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3( - BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { +template +void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, + T p2, T p3) { applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template +template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template +template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template +template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, - THREE_PARAMETER, +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); -template -void BaseMatrixT::addSquareSum( - BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { +template +void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, + T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template +template void 
BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, - TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, a = p1 * a + p2 * b * b); -template +template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, - TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, - BaseMatrixT& c, - T p1, +template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, - THREE_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum( - BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { +template +void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, + T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, - TWO_PARAMETER, +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, a = 1 / (p1 * b + p2)); -template +template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, - TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, - BaseMatrixT& c, - T p1, +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, - TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); -template +template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, - TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, a *= p1 * b + p2 * c); -template +template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template +template void BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, - TWO_PARAMETER, +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, a = p1 * a + p2 * b * c); -template +template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template +template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); @@ -1361,7 +1230,7 @@ void BaseMatrixT::assign(BaseMatrixT& b) { } } -template +template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; @@ -1381,31 +1250,24 @@ void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { } DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template +template void 
BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); + applyBinary(binary::DeepSwap(), b); } -template <> +template<> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), - base::binary::mul(), - base::binary::add(), - b, - c, - numRows, - numCols, - offset, - false_type(), + aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, + numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); } -template +template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { @@ -1428,24 +1290,17 @@ void BaseMatrixT::rowDotMul2(size_t destCol, } } -template <> +template<> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; - aggregate(aggregate::sum(), - base::binary::mul(), - base::binary::add(), - b, - c, - numRows, - numCols, - offset, - true_type() /*aAsRowVector*/, + aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, + numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1466,22 +1321,16 @@ void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template +template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), - b, - c, - numRows, - numCols, - offset, - true_type() /*cAsRowVector*/, - false_type()); + applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, + true_type() /*cAsRowVector*/, false_type()); } -template +template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1501,22 +1350,16 @@ void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); + applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, + false_type(), true_type() /*cAsColVector*/); } -template +template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; @@ -1536,82 +1379,52 @@ void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::DotMul(), - b, - c, - numRows, - numCols, - offset, - true_type() /* cAsRowVector */, - false_type() /* cAsColVector */); + applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, + true_type() /* cAsRowVector */, false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), - b, - c, - numRows, - numCols, - 
offset, - true_type() /* cAsRowVector */, - false_type() /* cAsColVector */); + applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, + true_type() /* cAsRowVector */, false_type() /* cAsColVector */); } -template +template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::addDotMulMMV(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); + applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, + false_type(), true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template +template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowAdd(p), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); + applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, + false_type(), true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template <> +template<> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; - applyTernary(ternary::RowPow(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); + applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, + false_type(), true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; @@ -1628,64 +1441,44 @@ void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { } } -template +template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); + applyBinary(binary::DotMul(), b, numRows, numCols, offset, + true_type() /* bAsRowVector */, false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template +template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); + applyBinary(binary::DotDiv(), b, numRows, numCols, offset, + true_type() /* bAsRowVector */, false_type()); } -template +template void BaseMatrixT::mulColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotMul(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /* bAsColVector */); + applyBinary(binary::DotMul(), b, numRows, numCols, offset, + false_type(), true_type() /* bAsColVector */); } -template +template void BaseMatrixT::divColVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; - applyBinary(binary::DotDiv(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /* bAsColVector */); + applyBinary(binary::DotDiv(), b, numRows, numCols, offset, + false_type(), true_type() /* bAsColVector */); } -template <> +template<> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1693,20 +1486,13 @@ int BaseMatrixT::applyRow(Agg agg, 
BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, - base::unary::identity(), - base::binary::second(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); + aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, + numCols, offset, false_type(), true_type() /*aAsColVector*/); return 0; } -template <> +template<> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1714,25 +1500,16 @@ int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(height_, numRows); CHECK_EQ(width_, 1UL); - aggregate(agg, - base::unary::identity(), - sv, - b, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); + aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, + false_type(), true_type() /*aAsColVector*/); return 0; } -template <> +template<> template -int BaseMatrixT::applyRow(Agg agg, - real scaleDest, - real scaleAgg, - BaseMatrixT& b) { +int BaseMatrixT::applyRow( + Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { if (scaleDest != 0) { applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1744,10 +1521,10 @@ int BaseMatrixT::applyRow(Agg agg, return 0; } -template <> +template<> template -int BaseMatrixT::applyRow( - Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, + BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); size_t numRows = b.height_; size_t numCols = b.width_; @@ -1755,27 +1532,16 @@ int BaseMatrixT::applyRow( CHECK_EQ(width_, 1UL); CHECK_EQ(c.height_, numRows); CHECK_EQ(c.width_, numCols); - aggregate(agg, - op, - sv, - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); + aggregate(agg, op, sv, + b, c, numRows, numCols, offset, + false_type(), true_type() /*aAsColVector*/); return 0; } -template <> +template<> template -int BaseMatrixT::applyRow(Agg agg, - Op op, - real scaleDest, - real scaleAgg, - BaseMatrixT& b, - BaseMatrixT& c) { +int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, + BaseMatrixT& b, BaseMatrixT& c) { if (scaleDest != 0) { applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); } else { @@ -1787,7 +1553,7 @@ int BaseMatrixT::applyRow(Agg agg, return 0; } -template <> +template<> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1795,20 +1561,13 @@ int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, - base::unary::identity(), - base::binary::second(), - b, - numRows, - numCols, - offset, - true_type() /*aAsRowVector*/, - false_type()); + aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, + numCols, offset, true_type() /*aAsRowVector*/, false_type()); return 0; } -template <> +template<> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); @@ -1816,25 +1575,16 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { size_t numCols = b.width_; CHECK_EQ(width_, numCols); CHECK_EQ(height_, 1UL); - aggregate(agg, - base::unary::identity(), - sv, - b, - numRows, - numCols, - offset, - true_type() /*aAsRowVector*/, - false_type()); + aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, 
offset, + true_type() /*aAsRowVector*/, false_type()); return 0; } -template <> +template<> template -int BaseMatrixT::applyCol(Agg agg, - real scaleDest, - real scaleAgg, - BaseMatrixT& b) { +int BaseMatrixT::applyCol( + Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { if (scaleDest != 0) { applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); } else { @@ -1846,51 +1596,48 @@ int BaseMatrixT::applyCol(Agg agg, return 0; } -template <> +template<> void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { applyRow(aggregate::sum(), scaleDest, scaleSum, b); } -template <> +template<> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } -template <> +template<> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } -template <> +template<> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } -template <> +template<> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } -template <> +template<> void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { applyCol(aggregate::sum(), scaleDest, scaleSum, b); } -template <> -void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, - BaseMatrixT& c, - real scaleSum, - real scaleDest) { - applyRow( - aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); +template<> +void BaseMatrixT::sumOfSquaredDiffs( + BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), base::binary::squaredDiff(), + scaleDest, scaleSum, b, c); } -template <> -void BaseMatrixT::sumOfProducts(BaseMatrixT& b, - BaseMatrixT& c, - real scaleSum, - real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); +template<> +void BaseMatrixT::sumOfProducts( + BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), + scaleDest, scaleSum, b, c); } template class BaseMatrixT; From 4b41a28d641007a4b237c818e70cdd6e079465e8 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 3 Aug 2017 21:32:35 -0700 Subject: [PATCH 14/19] Disable MKL and OpenMP by default --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b174831109..c7d743e193 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,8 +36,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." 
OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) From ddacdf17353f33d7697f38b90a330ecc142e6194 Mon Sep 17 00:00:00 2001 From: Superjom Date: Fri, 4 Aug 2017 13:29:51 +0800 Subject: [PATCH 15/19] init enforce eq --- paddle/platform/enforce.h | 9 ++++--- paddle/platform/enforce_test.cc | 47 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 7e03bf4425..a0d93a38ce 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -162,10 +162,11 @@ inline void throw_on_error(T e) { } \ } while (0) -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1) \ - PADDLE_ENFORCE((__VAL0) == (__VAL1), "enforce %s == %s failed, %s != %s", \ - #__VAL0, #__VAL1, std::to_string(__VAL0), \ - std::to_string(__VAL1)); +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + PADDLE_ENFORCE((__VAL0) == (__VAL1), \ + "enforce %s == %s failed, %s != %s\n%s", #__VAL0, #__VAL1, \ + std::to_string(__VAL0), std::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); } // namespace platform } // namespace paddle diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index 2ac31812a8..c44fb4360d 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -34,3 +34,50 @@ TEST(ENFORCE, FAILED) { } ASSERT_TRUE(in_catch); } + +TEST(ENFORCE, NO_ARG_OK) { + int a = 2; + int b = 2; + PADDLE_ENFORCE_EQ(a, b); + // test enforce with extra message. + PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info"); +} + +TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) { + int a = 2; + bool in_catch = false; + + try { + PADDLE_ENFORCE_EQ(a, 1 + 3); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = "enforce a == 1 + 3 failed, 2 != 4"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} + +TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) { + int a = 2; + bool in_catch = false; + + try { + PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their"); + + } catch (paddle::platform::EnforceNotMet error) { + in_catch = true; + const std::string msg = + "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"; + const char* what = error.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + + ASSERT_TRUE(in_catch); +} From 6bb970b52b25fcf9b49311288de15f5ae167f4e6 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 4 Aug 2017 13:47:00 +0800 Subject: [PATCH 16/19] enable defining sub-sequence data in test layer gradients. 
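
With this change a layer test can attach sub-sequence boundaries to a
self-defined input. A minimal usage sketch (hypothetical test snippet, not
part of this patch; "config" and "data" are placeholder names, and the
sub-sequence cut points must nest inside the sequence cut points):

    // two sequences [0, 4) and [4, 7), further split into four sub-sequences
    MatrixPtr data = Matrix::create(/* height */ 7, /* width */ 3,
                                    /* trans */ false, /* useGpu */ false);
    data->randomizeUniform();
    std::vector<int> seqStartPos{0, 4, 7};
    std::vector<int> subSeqStartPos{0, 2, 4, 5, 7};
    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "layer_0", data,
                                seqStartPos, subSeqStartPos});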
---
 paddle/gserver/tests/LayerGradUtil.cpp | 14 +++++++++++++-
 paddle/gserver/tests/LayerGradUtil.h   |  5 ++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 9eca58f1a1..fd9cfa1dc7 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -400,7 +400,6 @@ void initDataLayer(TestConfig testConf,
         const std::vector<int>& labelSeqStartPositions =
             testConf.inputDefs[i].labelSeqStartPositions;
         if (labelSeqStartPositions.size() != 0) {
-          CHECK(!sequenceStartPositions);
           CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
 
           sequenceStartPositions =
@@ -410,6 +409,19 @@ void initDataLayer(TestConfig testConf,
                                          useGpu);
           data.sequenceStartPositions = sequenceStartPositions;
         }
+
+        const std::vector<int>& labelSubSeqStartPositions =
+            testConf.inputDefs[i].labelSubSeqStartPositions;
+        if (labelSubSeqStartPositions.size() != 0) {
+          CHECK_GE(static_cast<int>(labelSubSeqStartPositions.size()), 2);
+
+          subSequenceStartPositions =
+              ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu);
+          subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(),
+                                              labelSubSeqStartPositions.size(),
+                                              useGpu);
+          data.subSequenceStartPositions = subSequenceStartPositions;
+        }
         break;
       }
       default:
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index d299b4dd09..5debedf5ef 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -67,6 +67,7 @@ struct InputDef {
   bool isStatic;
   std::vector<real> labelInitValue;
   std::vector<int> labelSeqStartPositions;
+  std::vector<int> labelSubSeqStartPositions;
   MatrixPtr selfDefinedData;
 
   InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
@@ -81,8 +82,10 @@ struct InputDef {
   InputDef(InputType type,
            string nameIn,
            MatrixPtr selfDefinedData,
-           std::vector<int> selfDefinedSeqStartPos = {})
+           std::vector<int> selfDefinedSeqStartPos = {},
+           std::vector<int> selfDefinedSubSeqStartPos = {})
       : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
         selfDefinedData(selfDefinedData) {
     inputType = type;
     name = nameIn;

From 5201b911a4fcca0e1091e59434b1f863ffef6c20 Mon Sep 17 00:00:00 2001
From: Superjom
Date: Fri, 4 Aug 2017 14:56:22 +0800
Subject: [PATCH 17/19] add other enforces

---
 paddle/platform/enforce.h       |  47 +++++++++++--
 paddle/platform/enforce_test.cc | 115 ++++++++++++++++++++++++++++++++
 2 files changed, 158 insertions(+), 4 deletions(-)

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index a0d93a38ce..166d7032cd 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -162,11 +162,50 @@ inline void throw_on_error(T e) {
   }                                        \
   } while (0)
 
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...)                              \
-  PADDLE_ENFORCE((__VAL0) == (__VAL1),                                      \
-                 "enforce %s == %s failed, %s != %s\n%s", #__VAL0, #__VAL1, \
-                 std::to_string(__VAL0), std::to_string(__VAL1),            \
-                 paddle::string::Sprintf("" __VA_ARGS__));
+/*
+ * Some enforce helpers here, usage:
+ *    int a = 1;
+ *    int b = 2;
+ *    PADDLE_ENFORCE_EQ(a, b);
+ *
+ * will raise an exception described as follows:
+ *    "enforce a == b failed, 1 != 2" with detailed stack information.
+ *
+ * extra messages are also supported, for example:
+ *    PADDLE_ENFORCE_EQ(a, b, "some simple enforce failed between %d numbers", 2)
+ */
+
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
+
+// if two values have different data types, choose a compatible type for them.
+template <typename T1, typename T2>
+struct CompatibleType {
+  static constexpr const bool& t1_to_t2 = std::is_convertible<T1, T2>::value;
+  typedef typename std::conditional<t1_to_t2, T2, T1>::type type;
+};
+
+#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)        \
+  PADDLE_ENFORCE(__COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL0)                    \
+                     __CMP __COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL1),         \
+                 "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
+                 #__VAL0, #__VAL1, std::to_string(__VAL0),                    \
+                 std::to_string(__VAL1),                                      \
+                 paddle::string::Sprintf("" __VA_ARGS__));
+
+#define __COMPATIBLE_TYPE(__VAL0, __VAL1, __VAL)              \
+  typename paddle::platform::CompatibleType<decltype(__VAL0), \
+                                            decltype(__VAL1)>::type(__VAL)
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
index c44fb4360d..7117b49474 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -81,3 +81,118 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
 
   ASSERT_TRUE(in_catch);
 }
+
+TEST(ENFORCE_NE, OK) {
+  PADDLE_ENFORCE_NE(1, 2);
+  PADDLE_ENFORCE_NE(1.0, 2UL);
+}
+TEST(ENFORCE_NE, FAIL) {
+  bool in_catch = false;
+
+  try {
+    // 1UL here to check data type compatible
+    PADDLE_ENFORCE_NE(1.0, 1UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    in_catch = true;
+    const std::string msg = "enforce 1.0 != 1UL failed, 1.000000 == 1";
+    const char* what = error.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+
+  ASSERT_TRUE(in_catch);
+}
+
+TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
+TEST(ENFORCE_GT, FAIL) {
+  bool in_catch = false;
+
+  try {
+    // 2UL here to check data type compatible
+    PADDLE_ENFORCE_GT(1, 2UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    in_catch = true;
+    const std::string msg = "enforce 1 > 2UL failed, 1 <= 2";
+    const char* what = error.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+
+  ASSERT_TRUE(in_catch);
+}
+
+TEST(ENFORCE_GE, OK) {
+  PADDLE_ENFORCE_GE(2, 2UL);
+  PADDLE_ENFORCE_GE(3, 2UL);
+  PADDLE_ENFORCE_GE(3, 2);
+  PADDLE_ENFORCE_GE(3.21, 2UL);
+}
+TEST(ENFORCE_GE, FAIL) {
+  bool in_catch = false;
+
+  try {
+    PADDLE_ENFORCE_GE(1, 2UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    in_catch = true;
+    const std::string msg = "enforce 1 >= 2UL failed, 1 < 2";
+    const char* what = error.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+
+  ASSERT_TRUE(in_catch);
+}
+
+TEST(ENFORCE_LE, OK) {
+  PADDLE_ENFORCE_LE(1, 1);
+  PADDLE_ENFORCE_LE(1, 1UL);
+  PADDLE_ENFORCE_LE(2, 3UL);
+  PADDLE_ENFORCE_LE(2UL, 3);
+  PADDLE_ENFORCE_LE(2UL, 3.2);
+}
+TEST(ENFORCE_LE, FAIL) {
+  bool in_catch = false;
+
+  try {
+    PADDLE_ENFORCE_LE(2, 1UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    in_catch = true;
+    const std::string msg = "enforce 2 <= 1UL failed, 2 > 1";
+    const char* what = error.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+
+  ASSERT_TRUE(in_catch);
+}
+
+TEST(ENFORCE_LT, OK) {
+  PADDLE_ENFORCE_LT(3, 10);
+  PADDLE_ENFORCE_LT(2, 3UL);
+  PADDLE_ENFORCE_LT(2UL, 3);
+}
+TEST(ENFORCE_LT, FAIL) {
+  bool in_catch = false;
+
+  try {
+    PADDLE_ENFORCE_LT(1UL, 0.12);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    in_catch = true;
+    const std::string msg = "enforce 1UL < 0.12 failed, 1 >= 0.12";
+    const char* what = error.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+
+  ASSERT_TRUE(in_catch);
+}

From 62e592e58b04617127619dc20b2b0c45c13eddf5 Mon Sep 17 00:00:00 2001
From: Superjom
Date: Fri, 4 Aug 2017 17:17:04 +0800
Subject: [PATCH 18/19] fix ci error

---
 paddle/platform/enforce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 166d7032cd..bc0715656a 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -191,7 +191,7 @@ inline void throw_on_error(T e) {
 // if two values have different data types, choose a compatible type for them.
 template <typename T1, typename T2>
 struct CompatibleType {
-  static constexpr const bool& t1_to_t2 = std::is_convertible<T1, T2>::value;
+  static const bool t1_to_t2 = std::is_convertible<T1, T2>::value;
   typedef typename std::conditional<t1_to_t2, T2, T1>::type type;
 };
 

From cbabaa45444e3f2fe183ff69c78d753f3a5c2234 Mon Sep 17 00:00:00 2001
From: Yancey1989
Date: Fri, 4 Aug 2017 18:05:50 +0800
Subject: [PATCH 19/19] convert dataset into recordio format

---
 python/paddle/v2/dataset/cifar.py       |  8 +++----
 python/paddle/v2/dataset/common.py      | 30 ++++++++++++++++++++-----
 python/paddle/v2/dataset/conll05.py     |  4 ++--
 python/paddle/v2/dataset/imdb.py        |  4 ++--
 python/paddle/v2/dataset/imikolov.py    |  5 +++--
 python/paddle/v2/dataset/mnist.py       |  4 ++--
 python/paddle/v2/dataset/movielens.py   |  4 ++--
 python/paddle/v2/dataset/sentiment.py   |  4 ++--
 python/paddle/v2/dataset/uci_housing.py |  4 ++--
 python/paddle/v2/dataset/wmt14.py       |  5 +++--
 10 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index f885b2834e..0a2a1ced11 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -133,7 +133,7 @@ def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100")
-    paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100")
-    paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10")
-    paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10")
+    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 111496618d..053ae151c5 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -32,17 +32,22 @@ __all__ = [
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
 
+
 # When running unit tests, there could be multiple processes that
 # trying to create DATA_HOME directory simultaneously, so we cannot
 # use a if condition to check for the existence of the directory;
 # instead, we use the filesystem as the synchronization mechanism by
 # catching returned errors.
-try: - os.makedirs(DATA_HOME) -except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass +def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) def md5file(fname): @@ -93,6 +98,19 @@ def fetch_all(): "fetch")() +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): """ you can call the function as: diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index f8aae52e7c..23f5a24a1c 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -233,5 +233,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index c0ec5992e0..93dd3e8f7d 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -173,5 +173,5 @@ def convert(path): Converts dataset to recordio format """ w = word_dict() - paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") - paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b18ee8e9ba..617c722c41 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -155,6 +155,7 @@ def convert(path): N = 5 word_dict = build_dict() paddle.v2.dataset.common.convert(path, - train(word_dict, N), 10, "imikolov_train") + train(word_dict, N), 1000, + "imikolov_train") paddle.v2.dataset.common.convert(path, - test(word_dict, N), 10, "imikolov_test") + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ea5891f4f3..9f675bed89 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") - paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index d9372d422a..5b61a9420a 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -254,8 +254,8 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") - paddle.v2.dataset.common.convert(path, test(), 10, 
"movielens_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index e33f120c87..b0b9757c1a 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -137,5 +137,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") - paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index ec10ce646e..ce60aa21c2 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") - paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 2a631c365f..95a35d97ce 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -169,5 +169,6 @@ def convert(path): Converts dataset to recordio format """ dict_size = 30000 - paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") - paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")