From a3043989a4d2fb082e19173e430809c8025afba2 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 13 Dec 2016 16:57:09 +0800
Subject: [PATCH 01/88] Extract RowBuffer class for SparseRowMatrix.

* The original SparseRowMatrix used two fields to store its rows,
  which made the code very confusing. Extract a RowBuffer class
  for SparseRowMatrix data storage that also manages the auto-growth logic.
---
 paddle/gserver/tests/test_PyDataProvider2.cpp |   2 +-
 paddle/math/SparseRowMatrix.h                 | 107 ++++++++++++++----
 2 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 436318d356..7a3b51da8b 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -293,7 +293,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
   while (true) {
     int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
-      CHECK_LE(realBatchSize, batchSize);
+      CHECK_LE((size_t)realBatchSize, batchSize);
     } else {
       break;
     }
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index badb4b9c1c..db1530f7cf 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -24,6 +24,73 @@ P_DECLARE_bool(allow_inefficient_sparse_update);
 
 namespace paddle {
 
+/**
+ * @brief The RowBuffer class
+ * Represent the SparseRow Matrix Data.
+ *
+ * If not set memory handler, then the data could be auto growth.
+ */
+class RowBuffer {
+public:
+  explicit RowBuffer(size_t width) : width_(width) {}
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+
+  inline void reserve(int rowCnt) {
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() < rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.reserve(rowCnt * width_);
+    }
+  }
+
+  inline const real* get(int row) const {
+    if (preallocatedBuf_) {
+      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  inline const real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      if ((rowStore_.size() <= row * width_)) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+
+  inline void clear() { rowStore_.clear(); }
+
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(float) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+
+  inline bool canAutoGrowth() const { return preallocatedBuf_ == nullptr; }
+
+private:
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+
 /**
  * Sparse Row
  */
@@ -45,12 +112,9 @@ public:
                      IndexDictPtr indexDictHandle = nullptr,
                      bool trans = false)
       : CpuMatrix(nullptr, height, width, trans),
-        storeMat_(dataHandle,
-                  dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0,
-                  width,
-                  trans),
         indexDictHandle_(indexDictHandle) {
     init(height, width);
+    buf_.reset(new RowBuffer(dataHandle, width));
   }
 
   virtual ~SparseRowCpuMatrix() {}
@@ -72,24 +136,17 @@ public:
    *  @param row row id in local storage
    */
   real* getLocalRow(size_t row) {
-    if (storeMat_.getData()) return storeMat_.rowBuf(row);
-    if (rowStore_.size() <= row * width_) {
-      rowStore_.resize((row + 1) * width_);
-    }
-    return rowStore_.data() + row * width_;
+    return const_cast<real*>(buf_->getWithAutoGrowth(row));
   }
 
   /**
-   *  reserve the storage for rows according to current size of indexDictHandle.
+   *  reserve the storage for rows according to current size of
+   * indexDictHandle.
    *
    *  This is only used when SparseRowCpuMatrix is constructed with
    *  indexDictHandle.
    */
-  void reserveStore() {
-    if (!storeMat_.getData() && !localIndices_->empty()) {
-      rowStore_.resize(localIndices_->size() * width_);
-    }
-  }
+  void reserveStore() { buf_->reserve(localIndices_->size()); }
 
   // row is the row id in the original matrix
   virtual real* getRowBuf(size_t row) { return getRow(row); }
@@ -117,7 +174,8 @@ public:
    *
    * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
    *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter matrix,
+   * t0 is a int vector used by L1/L2 decay, size = height of parameter
+   * matrix,
    * store the time that each weight row last updated.
    *
    * Time is batchId, currentTime is current batchId.
@@ -176,8 +234,7 @@ public:
 protected:
   template <typename Func>
   void apply(Func f) {
-    real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data();
-    f(data, localIndices_->size() * width_);
+    f(buf_->data(), localIndices_->size() * width_);
   }
 
   void init(size_t height, size_t width);
@@ -188,25 +245,25 @@ protected:
       globalIndices_[id] = kUnusedId_;
     }
     localIndices_->clear();
-    rowStore_.clear();
+    buf_->clear();
   }
 
   inline void checkStoreSize() {
-    if (storeMat_.getData()) {
-      CHECK_LE(localIndices_->size(), storeMat_.getHeight());
-    } else if (!FLAGS_allow_inefficient_sparse_update) {
-      if (localIndices_->size() > 0.5 * height_) {
+    if (buf_->canAutoGrowth()) {
+      if (buf_->getRowCount() > 0.5 * height_) {
         LOG(WARNING)
             << "There are more than 0.5*height (" << localIndices_->size()
             << ") rows are used for sparse "
             << "update, which is not efficient. Considering not use "
             << "sparse_update or set --allow_inefficient_sparse_update=true";
+
+      } else {
+        CHECK_LE(localIndices_->size(), buf_->getRowCount());
       }
     }
   }
 
-  CpuMatrix storeMat_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  std::unique_ptr<RowBuffer> buf_;
   IndexDictPtr indexDictHandle_;
   std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
   unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();

From fa2c06fb053769f1c82e4d5c98b2bcdee376d6d0 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 13 Dec 2016 17:25:38 +0800
Subject: [PATCH 02/88] Add comments

---
 paddle/math/RowBuffer.h       | 127 ++++++++++++++++++++++++++++++++++
 paddle/math/SparseRowMatrix.h |  72 +------------------
 2 files changed, 130 insertions(+), 69 deletions(-)
 create mode 100644 paddle/math/RowBuffer.h

diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
new file mode 100644
index 0000000000..e358204612
--- /dev/null
+++ b/paddle/math/RowBuffer.h
@@ -0,0 +1,127 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "MemoryHandle.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * @brief The RowBuffer class
+ * Represent the SparseRow Matrix Data.
+ *
+ * If no memory handle is set, the data buffer can grow automatically.
+ */
+class RowBuffer {
+public:
+  /**
+   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  explicit RowBuffer(size_t width) : width_(width) {}
+
+  /**
+   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
+   * @param mem the pre-allocated memory.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+
+  /**
+   * @brief resize the buffer to rowCnt rows (pre-allocated: capacity check).
+   * @param rowCnt number of rows, i.e. the matrix height.
+   */
+  inline void resize(int rowCnt) {
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.resize(rowCnt * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline const real* get(int row) const {
+    if (preallocatedBuf_) {
+      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index. If row index is larger than local
+   *        buffer, the size of local buffer will grow.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline const real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      if ((rowStore_.size() <= row * width_)) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  /**
+   * @return raw data buffer.
+   */
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+
+  /**
+   * @brief clear local buffer. It only affect auto-growth buffer.
+   */
+  inline void clear() { rowStore_.clear(); }
+
+  /**
+   * @brief get the number of rows the buffer currently holds.
+   * @return number of rows (pre-allocated: byte size / sizeof(real) / width).
+   */
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+
+  /**
+   * @brief whether this buffer can grow automatically.
+   * @return true if the buffer can grow automatically.
+   */
+  inline bool isAutoGrowth() const { return preallocatedBuf_ == nullptr; }
+
+private:
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+}  // namespace paddle
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index db1530f7cf..d77d8c3ed1 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string.h>
 #include <algorithm>
 #include "Matrix.h"
+#include "RowBuffer.h"
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Util.h"
 
@@ -24,73 +25,6 @@ P_DECLARE_bool(allow_inefficient_sparse_update);
 
 namespace paddle {
 
-/**
- * @brief The RowBuffer class
- * Represent the SparseRow Matrix Data.
- *
- * If not set memory handler, then the data could be auto growth.
- */
-class RowBuffer {
-public:
-  explicit RowBuffer(size_t width) : width_(width) {}
-  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
-      : preallocatedBuf_(mem), width_(width) {}
-
-  inline void reserve(int rowCnt) {
-    if (preallocatedBuf_) {
-      CHECK(preallocatedBuf_->getSize() < rowCnt * width_ * sizeof(real));
-    } else {
-      rowStore_.reserve(rowCnt * width_);
-    }
-  }
-
-  inline const real* get(int row) const {
-    if (preallocatedBuf_) {
-      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
-    } else {
-      CHECK_LE((row + 1) * width_, rowStore_.size());
-      return rowStore_.data() + row * width_;
-    }
-  }
-
-  inline const real* getWithAutoGrowth(int row) {
-    if (preallocatedBuf_) {
-      return get(row);
-    } else {
-      if ((rowStore_.size() <= row * width_)) {
-        rowStore_.resize((row + 1) * width_);
-      }
-      return rowStore_.data() + row * width_;
-    }
-  }
-
-  inline real* data() {
-    if (preallocatedBuf_) {
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
-    } else {
-      return rowStore_.data();
-    }
-  }
-
-  inline void clear() { rowStore_.clear(); }
-
-  inline size_t getRowCount() const {
-    if (preallocatedBuf_) {
-      return preallocatedBuf_->getSize() / sizeof(float) / width_;
-    } else {
-      return rowStore_.size() / width_;
-    }
-  }
-
-  inline bool canAutoGrowth() const { return preallocatedBuf_ == nullptr; }
-
-private:
-  CpuMemHandlePtr preallocatedBuf_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
-  size_t width_;
-};
-
 /**
  * Sparse Row
  */
@@ -146,7 +80,7 @@ public:
    *  This is only used when SparseRowCpuMatrix is constructed with
    *  indexDictHandle.
    */
-  void reserveStore() { buf_->reserve(localIndices_->size()); }
+  void reserveStore() { buf_->resize(localIndices_->size()); }
 
   // row is the row id in the original matrix
   virtual real* getRowBuf(size_t row) { return getRow(row); }
@@ -249,7 +183,7 @@ protected:
   }
 
   inline void checkStoreSize() {
-    if (buf_->canAutoGrowth()) {
+    if (buf_->isAutoGrowth()) {
       if (buf_->getRowCount() > 0.5 * height_) {
         LOG(WARNING)
             << "There are more than 0.5*height (" << localIndices_->size()

From f63fbcdd2981b2bd9f478e784097b8ec5c6e463c Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 13 Dec 2016 21:34:39 +0800
Subject: [PATCH 03/88] Add unit test for row buffer.

---
 paddle/math/RowBuffer.h              | 12 +++--
 paddle/math/SparseRowMatrix.h        |  4 +-
 paddle/math/tests/CMakeLists.txt     |  1 +
 paddle/math/tests/test_RowBuffer.cpp | 65 ++++++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 6 deletions(-)
 create mode 100644 paddle/math/tests/test_RowBuffer.cpp

diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index e358204612..bb55ca5f9f 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -58,13 +58,13 @@ public:
    * @param row the index of row.
    * @return row buffer.
    */
-  inline const real* get(int row) const {
+  inline real* get(int row) const {
     if (preallocatedBuf_) {
       CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
       return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
     } else {
       CHECK_LE((row + 1) * width_, rowStore_.size());
-      return rowStore_.data() + row * width_;
+      return const_cast<real*>(rowStore_.data() + row * width_);
     }
   }
 
@@ -74,7 +74,7 @@ public:
    * @param row the index of row.
    * @return row buffer.
    */
-  inline const real* getWithAutoGrowth(int row) {
+  inline real* getWithAutoGrowth(int row) {
     if (preallocatedBuf_) {
       return get(row);
     } else {
@@ -119,6 +119,12 @@ public:
    */
   inline bool isAutoGrowth() const { return preallocatedBuf_ == nullptr; }
 
+  /**
+   * @brief return the width of matrix. a.k.a length of row.
+   * @return width of matrix
+   */
+  inline size_t getWidth() const { return width_; }
+
 private:
   CpuMemHandlePtr preallocatedBuf_;
   std::vector<real, AlignedAllocator<real, 32>> rowStore_;
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index d77d8c3ed1..8532bca879 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -69,9 +69,7 @@ public:
    *
    *  @param row row id in local storage
    */
-  real* getLocalRow(size_t row) {
-    return const_cast<real*>(buf_->getWithAutoGrowth(row));
-  }
+  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
 
   /**
    *  reserve the storage for rows according to current size of
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index fe5177291c..9403bb073a 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
 add_simple_unittest(test_SparseMatrix)
+add_simple_unittest(test_RowBuffer)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp
new file mode 100644
index 0000000000..5f66f22ef7
--- /dev/null
+++ b/paddle/math/tests/test_RowBuffer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/math/RowBuffer.h"
+
+TEST(RowBuffer, testAutoGrow) {
+  paddle::RowBuffer buf(128);
+  ASSERT_EQ(128, buf.getWidth());
+  ASSERT_TRUE(buf.isAutoGrowth());
+  buf.resize(2);
+  ASSERT_EQ(2, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+
+  auto data = buf.getWithAutoGrowth(2);
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    data[i] = i;
+  }
+
+  ASSERT_EQ(3, buf.getRowCount());
+  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
+  }
+}
+
+TEST(RowBuffer, testWithMemBuf) {
+  paddle::CpuMemHandlePtr mem =
+      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
+  paddle::RowBuffer buf(mem, 128);
+  ASSERT_TRUE(!buf.isAutoGrowth());
+  ASSERT_EQ(2, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
+    }
+  }
+
+  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
+}

From ea7dd7c92590116d69e3244b8718012fd0d6b68d Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 13 Dec 2016 22:48:32 +0800
Subject: [PATCH 04/88] Fix logic error before

---
 paddle/math/SparseRowMatrix.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 8532bca879..dd4d85611d 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -188,10 +188,9 @@ protected:
             << ") rows are used for sparse "
             << "update, which is not efficient. Considering not use "
             << "sparse_update or set --allow_inefficient_sparse_update=true";
-
-      } else {
-        CHECK_LE(localIndices_->size(), buf_->getRowCount());
       }
+    } else {
+      CHECK_LE(localIndices_->size(), buf_->getRowCount());
     }
   }
 

From 8b5431d5e1e4f54339594ba6a76ecbccf256080f Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 6 Jan 2017 19:19:56 +0800
Subject: [PATCH 05/88] padding operation

---
 paddle/function/CMakeLists.txt                |    1 +
 paddle/function/PadOp.cpp                     |  185 +
 paddle/function/PadOp.h                       |   96 +
 paddle/function/PadOpGpu.cu                   |  102 +
 paddle/function/PadOpTest.cpp                 |   70 +
 paddle/gserver/layers/PadLayer.cpp            |  115 +
 paddle/gserver/layers/PadLayer.h              |   45 +
 paddle/gserver/tests/test_LayerGrad.cpp       | 3090 +++++++++--------
 proto/ModelConfig.proto                       |    8 +
 python/paddle/trainer/config_parser.py        |   26 +
 .../paddle/trainer_config_helpers/layers.py   |   84 +-
 .../tests/configs/test_pad.py                 |   21 +
 12 files changed, 2317 insertions(+), 1526 deletions(-)
 create mode 100644 paddle/function/PadOp.cpp
 create mode 100644 paddle/function/PadOp.h
 create mode 100644 paddle/function/PadOpGpu.cu
 create mode 100644 paddle/function/PadOpTest.cpp
 create mode 100644 paddle/gserver/layers/PadLayer.cpp
 create mode 100644 paddle/gserver/layers/PadLayer.h
 create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_pad.py

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 0b3126155d..70b25406a4 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -17,6 +17,7 @@ if(WITH_TESTING)
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
     add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(PadOpTest)
     add_unittest(ContextProjectionOpTest
         ContextProjectionOpTest.cpp
         ../gserver/tests/TestUtil.cpp)
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
new file mode 100644
index 0000000000..e10011da2a
--- /dev/null
+++ b/paddle/function/PadOp.cpp
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Pad<DEVICE_TYPE_CPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const int padc0,
+                          const int padc1,
+                          const int padh0,
+                          const int padh1,
+                          const int padw0,
+                          const int padw1) {
+  int outC = inC + padc0 + padc1;
+  int outH = inH + padh0 + padh1;
+  int outW = inW + padw0 + padw1;
+  for (int i = 0; i < num; i++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int inoff = ((i * inC + c) * inH + h) * inW;
+        int outoff = ((i * outC + c + padc0) * outH + h + padh0) * outW + padw0;
+        memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
+      }
+    }
+  }
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const int padc0,
+                              const int padc1,
+                              const int padh0,
+                              const int padh1,
+                              const int padw0,
+                              const int padw1) {
+  int outC = inC + padc0 + padc1;
+  int outH = inH + padh0 + padh1;
+  int outW = inW + padw0 + padw1;
+  for (int i = 0; i < num; i++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int inoff = ((i * inC + c) * inH + h) * inW;
+        int outoff = ((i * outC + c + padc0) * outH + h + padh0) * outW + padw0;
+        CpuVector inG = CpuVector(inW, inGrad + inoff);
+        CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
+        inG += outG;
+      }
+    }
+  }
+}
+
+/**
+ * \param inputs[0] input value.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class PadFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    padc0_ = config.get<int>("padc0");
+    padc1_ = config.get<int>("padc1");
+    padh0_ = config.get<int>("padh0");
+    padh1_ = config.get<int>("padh1");
+    padw0_ = config.get<int>("padw0");
+    padw1_ = config.get<int>("padw1");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(1, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+
+    size_t num = inputs[0].dims_[0];
+    size_t inC = inputs[0].dims_[1];
+    size_t inH = inputs[0].dims_[2];
+    size_t inW = inputs[0].dims_[3];
+
+    Pad<Device>(outputs[0].getData(),
+                inputs[0].getData(),
+                num,
+                inC,
+                inH,
+                inW,
+                padc0_,
+                padc1_,
+                padh0_,
+                padh1_,
+                padw0_,
+                padw1_);
+  }
+
+private:
+  int padc0_;
+  int padc1_;
+  int padh0_;
+  int padh1_;
+  int padw0_;
+  int padw1_;
+};
+
+/**
+ * \param inputs[0] input grad.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class PadGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    padc0_ = config.get<int>("padc0");
+    padc1_ = config.get<int>("padc1");
+    padh0_ = config.get<int>("padh0");
+    padh1_ = config.get<int>("padh1");
+    padw0_ = config.get<int>("padw0");
+    padw1_ = config.get<int>("padw1");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(1, inputs.size());
+    CHECK_EQ(0, outputs.size());
+    CHECK_EQ(1, inouts.size());
+
+    size_t n = inouts[0].dims_[0];
+    size_t inC = inouts[0].dims_[1];
+    size_t inH = inouts[0].dims_[2];
+    size_t inW = inouts[0].dims_[3];
+
+    PadGrad<Device>(inouts[0].getData(),
+                    inputs[0].getData(),
+                    n,
+                    inC,
+                    inH,
+                    inW,
+                    padc0_,
+                    padc1_,
+                    padh0_,
+                    padh1_,
+                    padw0_,
+                    padw1_);
+  }
+
+private:
+  int padc0_;
+  int padc1_;
+  int padh0_;
+  int padh1_;
+  int padw0_;
+  int padw1_;
+};
+
+REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
new file mode 100644
index 0000000000..4a5e8fe338
--- /dev/null
+++ b/paddle/function/PadOp.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief  This funtion pads zeros to inputs according to the specify dimension.
+ *         The data structure of image data is NCHW.
+ *
+ * \param[out]  outputs  save results.
+ * \param[in]   inputs   input data.
+ * \param[in]   num      batch size of input data.
+ * \param[in]   inC      channel number of input data.
+ * \param[in]   inH      height of input data.
+ * \param[in]   inH      with of input data.
+ * \param[in]   padc0    how many values to add before the data in dimension of
+ * channel.
+ * \param[in]   padc1    how many values to add after the data in dimension of
+ * channel.
+ * \param[in]   padh0    how many values to add before the data in dimension of
+ * height.
+ * \param[in]   padh1    how many values to add after the data in dimension of
+ * height.
+ * \param[in]   padw0    how many values to add before the data in dimension of
+ * width.
+ * \param[in]   padw1    how many values to add after the data in dimension of
+ * width.
+ *
+ */
+template <DeviceType Device>
+void Pad(real* outputs,
+         const real* inputs,
+         const int num,
+         const int inC,
+         const int inH,
+         const int inW,
+         const int padc0,
+         const int padc1,
+         const int padh0,
+         const int padh1,
+         const int padw0,
+         const int padw1);
+
+/**
+ * \brief   Padding operation backward.
+ *          The data structure of image data is NCHW.
+ *
+ * \param[out]  inGrad   gradients of previous layer.
+ * \param[in]   outGrad  output gradients.
+ * \param[in]   num      batch size of input data.
+ * \param[in]   inC      channel number of input data.
+ * \param[in]   inH      height of input data.
+ * \param[in]   inH      with of input data.
+ * \param[in]   padc0    how many values to add before the data in dimension of
+ * channel.
+ * \param[in]   padc1    how many values to add after the data in dimension of
+ * channel.
+ * \param[in]   padh0    how many values to add before the data in dimension of
+ * height.
+ * \param[in]   padh1    how many values to add after the data in dimension of
+ * height.
+ * \param[in]   padw0    how many values to add before the data in dimension of
+ * width.
+ * \param[in]   padw1    how many values to add after the data in dimension of
+ * width.
+ *
+ */
+template <DeviceType Device>
+void PadGrad(real* inGrad,
+             const real* outGrad,
+             const int num,
+             const int inC,
+             const int inH,
+             const int inW,
+             const int padc0,
+             const int padc1,
+             const int padh0,
+             const int padh1,
+             const int padw0,
+             const int padw1);
+}  // namespace paddle
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
new file mode 100644
index 0000000000..578d6e86d7
--- /dev/null
+++ b/paddle/function/PadOpGpu.cu
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "PadOp.h"
+
+namespace paddle {
+
+__global__ void KePad(real* outputs, const real* inputs,
+                      int inC, int inH, int inW,
+                      int padc, int padh, int padw,
+                      int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat input index
+  if (idx >= nthreads) return;
+  const int col = idx % inW;
+  const int row = (idx / inW) % inH;
+  const int chan = (idx / inW / inH) % inC;
+  const int sample = idx / inW / inH / inC;
+  // Same element in the output, shifted by the leading pad of each dimension.
+  const int dst =
+      ((sample * outC + chan + padc) * outH + row + padh) * outW + padw + col;
+  outputs[dst] = inputs[idx];
+}
+
+template <>
+void Pad<DEVICE_TYPE_GPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const int padc0,
+                          const int padc1,
+                          const int padh0,
+                          const int padh1,
+                          const int padw0,
+                          const int padw1) {
+  const int outC = padc0 + inC + padc1;  // padded channel count
+  const int outH = padh0 + inH + padh1;  // padded height
+  const int outW = padw0 + inW + padw1;  // padded width
+  const int blockSize = 1024;
+  const size_t nth = num * inC * inH * inW;  // one thread per input element
+  const int gridSize = (nth + blockSize - 1) / blockSize;  // round up
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inC, inH, inW, padc0, padh0, padw0,
+      outC, outH, outW, nth);
+  CHECK_SYNC("Pad");
+}
+
+__global__ void KePadDiff(real* inGrad, const real* outGrad,
+                          int inC, int inH, int inW,
+                          int padc, int padh, int padw,
+                          int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat input index
+  if (idx >= nthreads) return;
+  const int col = idx % inW;
+  const int row = (idx / inW) % inH;
+  const int chan = (idx / inW / inH) % inC;
+  const int sample = idx / inW / inH / inC;
+  // Position of this element inside the padded output gradient.
+  const int src =
+      ((sample * outC + chan + padc) * outH + row + padh) * outW + padw + col;
+  inGrad[idx] += outGrad[src];  // accumulate, do not overwrite
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const int padc0,
+                              const int padc1,
+                              const int padh0,
+                              const int padh1,
+                              const int padw0,
+                              const int padw1) {
+  const int outC = padc0 + inC + padc1;  // padded channel count
+  const int outH = padh0 + inH + padh1;  // padded height
+  const int outW = padw0 + inW + padw1;  // padded width
+  const int blockSize = 1024;
+  const int nth = num * inC * inH * inW;  // one thread per input element
+  const int gridSize = (nth + blockSize - 1) / blockSize;  // round up
+  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, inC, inH, inW, padc0, padh0, padw0,
+      outC, outH, outW, nth);
+  CHECK_SYNC("PadGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
new file mode 100644
index 0000000000..ee2834d793
--- /dev/null
+++ b/paddle/function/PadOpTest.cpp
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Pad, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+
+          FunctionCompare compare("Pad",
+                                  FuncConfig()
+                                      .set("padc0", 2)  // channels: 2 + 3
+                                      .set("padc1", 3)
+                                      .set("padh0", 1)  // height: 1 + 2
+                                      .set("padh1", 2)
+                                      .set("padw0", 3)  // width: 3 + 2
+                                      .set("padw1", 2));
+          Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
+          Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.cmpWithArg(
+              {Tensor(nullptr, inDims)}, {Tensor(nullptr, outDims)}, {});
+        }
+      }
+    }
+  }
+}
+
+// TEST(PadGrad, real) {
+//  for (size_t numSamples : {5, 32}) {
+//    for (size_t channels : {1, 5, 32}) {
+//      for (size_t imgSizeH : {5, 33, 100}) {
+//        for (size_t imgSizeW : {5, 32, 96}) {
+//          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+//                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+//
+//          FunctionCompare compare("PadGrad",
+//                                  FuncConfig()
+//                                     .set("padc0", 2).set("padc1", 3)
+//                                     .set("padh0", 1).set("padh1", 2)
+//                                     .set("padw0", 3).set("padw1", 2));
+//          Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
+//          Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+//          compare.cmpWithArg({Tensor(nullptr, inDims)},
+//                             {Tensor(nullptr, outDims)},
+//                             {});
+//        }
+//      }
+//    }
+//  }
+//}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
new file mode 100644
index 0000000000..62c50af32d
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pad, PadLayer);
+
+bool PadLayer::init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+  // Read the pad config, record the NCHW dims and create Pad / PadGrad.
+  Layer::init(layerMap, parameterMap);  // initialize the basic parent class
+  // Validate the input count before inputs(0) is dereferenced below.
+  CHECK_EQ(config_.inputs_size(), 1);
+  auto& pad_conf = config_.inputs(0).pad_conf();
+  auto& img_conf = pad_conf.image_conf();
+  inDims_.push_back(0);  // batch size, set per batch in setTensorDim()
+  inDims_.push_back(img_conf.channels());
+  inDims_.push_back(img_conf.has_img_size_y() ? img_conf.img_size_y()
+                                              : img_conf.img_size());
+  inDims_.push_back(img_conf.img_size());
+  // Each padded dimension needs exactly two pad values.
+  CHECK_EQ(2UL, pad_conf.pad_c_size());
+  CHECK_EQ(2UL, pad_conf.pad_h_size());
+  CHECK_EQ(2UL, pad_conf.pad_w_size());
+  padc_.push_back(pad_conf.pad_c(0));
+  padc_.push_back(pad_conf.pad_c(1));
+  padh_.push_back(pad_conf.pad_h(0));
+  padh_.push_back(pad_conf.pad_h(1));
+  padw_.push_back(pad_conf.pad_w(0));
+  padw_.push_back(pad_conf.pad_w(1));
+  // Batch size 0 is a placeholder; forward() recomputes the dims per batch.
+  outDims_.resize(4);
+  setOutDims(0);
+
+  createFunction(forward_,
+                 "Pad",
+                 FuncConfig()
+                     .set("padc0", padc_[0])
+                     .set("padc1", padc_[1])
+                     .set("padh0", padh_[0])
+                     .set("padh1", padh_[1])
+                     .set("padw0", padw_[0])
+                     .set("padw1", padw_[1]));
+  createFunction(backward_,
+                 "PadGrad",
+                 FuncConfig()
+                     .set("padc0", padc_[0])
+                     .set("padc1", padc_[1])
+                     .set("padh0", padh_[0])
+                     .set("padh1", padh_[1])
+                     .set("padw0", padw_[0])
+                     .set("padw1", padw_[1]));
+
+  return true;
+}
+
+void PadLayer::setOutDims(int batchSize) {  // output = input + both pads
+  outDims_[0] = batchSize;
+  outDims_[1] = inDims_[1] + padc_[0] + padc_[1];  // channels
+  outDims_[2] = inDims_[2] + padh_[0] + padh_[1];  // height
+  outDims_[3] = inDims_[3] + padw_[0] + padw_[1];  // width
+}
+
+void PadLayer::setTensorDim(int batchSize) {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  inDims_[0] = batchSize;
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_[2] = h;  // 0 keeps the height configured in init()
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_[3] = w;  // 0 keeps the width configured in init()
+  setOutDims(batchSize);
+}
+
+void PadLayer::forward(PassType passType) {  // pad input value into output
+  Layer::forward(passType);
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();  // input height is used as batch size
+  setTensorDim(batchSize);  // refreshes inDims_ and outDims_ for this batch
+  int size = outDims_[1] * outDims_[2] * outDims_[3];  // values per sample
+  resetOutput(batchSize, size);
+  MatrixPtr outV = getOutputValue();
+  REGISTER_TIMER_INFO("PadForward", getName().c_str());
+  forward_[0]->calc({Tensor(input->getData(), inDims_)},
+                    {Tensor(outV->getData(), outDims_)},
+                    {});
+}
+
+void PadLayer::backward(const UpdateCallback& callback) {
+  (void)callback;  // unused
+
+  // Nothing to do when the input layer exposes no gradient matrix.
+  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
+  if (!inputGrad) return;
+  MatrixPtr outputGrad = getOutputGrad();
+  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
+  // outDims_ / inDims_ are whatever the last setTensorDim() call left in them.
+  backward_[0]->calc({Tensor(outputGrad->getData(), outDims_)},
+                     {},
+                     {Tensor(inputGrad->getData(), inDims_)});
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
new file mode 100644
index 0000000000..834622a7af
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * @brief Pad layer: enlarges the input along channel, height and width.
+ * The pad amounts are read from the layer's pad_conf in init().
+ */
+class PadLayer : public Layer {
+public:
+  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~PadLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+
+protected:
+  void setOutDims(int batchSize);
+  void setTensorDim(int batchSize);
+
+  std::vector<int> padc_;  // channel pad: {before, after}
+  std::vector<int> padh_;  // height pad: {before, after}
+  std::vector<int> padw_;  // width pad: {before, after}
+  Dims inDims_;            // input dims: {batch, channels, height, width}
+  Dims outDims_;           // output dims, same order as inDims_
+};
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 2cc25f6b21..3094b3a4a0 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -32,1534 +32,1580 @@ DECLARE_double(checkgrad_eps);
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_bool(prev_batch_state);
 
-TEST(Operator, dot_mul) {
+// TEST(Operator, dot_mul) {
+//   TestConfig config;
+//   config.layerConfig.set_size(10);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+//   operatorConf.set_type("dot_mul");
+//   operatorConf.set_dotmul_scale(-1);
+//
+//   testOperatorGrad(config, operatorConf, 100, false, false);
+// }
+//
+// TEST(Projection, context) {
+//   for (auto contextStart : {-5, -3, -1, 0, 3}) {
+//     for (auto contextLength : {1, 2, 5, 7}) {
+//       for (auto batchSize : {1, 2, 5, 20, 50}) {
+//         for (auto trainablePadding : {false, true}) {
+//           LOG(INFO) << " contextStart=" << contextStart
+//                     << " contextLength=" << contextLength
+//                     << " batchSize=" << batchSize
+//                     << " trainablePadding=" << trainablePadding;
+//           ProjectionConfig conf;
+//           conf.set_type("context");
+//           conf.set_input_size(10);
+//           conf.set_context_start(contextStart);
+//           conf.set_context_length(contextLength);
+//           conf.set_trainable_padding(trainablePadding);
+//           conf.set_output_size(conf.context_length() * conf.input_size());
+//           int pad =
+//               std::max(0, -conf.context_start()) +
+//               std::max(0, conf.context_start() + conf.context_length() - 1);
+//           for (auto useGpu : {false, true}) {
+//             testProjectionGrad(
+//                 conf,
+//                 INPUT_SEQUENCE_DATA,
+//                 trainablePadding ? conf.input_size() * pad : 0,
+//                 batchSize,
+//                 useGpu,
+//                 contextStart + contextLength <= 1);  // = testState
+//           }
+//         }
+//       }
+//     }
+//   }
+// }
+//
+// TEST(Projection, trans_fc) {
+//   ProjectionConfig conf;
+//   conf.set_type("trans_fc");
+//   conf.set_input_size(50);
+//   conf.set_output_size(20);
+//   for (auto useGpu : {false, true}) {
+//     testProjectionGrad(conf,
+//                        INPUT_DATA,
+//                        /* parameterSize */ 1000,
+//                        /* batchSize */ 100,
+//                        useGpu);
+//   }
+// }
+//
+// TEST(Projection, fc) {
+//   ProjectionConfig conf;
+//   conf.set_type("fc");
+//   conf.set_input_size(10);
+//   conf.set_output_size(20);
+//   for (auto useGpu : {false, true}) {
+//     testProjectionGrad(conf,
+//                        INPUT_DATA,
+//                        /* parameterSize */ 200,
+//                        /* batchSize */ 100,
+//                        useGpu);
+//   }
+// }
+//
+// TEST(Projection, dot_mul) {
+//   ProjectionConfig conf;
+//   conf.set_type("dot_mul");
+//   conf.set_input_size(20);
+//   conf.set_output_size(20);
+//   for (auto useGpu : {false, true}) {
+//     testProjectionGrad(conf,
+//                        INPUT_DATA,
+//                        /* parameterSize */ 20,
+//                        /* batchSize */ 100,
+//                        useGpu);
+//   }
+// }
+//
+// TEST(Projection, table) {
+//   ProjectionConfig conf;
+//   conf.set_type("table");
+//   conf.set_input_size(10);
+//   conf.set_output_size(20);
+//   for (auto useGpu : {false, true}) {
+//     testProjectionGrad(conf,
+//                        INPUT_LABEL,
+//                        /* parameterSize */ 200,
+//                        /* batchSize */ 100,
+//                        useGpu);
+//   }
+// }
+//
+// TEST(Projection, identity) {
+//   ProjectionConfig conf;
+//   conf.set_type("identity");
+//   conf.set_input_size(10);
+//   conf.set_output_size(10);
+//   for (auto useGpu : {false, true}) {
+//     testProjectionGrad(conf,
+//                        INPUT_DATA,
+//                        /* parameterSize */ 0,
+//                        /* batchSize */ 100,
+//                        useGpu);
+//   }
+// }
+//
+// TEST(Projection, scaling) {
+//   ProjectionConfig conf;
+//   conf.set_type("scaling");
+//   conf.set_input_size(10);
+//   conf.set_output_size(10);
+//   for (auto useGpu : {false}) {
+//     testProjectionGrad(conf,
+//                        INPUT_DATA,
+//                        /* parameterSize */ 1,
+//                        /* batchSize */ 100,
+//                        useGpu);
+//   }
+// }
+//
+// void testProjectionConv(size_t groups) {
+//   const int NUM_FILTERS = 18;
+//   const int FILTER_SIZE = 2;
+//   const int FILTER_SIZE_Y = 3;
+//   const int CHANNELS = 3;
+//   const int IMAGE_SIZE = 16;
+//
+//   ProjectionConfig conf;
+//   conf.set_type("conv");
+//   conf.set_num_filters(NUM_FILTERS);
+//
+//   ConvConfig* conv = conf.mutable_conv_conf();
+//   conv->set_filter_size(FILTER_SIZE);
+//   conv->set_filter_size_y(FILTER_SIZE_Y);
+//   conv->set_channels(CHANNELS);
+//   conv->set_padding(0);
+//   conv->set_padding_y(1);
+//   conv->set_stride(2);
+//   conv->set_stride_y(2);
+//   conv->set_groups(groups);
+//   conv->set_filter_channels(conv->channels() / conv->groups());
+//   conv->set_img_size(IMAGE_SIZE);
+//   int output_x = outputSize(conv->img_size(),
+//                             conv->filter_size(),
+//                             conv->padding(),
+//                             conv->stride(),
+//                             /* caffeMode */ true);
+//   int output_y = outputSize(conv->img_size(),
+//                             conv->filter_size_y(),
+//                             conv->padding_y(),
+//                             conv->stride_y(),
+//                             /* caffeMode */ true);
+//   conv->set_output_x(output_x);
+//   conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
+//   conf.set_output_size(output_x * output_y * NUM_FILTERS);
+//
+//   testProjectionGrad(conf,
+//                      INPUT_DATA,
+//                      /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE
+//                      *
+//                          FILTER_SIZE_Y / groups,
+//                      /* batchSize */ 100,
+//                      true,
+//                      false,
+//                      NUM_FILTERS,
+//                      true);
+// }
+//
+// #ifndef PADDLE_ONLY_CPU
+// TEST(Projection, conv) {
+//   testProjectionConv(1);
+//   testProjectionConv(3);
+// }
+// #endif
+//
+// TEST(Layer, BilinearInterpLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("bilinear_interp");
+//   config.biasSize = 0;
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+//
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
+//   ImageConfig* image = bilinear->mutable_image_conf();
+//   image->set_img_size(32);
+//   image->set_img_size_y(32);
+//   image->set_channels(4);
+//
+//   for (auto useGpu : {false, true}) {
+//     for (auto outSize : {32, 64}) {
+//       bilinear->set_out_size_x(outSize);
+//       bilinear->set_out_size_y(outSize);
+//       testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
+//     }
+//   }
+// }
+//
+// TEST(Layer, concat) {
+//   TestConfig config;
+//   config.biasSize = 0;
+//   config.layerConfig.set_type("concat");
+//   config.layerConfig.set_size(15);
+//   config.layerConfig.set_active_type("sigmoid");
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "concat", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, AddtoLayer) {
+//   TestConfig config;
+//   config.biasSize = 0;
+//   config.layerConfig.set_type("addto");
+//   config.layerConfig.set_size(10);
+//   config.layerConfig.set_active_type("sigmoid");
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "addto", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, CRFLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("crf");
+//   config.layerConfig.set_size(10);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
+//   config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   // Not support GPU now
+//   testLayerGrad(config,
+//                 "crf",
+//                 100,
+//                 /* trans */ false,
+//                 /* useGpu */ false,
+//                 false /*useWeight*/,
+//                 0.03 /*epsilon*/);
+// }
+//
+// TEST(Layer, CTCLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("ctc");
+//   config.layerConfig.set_norm_by_times(false);
+//   config.layerConfig.set_size(10);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+//   config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "ctc", 100, /* trans */ false, /* useGpu */
+//     useGpu);
+//   }
+// }
+//
+// TEST(Layer, cosSimLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("cos");
+//   config.layerConfig.set_size(1);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "cos", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, CosSimVecMatLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("cos_vm");
+//   config.layerConfig.set_size(5);  // output size
+//   config.layerConfig.set_cos_scale(2.0);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "cos_vm", 100, false, useGpu);
+//   }
+// }
+//
+// void testConvLayer(const string& type, bool trans, bool useGpu) {
+//   TestConfig config;
+//   config.biasSize = 16;
+//   config.layerConfig.set_type(type);
+//   config.layerConfig.set_num_filters(16);
+//   config.layerConfig.set_partial_sum(1);
+//   config.layerConfig.set_shared_biases(true);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   ConvConfig* conv = input->mutable_conv_conf();
+//   conv->set_filter_size(2);
+//   conv->set_filter_size_y(3);
+//   conv->set_channels(3);
+//   conv->set_padding(0);
+//   conv->set_padding_y(1);
+//   conv->set_stride(2);
+//   conv->set_stride_y(2);
+//   conv->set_groups(1);
+//   conv->set_filter_channels(conv->channels() / conv->groups());
+//   conv->set_img_size(16);
+//   conv->set_img_size_y(8);
+//   conv->set_output_x(outputSize(conv->img_size(),
+//                                 conv->filter_size(),
+//                                 conv->padding(),
+//                                 conv->stride(),
+//                                 /* caffeMode */ true));
+//   conv->set_output_y(outputSize(conv->img_size_y(),
+//                                 conv->filter_size_y(),
+//                                 conv->padding_y(),
+//                                 conv->stride_y(),
+//                                 /* caffeMode */ true));
+//   config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+//                               config.layerConfig.num_filters());
+//
+//   testLayerGrad(config, "conv", 100, trans, useGpu);
+//   // Use small batch_size and useWeight=true to test biasGrad
+//   testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
+// }
+//
+// TEST(Layer, convLayer) {
+//   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
+// #ifndef PADDLE_ONLY_CPU
+//   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
+//   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
+// #endif
+// }
+//
+// void testConvTransLayer(const string& type, bool trans, bool useGpu) {
+//   TestConfig config;
+//   config.biasSize = 3;
+//   config.layerConfig.set_type(type);
+//   config.layerConfig.set_num_filters(3);
+//   config.layerConfig.set_partial_sum(1);
+//   config.layerConfig.set_shared_biases(true);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   ConvConfig* conv = input->mutable_conv_conf();
+//   conv->set_filter_size(2);
+//   conv->set_filter_size_y(3);
+//   conv->set_channels(16);
+//   conv->set_padding(0);
+//   conv->set_padding_y(1);
+//   conv->set_stride(2);
+//   conv->set_stride_y(2);
+//   conv->set_groups(1);
+//   conv->set_filter_channels(3 / conv->groups());
+//   conv->set_img_size(16);
+//   conv->set_output_x(outputSize(conv->img_size(),
+//                                 conv->filter_size(),
+//                                 conv->padding(),
+//                                 conv->stride(),
+//                                 /* caffeMode */ true));
+//
+//   config.layerConfig.set_size(conv->img_size() * conv->img_size() *
+//                               config.layerConfig.num_filters());
+//
+//   testLayerGrad(config, "convTrans", 100, trans, useGpu);
+//   // Use small batch_size and useWeight=true to test biasGrad
+//   testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
+// }
+//
+// TEST(Layer, convTransLayer) {
+//   for (auto useGpu : {false, true}) {
+//     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
+//   }
+// }
+//
+// TEST(Layer, blockExpandLayer) {
+//   TestConfig config;
+//   config.biasSize = 0;
+//   config.layerConfig.set_type("blockexpand");
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
+//   blockExpand->set_img_size_x(64);
+//   blockExpand->set_img_size_y(32);
+//   blockExpand->set_channels(3);
+//   blockExpand->set_padding_x(0);
+//   blockExpand->set_padding_y(0);
+//   blockExpand->set_block_x(4);
+//   blockExpand->set_block_y(32);
+//   blockExpand->set_stride_x(2);
+//   blockExpand->set_stride_y(2);
+//   blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
+//                                        blockExpand->block_x(),
+//                                        blockExpand->padding_x(),
+//                                        blockExpand->stride_x(),
+//                                        /* caffeMode */ false));
+//   blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
+//                                        blockExpand->block_y(),
+//                                        blockExpand->padding_y(),
+//                                        blockExpand->stride_y(),
+//                                        /* caffeMode */ false));
+//   config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y()
+//   *
+//                               blockExpand->channels());
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "blockexpand", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, maxoutLayer) {
+//   TestConfig config;
+//   config.biasSize = 0;
+//   config.layerConfig.set_type("maxout");
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   MaxOutConfig* maxout = input->mutable_maxout_conf();
+//   ImageConfig* image = maxout->mutable_image_conf();
+//
+//   image->set_img_size(32);
+//   image->set_img_size_y(32);
+//   image->set_channels(4);
+//   maxout->set_groups(2);
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "maxout", 10, false, useGpu);
+//   }
+// }
+// void testFcLayer(string format, size_t nnz) {
+//   TestConfig config;
+//   config.biasSize = 4096;
+//   config.layerConfig.set_type("fc");
+//   config.layerConfig.set_size(4096);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_drop_rate(0.1);
+//
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+//   config.layerConfig.add_inputs();
+//
+//   LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+//             << config.inputDefs[0].sparse.format;
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config,
+//                   "fc",
+//                   100,
+//                   /* trans */ false,
+//                   useGpu,
+//                   /* weight */ true);
+//   }
+// }
+//
+// TEST(Layer, fcLayer) {
+//   testFcLayer("", 4096 * 4096 * 2);
+//   testFcLayer("csc", 4096 * 40);
+//   testFcLayer("csr", 4096 * 40);
+// }
+//
+// TEST(Layer, SelectiveFullyConnectedLayer) {
+//   TestConfig config;
+//   size_t nin = 16;
+//   size_t nout = 256;
+//   config.layerConfig.set_type("selective_fc");
+//   config.layerConfig.set_size(nout);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_has_selected_colums(true);
+//   config.layerConfig.set_selective_fc_pass_generation(false);
+//   config.biasSize = nout;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back(
+//       {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr",
+//       true)});
+//   config.layerConfig.add_inputs();
+//
+//   testLayerGrad(config,
+//                 "selective_fc",
+//                 100,
+//                 /* trans= */ false,
+//                 /* useGup= */ false,
+//                 false);
+// #ifndef PADDLE_ONLY_CPU
+//   testLayerGrad(config,
+//                 "selective_fc",
+//                 100,
+//                 /* trans= */ false,
+//                 /* useGup= */ true,
+//                 false);
+// #endif
+// }
+//
+// TEST(Layer, DataNormLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("data_norm");
+//   config.layerConfig.set_size(20);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
+//   config.inputDefs.back().isStatic = true;
+//   config.layerConfig.add_inputs();
+//
+//   for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
+//     config.layerConfig.set_data_norm_strategy(strategy);
+//     // The parameters are static, so GPU is not supported yet
+//     testLayerGrad(config,
+//                   "data_norm",
+//                   200,
+//                   /* trans */ false,
+//                   /* useGpu */ false);
+//   }
+// }
+//
+// TEST(Layer, hsigmoidLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("hsigmoid");
+//   config.layerConfig.set_num_classes(5);
+//   config.layerConfig.set_size(1);
+//   config.biasSize = config.layerConfig.num_classes() - 1;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
+//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   // GPU is not supported yet
+//   testLayerGrad(config, "hsigmoid", 100, /* trans */ false, /* useGpu */
+//   false);
+// }
+//
+// TEST(Layer, multi_cross) {
+//   TestConfig config;
+//   config.layerConfig.set_type("multi-class-cross-entropy");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(
+//         config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, multi_binary_label_sparse_mat) {
+//   TestConfig config;
+//   config.layerConfig.set_type("multi_binary_label_cross_entropy");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50,
+//   0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config,
+//                   "multi_binary_label_cross_entropy",
+//                   100,
+//                   /* trans */ false,
+//                   useGpu);
+//   }
+// }
+//
+// TEST(layer, multi_binary_label_id) {
+//   TestConfig config;
+//   config.layerConfig.set_type("multi_binary_label_cross_entropy");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config,
+//                   "multi_binary_label_cross_entropy",
+//                   100,
+//                   /* trans */ false,
+//                   useGpu);
+//   }
+// }
+//
+// TEST(Layer, multi_cross_with_selfnorm) {
+//   TestConfig config;
+//   config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
+//   config.layerConfig.set_softmax_selfnorm_alpha(0.1);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   // GPU is not supported yet
+//   testLayerGrad(config,
+//                 "multi_class_cross_entropy_with_selfnorm",
+//                 100,
+//                 /* trans */ false,
+//                 /* useGpu */ false);
+// }
+//
+// TEST(Layer, multi_cross_soft) {
+//   TestConfig config;
+//   config.layerConfig.set_type("soft_binary_class_cross_entropy");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config,
+//                   "soft_binary_class_cross_entropy",
+//                   100,
+//                   /* trans */ false,
+//                   useGpu);
+//   }
+// }
+//
+// TEST(Layer, square_error) {
+//   TestConfig config;
+//   config.layerConfig.set_type("square_error");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, sparse_square_error) {
+//   TestConfig config;
+//   config.layerConfig.set_type("square_error");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50,
+//   0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   // "GpuSparseMatrix" as label is not supported
+//   testLayerGrad(config,
+//                 "square_error",
+//                 100,
+//                 /* trans */ false,
+//                 /* useGpu */ false);
+// }
+//
+// TEST(Layer, sparse_float_square_error) {
+//   TestConfig config;
+//   config.layerConfig.set_type("square_error");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+//   config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50,
+//   0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   // "GpuSparseMatrix" as label is not supported
+//   testLayerGrad(config,
+//                 "square_error",
+//                 100,
+//                 /* trans */ false,
+//                 /* useGpu */ false);
+// }
+//
+// TEST(Layer, square_error_weighted) {
+//   TestConfig config;
+//   config.layerConfig.set_type("square_error");
+//   config.biasSize = 0;
+//   config.testAccumulate = false;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, huber_two_class) {
+//   TestConfig config;
+//   config.layerConfig.set_type("huber");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
+//   }
+// }
+//
+// void testExpandLayer(string trans_type, bool hasSubseq) {
+//   TestConfig config;
+//   config.layerConfig.set_type("expand");
+//
+//   config.inputDefs.push_back(
+//       {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
+//        "layer_0",
+//        10,
+//        0});
+//   config.inputDefs.push_back(
+//       {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+//        "layer_1",
+//        10,
+//        0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.set_trans_type(trans_type);
+//   LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "expand", 30, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, ExpandLayer) {
+//   testExpandLayer("non-seq", false);  // non-seq expand to seq
+//   testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
+//   testExpandLayer("seq", true);       // seq expand to hasSubseq
+// }
+//
+// void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
+//   TestConfig config;
+//   config.layerConfig.set_type(layer_type);
+//   config.layerConfig.set_size(10);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back(
+//       {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+//        "layer_0",
+//        10,
+//        0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.set_trans_type(trans_type);
+//
+//   auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
+//     for (auto useGpu : {false, true}) {
+//       testLayerGrad(config, layer_type, 100, false, useGpu);
+//     }
+//   };
+//
+//   if (layer_type == "average") {
+//     for (auto strategy : {"average", "sum", "squarerootn"}) {
+//       LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+//                 << " average_strategy=" << strategy;
+//       config.layerConfig.set_average_strategy(strategy);
+//       testDegradeLayerGrad(config, layer_type);
+//     }
+//   } else {
+//     LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
+//     testDegradeLayerGrad(config, layer_type);
+//   }
+// }
+//
+// TEST(Layer, MaxLayer) {
+//   testDegradeLayer(false, "max", "non-seq");  // seq max to non-seq
+//   testDegradeLayer(true, "max", "non-seq");   // hasSubseq max to non-seq
+//   testDegradeLayer(true, "max", "seq");       // hasSubseq max to seq
+// }
+//
+// TEST(Layer, SequenceLastInstanceLayer) {
+//   testDegradeLayer(false,
+//                    "seqlastins",
+//                    "non-seq");  // seq seqlastins to non-seq
+//   testDegradeLayer(true,
+//                    "seqlastins",
+//                    "non-seq");  // hasSubseq seqlastins to non-seq
+//   testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to
+//                                                 // seq
+// }
+//
+// TEST(Layer, AverageLayer) {
+//   testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
+//   testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to
+//                                                  // non-seq
+//   testDegradeLayer(true, "average", "seq");      // hasSubseq average to seq
+// }
+//
+// TEST(Layer, SequenceConcatLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("seqconcat");
+//   config.layerConfig.set_size(10);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "seqconcat", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, SequenceReshapeLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("seqreshape");
+//   config.layerConfig.set_size(10);
+//
+//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "seqreshape", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, ConvShiftLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("conv_shift");
+//   config.layerConfig.set_size(10);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   // GPU is not supported yet
+//   testLayerGrad(config, "conv_shift", 100, false, false);
+// }
+//
+// TEST(Layer, PowerLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("power");
+//   config.layerConfig.set_size(10);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "power", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, ConvexCombinationLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("convex_comb");
+//   config.layerConfig.set_size(20);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "convex_comb", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, InterpolationLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("interpolation");
+//   config.layerConfig.set_size(10);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "interpolation", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, OuterProdLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("out_prod");
+//   config.layerConfig.set_size(100);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "out_prod", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, SlopeInterceptLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("slope_intercept");
+//   config.layerConfig.set_size(10);
+//   config.layerConfig.set_slope(1.0);
+//   config.layerConfig.set_intercept(0.1);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "slope_intercept", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, ScalingLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("scaling");
+//   config.layerConfig.set_size(10);
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.layerConfig.add_inputs();
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "scaling", 100, false, useGpu);
+//   }
+// }
+//
+// void testNormLayer(const string& normType, bool trans, bool useGpu) {
+//   TestConfig config;
+//   config.layerConfig.set_type("norm");
+//   config.layerConfig.set_active_type("relu");
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   NormConfig* norm = input->mutable_norm_conf();
+//   norm->set_norm_type(normType);
+//   norm->set_channels(16);
+//   norm->set_size(5);
+//   norm->set_scale(0.001);
+//   norm->set_pow(0.75);
+//   norm->set_blocked(0);
+//   norm->set_img_size(14);
+//   norm->set_img_size_y(7);
+//   norm->set_output_x(norm->img_size());
+//   norm->set_output_y(norm->img_size_y());
+//   if (norm->norm_type() == "cmrnorm" ||
+//       norm->norm_type() == "cmrnorm-projection") {
+//     norm->set_scale(norm->scale() / norm->size());
+//   } else {
+//     norm->set_scale(norm->scale() / (norm->size() * norm->size()));
+//   }
+//
+//   config.layerConfig.set_size(norm->output_x() * norm->output_y() *
+//                               norm->channels());
+//   config.biasSize = 0;
+//
+//   testLayerGrad(config, "norm", 100, trans, useGpu);
+// }
+//
+// TEST(Layer, NormLayer) {
+//   testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */
+//   true);
+//   testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */
+//   false);
+// }
+//
+// void setPoolConfig(TestConfig* config,
+//                    PoolConfig* pool,
+//                    const string& poolType) {
+//   (*config).biasSize = 0;
+//   (*config).layerConfig.set_type("pool");
+//   (*config).layerConfig.set_num_filters(16);
+//
+//   int kw = 3, kh = 3;
+//   int pw = 0, ph = 0;
+//   int sw = 2, sh = 2;
+//   pool->set_pool_type(poolType);
+//   pool->set_channels(16);
+//   pool->set_size_x(kw);
+//   pool->set_size_y(kh);
+//   pool->set_start(0);
+//   pool->set_padding(pw);
+//   pool->set_padding_y(ph);
+//   pool->set_stride(sw);
+//   pool->set_stride_y(sh);
+//
+//   int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+//   int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+//   pool->set_output_x(ow);
+//   pool->set_output_y(oh);
+// }
+//
+// void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+//   TestConfig config;
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   PoolConfig* pool = input->mutable_pool_conf();
+//
+//   pool->set_img_size(14);
+//   pool->set_img_size_y(14);
+//   setPoolConfig(&config, pool, poolType);
+//   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+//                               pool->channels());
+//
+//   testLayerGrad(config, "pool", 100, trans, useGpu);
+// }
+//
+// #ifndef PADDLE_ONLY_CPU
+// void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
+//   TestConfig config;
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   PoolConfig* pool = input->mutable_pool_conf();
+//
+//   pool->set_size_y(4);
+//   pool->set_stride_y(3);
+//   pool->set_img_size(10);
+//   pool->set_img_size_y(20);
+//   setPoolConfig(&config, pool, poolType);
+//   pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
+//                          ((float)pool->stride_y()) +
+//                      1.5);
+//   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+//                               pool->channels());
+//
+//   testLayerGrad(config, "pool", 100, trans, useGpu);
+// }
+// #endif
+//
+// TEST(Layer, PoolLayer) {
+//   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+//   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+//
+// #ifndef PADDLE_ONLY_CPU
+//   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+//   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
+//   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+//   testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+//   testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+//   testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+// #endif
+// }
+//
+// void testSppLayer(const string& poolType,
+//                   const int pyramidHeight,
+//                   bool trans,
+//                   bool useGpu) {
+//   TestConfig config;
+//   config.layerConfig.set_type("spp");
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   SppConfig* sppConfig = input->mutable_spp_conf();
+//   sppConfig->set_pool_type(poolType);
+//   sppConfig->set_pyramid_height(pyramidHeight);
+//   ImageConfig* imageConfig = sppConfig->mutable_image_conf();
+//   imageConfig->set_channels(16);
+//   imageConfig->set_img_size(10);
+//   imageConfig->set_img_size_y(20);
+//   int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
+//   config.layerConfig.set_size(outputSize * imageConfig->channels());
+//   testLayerGrad(config, "spp", 100, trans, useGpu);
+// }
+//
+// TEST(Layer, SpatialPyramidPoolLayer) {
+//   for (auto useGpu : {false, true}) {
+//     for (auto pyramidHeight : {1, 2, 3}) {
+//       testSppLayer("avg-projection", pyramidHeight, false, useGpu);
+//       testSppLayer("max-projection", pyramidHeight, false, useGpu);
+//     }
+//   }
+// }
+//
+// TEST(Layer, rankCostLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("rank-cost");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "rank-cost", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, sumCostLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("sum_cost");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "sum_cost", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, weightedRankCostLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("rank-cost");
+//   config.biasSize = 0;
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, TensorLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("tensor");
+//   config.layerConfig.set_size(10);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.biasSize = config.layerConfig.size();
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "tensor", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, RecurrentLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("recurrent");
+//   config.layerConfig.set_size(4);
+//   config.layerConfig.set_active_type("tanh");
+//   config.biasSize = 4;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     for (auto reversed : {false, true}) {
+//       config.layerConfig.set_reversed(reversed);
+//       config.testState = !reversed;
+//       testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+//     }
+//   }
+// }
+//
+// TEST(Layer, LstmLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("lstmemory");
+//   config.layerConfig.set_size(4);
+//   config.layerConfig.set_active_type("tanh");
+//   config.layerConfig.set_active_state_type("sigmoid");
+//   config.layerConfig.set_active_gate_type("sigmoid");
+//   config.biasSize = 28;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     for (auto reversed : {false, true}) {
+//       config.layerConfig.set_reversed(reversed);
+//       config.testState = !reversed;
+//       testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+//     }
+//   }
+//   for (auto useGpu : {true}) {
+//     config.testBatchState = true;
+//     config.layerConfig.set_reversed(false);
+//     testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, MDLstmLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("mdlstmemory");
+//   config.layerConfig.set_size(4);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_active_state_type("sigmoid");
+//   config.layerConfig.set_active_gate_type("sigmoid");
+//   config.biasSize = 4 * 9;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_directions(true);
+//   config.layerConfig.add_directions(true);
+//
+//   for (auto useGpu : {false, true}) {
+//     for (int i = 0; i < 2; i++) {
+//       for (int j = 0; j < 2; j++) {
+//         config.layerConfig.set_directions(0, bool(i));
+//         config.layerConfig.set_directions(1, bool(j));
+//         testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
+//       }
+//     }
+//   }
+// }
+//
+// TEST(Layer, ParameterReluLayer) {
+//   auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
+//     TestConfig config;
+//     config.layerConfig.set_type("prelu");
+//     config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
+//     config.layerConfig.add_inputs();
+//     config.layerConfig.set_size(inputSize);
+//     config.layerConfig.set_partial_sum(inputSize /
+//                                        channels);  // size of feature map
+//     for (auto useGpu : {false, true}) {
+//       testLayerGrad(config, "prelu", 100, false, useGpu);
+//     }
+//   };
+//
+//   testParameterReluLayer(192, 1);
+//   testParameterReluLayer(192, 3);
+//   testParameterReluLayer(192, 192);
+// }
+//
+// TEST(Layer, ResizeLayer) {
+//   TestConfig config;
+//   config.biasSize = 0;
+//   config.layerConfig.set_type("resize");
+//   config.layerConfig.set_size(64);
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "resize", 100, false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, NCELayer) {
+//   TestConfig config;
+//   size_t numClasses = 4;
+//   config.layerConfig.set_type("nce");
+//   config.layerConfig.set_size(1);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_num_classes(numClasses);
+//   config.biasSize = numClasses;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 *
+//       numClasses});
+//   config.inputDefs.push_back(
+//       {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto withWeight : {false, true}) {
+//     if (withWeight) {
+//       config.inputDefs.push_back(
+//           {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
+//       config.layerConfig.add_inputs();
+//     }
+//
+//     for (auto isIdLabel : {false, true}) {
+//       config.inputDefs[1] = {
+//           isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
+//           "label",
+//           /* dim= */ numClasses,
+//           /* paraSize= */ 0};
+//
+//       for (auto withDist : {false, true}) {
+//         config.layerConfig.clear_neg_sampling_dist();
+//         if (withDist) {
+//           double sum = 0;
+//           for (size_t i = 0; i < numClasses; ++i) {
+//             real p = rand();  // NOLINT use rand_r
+//             config.layerConfig.add_neg_sampling_dist(p);
+//             sum += p;
+//           }
+//           for (size_t i = 0; i < numClasses; ++i) {
+//             real p = config.layerConfig.neg_sampling_dist(i) / sum;
+//             config.layerConfig.set_neg_sampling_dist(i, p);
+//           }
+//         }
+//         LOG(INFO) << "NCELayer "
+//                   << " isIdLabel=" << isIdLabel << " withWeight=" <<
+//                   withWeight
+//                   << " withDist=" << withDist;
+//         // GPU is not supported yet
+//         testLayerGrad(config,
+//                       "nce",
+//                       100,
+//                       /* trans= */ false,
+//                       /* useGpu */ false);
+//       }
+//     }
+//   }
+// }
+//
+// TEST(Layer, GatedRecurrentLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("gated_recurrent");
+//   config.layerConfig.set_size(4);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_active_gate_type("sigmoid");
+//   config.biasSize = 12;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     for (auto reversed : {false, true}) {
+//       config.layerConfig.set_reversed(reversed);
+//       config.testState = !reversed;
+//       testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false,
+//       useGpu);
+//     }
+//   }
+// }
+//
+// TEST(Layer, GruStepLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("gru_step");
+//   config.layerConfig.set_size(4);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_active_gate_type("sigmoid");
+//   config.biasSize = 12;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
+//   }
+// }
+//
+// TEST(Layer, LstmStepLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("lstm_step");
+//   config.layerConfig.set_size(4);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.layerConfig.set_active_state_type("sigmoid");
+//   config.layerConfig.set_active_gate_type("sigmoid");
+//   config.biasSize = 12;
+//   config.testAccumulate = false;
+//
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
+//   }
+// }
+//
+// void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
+//   TestConfig config;
+//   const int CHANNELS = 10;
+//   const int IMG_SIZE = 16;
+//   const int IMG_SIZE_Y = 8;
+//   size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
+//   config.layerConfig.set_type(type);
+//   config.layerConfig.set_size(size);
+//   config.layerConfig.set_active_type("sigmoid");
+//   config.biasSize = CHANNELS;
+//   config.inputDefs.push_back({INPUT_DATA,
+//                               "layer_0",
+//                               /* dim= */ size,
+//                               /* paraSize= */ CHANNELS});
+//
+//   config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1,
+//   CHANNELS});
+//   config.inputDefs.back().isStatic = true;
+//   config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1,
+//   CHANNELS});
+//   config.inputDefs.back().isStatic = true;
+//
+//   LayerInputConfig* input = config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   ImageConfig* img_conf = input->mutable_image_conf();
+//   img_conf->set_channels(CHANNELS);
+//   img_conf->set_img_size(IMG_SIZE);
+//   img_conf->set_img_size_y(IMG_SIZE_Y);
+//
+//   testLayerGrad(config,
+//                 "batch_norm",
+//                 64,
+//                 /* trans= */ trans,
+//                 useGpu,
+//                 /* useWeight */ true);
+// }
+//
+// TEST(Layer, BatchNormalizationLayer) {
+//   testBatchNormLayer("batch_norm", false, false);
+// #ifndef PADDLE_ONLY_CPU
+//   testBatchNormLayer("batch_norm", false, true);
+//   if (hl_get_cudnn_lib_version() >= int(4000)) {
+//     testBatchNormLayer("cudnn_batch_norm", false, true);
+//   }
+// #endif
+// }
+//
+// TEST(Operator, conv) {
+//   TestConfig config;
+//   const int NUM_FILTERS = 16;
+//   const int FILTER_SIZE = 2;
+//   const int FILTER_SIZE_Y = 3;
+//   const int CHANNELS = 3;
+//   const int IMAGE_SIZE = 16;
+//   const int IMAGE_SIZE_Y = 8;
+//   OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+//   operatorConf.set_type("conv");
+//   ConvConfig* conv = operatorConf.mutable_conv_conf();
+//   operatorConf.set_num_filters(NUM_FILTERS);
+//   conv->set_filter_size(FILTER_SIZE);
+//   conv->set_filter_size_y(FILTER_SIZE_Y);
+//   conv->set_channels(CHANNELS);
+//   conv->set_padding(0);
+//   conv->set_padding_y(1);
+//   conv->set_stride(2);
+//   conv->set_stride_y(2);
+//   conv->set_groups(1);
+//   conv->set_filter_channels(conv->channels() / conv->groups());
+//   conv->set_img_size(IMAGE_SIZE);
+//   conv->set_img_size_y(IMAGE_SIZE_Y);
+//   conv->set_output_x(outputSize(conv->img_size(),
+//                                 conv->filter_size(),
+//                                 conv->padding(),
+//                                 conv->stride(),
+//                                 /*  caffeMode */ true));
+//   conv->set_output_y(outputSize(conv->img_size_y(),
+//                                 conv->filter_size_y(),
+//                                 conv->padding_y(),
+//                                 conv->stride_y(),
+//                                 /*  caffeMode */ true));
+//   config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+//                               NUM_FILTERS);
+//
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+//   config.inputDefs.push_back(
+//       {INPUT_DATA,
+//        "layer_1",
+//        FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
+//        0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
+// }
+//
+// TEST(Layer, FeatureMapExpandLayer) {
+//   TestConfig config;
+//   config.layerConfig.set_type("featmap_expand");
+//   const int CHANNELS = 10;
+//   const int INPUT_SIZE = 100;
+//   config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
+//   config.layerConfig.set_num_filters(CHANNELS);
+//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+//                               "layer_0",
+//                               /* dim= */ INPUT_SIZE,
+//                               /* paraSize= */ 0});
+//   config.layerConfig.add_inputs();
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config,
+//                   "featmap_expand",
+//                   /*batch_size*/ 100,
+//                   /* trans= */ false,
+//                   useGpu,
+//                   /* useWeight */ true);
+//   }
+// }
+//
+// TEST(Layer, MultiplexLayer) {
+//   TestConfig config;
+//   const int LAYER_SIZE = 100;
+//   config.layerConfig.set_type("multiplex");
+//   config.layerConfig.set_size(LAYER_SIZE);
+//
+//   config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
+//   config.inputDefs.push_back(
+//       {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//   config.layerConfig.add_inputs();
+//
+//   for (auto useGpu : {false, true}) {
+//     testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
+//   }
+// }
+//
+TEST(Layer, PadLayer) {
   TestConfig config;
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("dot_mul");
-  operatorConf.set_dotmul_scale(-1);
-
-  testOperatorGrad(config, operatorConf, 100, false, false);
-}
-
-TEST(Projection, context) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20, 50}) {
-        for (auto trainablePadding : {false, true}) {
-          LOG(INFO) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " batchSize=" << batchSize
-                    << " trainablePadding=" << trainablePadding;
-          ProjectionConfig conf;
-          conf.set_type("context");
-          conf.set_input_size(10);
-          conf.set_context_start(contextStart);
-          conf.set_context_length(contextLength);
-          conf.set_trainable_padding(trainablePadding);
-          conf.set_output_size(conf.context_length() * conf.input_size());
-          int pad =
-              std::max(0, -conf.context_start()) +
-              std::max(0, conf.context_start() + conf.context_length() - 1);
-          for (auto useGpu : {false, true}) {
-            testProjectionGrad(
-                conf,
-                INPUT_SEQUENCE_DATA,
-                trainablePadding ? conf.input_size() * pad : 0,
-                batchSize,
-                useGpu,
-                contextStart + contextLength <= 1);  // = testState
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Projection, trans_fc) {
-  ProjectionConfig conf;
-  conf.set_type("trans_fc");
-  conf.set_input_size(50);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1000,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, fc) {
-  ProjectionConfig conf;
-  conf.set_type("fc");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, dot_mul) {
-  ProjectionConfig conf;
-  conf.set_type("dot_mul");
-  conf.set_input_size(20);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 20,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, table) {
-  ProjectionConfig conf;
-  conf.set_type("table");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_LABEL,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, identity) {
-  ProjectionConfig conf;
-  conf.set_type("identity");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, scaling) {
-  ProjectionConfig conf;
-  conf.set_type("scaling");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-void testProjectionConv(size_t groups) {
-  const int NUM_FILTERS = 18;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-
-  ProjectionConfig conf;
-  conf.set_type("conv");
-  conf.set_num_filters(NUM_FILTERS);
-
-  ConvConfig* conv = conf.mutable_conv_conf();
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(groups);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(IMAGE_SIZE);
-  int output_x = outputSize(conv->img_size(),
-                            conv->filter_size(),
-                            conv->padding(),
-                            conv->stride(),
-                            /* caffeMode */ true);
-  int output_y = outputSize(conv->img_size(),
-                            conv->filter_size_y(),
-                            conv->padding_y(),
-                            conv->stride_y(),
-                            /* caffeMode */ true);
-  conv->set_output_x(output_x);
-  conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
-  conf.set_output_size(output_x * output_y * NUM_FILTERS);
-
-  testProjectionGrad(conf,
-                     INPUT_DATA,
-                     /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
-                         FILTER_SIZE_Y / groups,
-                     /* batchSize */ 100,
-                     true,
-                     false,
-                     NUM_FILTERS,
-                     true);
-}
-
-#ifndef PADDLE_ONLY_CPU
-TEST(Projection, conv) {
-  testProjectionConv(1);
-  testProjectionConv(3);
-}
-#endif
-
-TEST(Layer, BilinearInterpLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("bilinear_interp");
-  config.biasSize = 0;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-  ImageConfig* image = bilinear->mutable_image_conf();
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-
-  for (auto useGpu : {false, true}) {
-    for (auto outSize : {32, 64}) {
-      bilinear->set_out_size_x(outSize);
-      bilinear->set_out_size_y(outSize);
-      testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, concat) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("concat");
-  config.layerConfig.set_size(15);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "concat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, AddtoLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "addto", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
-TEST(Layer, CTCLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("ctc");
-  config.layerConfig.set_norm_by_times(false);
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "ctc", 100, /* trans */ false, /* useGpu */ useGpu);
-  }
-}
-
-TEST(Layer, cosSimLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos");
-  config.layerConfig.set_size(1);
   config.biasSize = 0;
+  config.layerConfig.set_type("pad");
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CosSimVecMatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos_vm");
-  config.layerConfig.set_size(5);  // output size
-  config.layerConfig.set_cos_scale(2.0);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos_vm", 100, false, useGpu);
-  }
-}
-
-void testConvLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
+  int c = 4;
+  int h = 31;
+  int w = 36;
+  size_t size = c * h * w;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
   LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(8);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "conv", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convLayer) {
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
-  testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testConvTransLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 3;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(3);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-
-  config.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "convTrans", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convTransLayer) {
-  for (auto useGpu : {false, true}) {
-    testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
-  }
-}
-
-TEST(Layer, blockExpandLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("blockexpand");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
-  blockExpand->set_img_size_x(64);
-  blockExpand->set_img_size_y(32);
-  blockExpand->set_channels(3);
-  blockExpand->set_padding_x(0);
-  blockExpand->set_padding_y(0);
-  blockExpand->set_block_x(4);
-  blockExpand->set_block_y(32);
-  blockExpand->set_stride_x(2);
-  blockExpand->set_stride_y(2);
-  blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
-                                       blockExpand->block_x(),
-                                       blockExpand->padding_x(),
-                                       blockExpand->stride_x(),
-                                       /* caffeMode */ false));
-  blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
-                                       blockExpand->block_y(),
-                                       blockExpand->padding_y(),
-                                       blockExpand->stride_y(),
-                                       /* caffeMode */ false));
-  config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() *
-                              blockExpand->channels());
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "blockexpand", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, maxoutLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("maxout");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MaxOutConfig* maxout = input->mutable_maxout_conf();
-  ImageConfig* image = maxout->mutable_image_conf();
-
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-  maxout->set_groups(2);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "maxout", 10, false, useGpu);
-  }
-}
-void testFcLayer(string format, size_t nnz) {
-  TestConfig config;
-  config.biasSize = 4096;
-  config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(4096);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_drop_rate(0.1);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
-  config.layerConfig.add_inputs();
-
-  LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-            << config.inputDefs[0].sparse.format;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "fc",
-                  100,
-                  /* trans */ false,
-                  useGpu,
-                  /* weight */ true);
-  }
-}
-
-TEST(Layer, fcLayer) {
-  testFcLayer("", 4096 * 4096 * 2);
-  testFcLayer("csc", 4096 * 40);
-  testFcLayer("csr", 4096 * 40);
-}
-
-TEST(Layer, SelectiveFullyConnectedLayer) {
-  TestConfig config;
-  size_t nin = 16;
-  size_t nout = 256;
-  config.layerConfig.set_type("selective_fc");
-  config.layerConfig.set_size(nout);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_has_selected_colums(true);
-  config.layerConfig.set_selective_fc_pass_generation(false);
-  config.biasSize = nout;
-
-  config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
-  config.layerConfig.add_inputs();
-
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ false,
-                false);
-#ifndef PADDLE_ONLY_CPU
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ true,
-                false);
-#endif
-}
-
-TEST(Layer, DataNormLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("data_norm");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
-  config.inputDefs.back().isStatic = true;
-  config.layerConfig.add_inputs();
-
-  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
-    config.layerConfig.set_data_norm_strategy(strategy);
-    // The parameters are static, so not support GPU now
-    testLayerGrad(config,
-                  "data_norm",
-                  200,
-                  /* trans */ false,
-                  /* useGpu */ false);
-  }
-}
-
-TEST(Layer, hsigmoidLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("hsigmoid");
-  config.layerConfig.set_num_classes(5);
-  config.layerConfig.set_size(1);
-  config.biasSize = config.layerConfig.num_classes() - 1;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config, "hsigmoid", 100, /* trans */ false, /* useGpu */ false);
-}
-
-TEST(Layer, multi_cross) {
-  TestConfig config;
-  config.layerConfig.set_type("multi-class-cross-entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(
-        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, multi_binary_label_sparse_mat) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(layer, multi_binary_label_id) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, multi_cross_with_selfnorm) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
-  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "multi_class_cross_entropy_with_selfnorm",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, multi_cross_soft) {
-  TestConfig config;
-  config.layerConfig.set_type("soft_binary_class_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "soft_binary_class_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, sparse_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, sparse_float_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, square_error_weighted) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, huber_two_class) {
-  TestConfig config;
-  config.layerConfig.set_type("huber");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
-  }
-}
-
-void testExpandLayer(string trans_type, bool hasSubseq) {
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  config.inputDefs.push_back(
-      {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_1",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "expand", 30, false, useGpu);
-  }
-}
-
-TEST(Layer, ExpandLayer) {
-  testExpandLayer("non-seq", false);  // non-seq expand to seq
-  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
-  testExpandLayer("seq", true);       // seq expand to hasSubseq
-}
-
-void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
-  TestConfig config;
-  config.layerConfig.set_type(layer_type);
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, layer_type, 100, false, useGpu);
-    }
-  };
-
-  if (layer_type == "average") {
-    for (auto strategy : {"average", "sum", "squarerootn"}) {
-      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy;
-      config.layerConfig.set_average_strategy(strategy);
-      testDegradeLayerGrad(config, layer_type);
-    }
-  } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
-    testDegradeLayerGrad(config, layer_type);
-  }
-}
-
-TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq");  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq");   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq");       // hasSubseq max to seq
-}
-
-TEST(Layer, SequenceLastInstanceLayer) {
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq");  // seq seqlastins to non-seq
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "non-seq");  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to seq
-}
-
-TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
-  testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq");      // hasSubseq average to seq
-}
-
-TEST(Layer, SequenceConcatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqconcat");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqconcat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SequenceReshapeLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqreshape");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqreshape", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvShiftLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("conv_shift");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config, "conv_shift", 100, false, false);
-}
-
-TEST(Layer, PowerLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("power");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "power", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvexCombinationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("convex_comb");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "convex_comb", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, InterpolationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("interpolation");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "interpolation", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, OuterProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("out_prod");
-  config.layerConfig.set_size(100);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "out_prod", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SlopeInterceptLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("slope_intercept");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_slope(1.0);
-  config.layerConfig.set_intercept(0.1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ScalingLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("scaling");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scaling", 100, false, useGpu);
-  }
-}
-
-void testNormLayer(const string& normType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_active_type("relu");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type(normType);
-  norm->set_channels(16);
-  norm->set_size(5);
-  norm->set_scale(0.001);
-  norm->set_pow(0.75);
-  norm->set_blocked(0);
-  norm->set_img_size(14);
-  norm->set_img_size_y(7);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  if (norm->norm_type() == "cmrnorm" ||
-      norm->norm_type() == "cmrnorm-projection") {
-    norm->set_scale(norm->scale() / norm->size());
-  } else {
-    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
-  }
-
-  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
-                              norm->channels());
-  config.biasSize = 0;
-
-  testLayerGrad(config, "norm", 100, trans, useGpu);
-}
-
-TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
-}
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(16);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(16);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(14);
-  pool->set_img_size_y(14);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-
-#ifndef PADDLE_ONLY_CPU
-void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_size_y(4);
-  pool->set_stride_y(3);
-  pool->set_img_size(10);
-  pool->set_img_size_y(20);
-  setPoolConfig(&config, pool, poolType);
-  pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
-                         ((float)pool->stride_y()) +
-                     1.5);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-#endif
-
-TEST(Layer, PoolLayer) {
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
-
-#ifndef PADDLE_ONLY_CPU
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testSppLayer(const string& poolType,
-                  const int pyramidHeight,
-                  bool trans,
-                  bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("spp");
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  SppConfig* sppConfig = input->mutable_spp_conf();
-  sppConfig->set_pool_type(poolType);
-  sppConfig->set_pyramid_height(pyramidHeight);
-  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
-  imageConfig->set_channels(16);
-  imageConfig->set_img_size(10);
-  imageConfig->set_img_size_y(20);
-  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * imageConfig->channels());
-  testLayerGrad(config, "spp", 100, trans, useGpu);
-}
-
-TEST(Layer, SpatialPyramidPoolLayer) {
-  for (auto useGpu : {false, true}) {
-    for (auto pyramidHeight : {1, 2, 3}) {
-      testSppLayer("avg-projection", pyramidHeight, false, useGpu);
-      testSppLayer("max-projection", pyramidHeight, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, rankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, sumCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("sum_cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "sum_cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, weightedRankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, TensorLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("tensor");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = config.layerConfig.size();
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "tensor", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.biasSize = 4;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 28;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
-    }
-  }
-  for (auto useGpu : {true}) {
-    config.testBatchState = true;
-    config.layerConfig.set_reversed(false);
-    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, MDLstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("mdlstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 4 * 9;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_directions(true);
-  config.layerConfig.add_directions(true);
-
-  for (auto useGpu : {false, true}) {
-    for (int i = 0; i < 2; i++) {
-      for (int j = 0; j < 2; j++) {
-        config.layerConfig.set_directions(0, bool(i));
-        config.layerConfig.set_directions(1, bool(j));
-        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
-      }
-    }
-  }
-}
-
-TEST(Layer, ParameterReluLayer) {
-  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
-    TestConfig config;
-    config.layerConfig.set_type("prelu");
-    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_size(inputSize);
-    config.layerConfig.set_partial_sum(inputSize /
-                                       channels);  // size of feature map
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, "prelu", 100, false, useGpu);
-    }
-  };
-
-  testParameterReluLayer(192, 1);
-  testParameterReluLayer(192, 3);
-  testParameterReluLayer(192, 192);
-}
-
-TEST(Layer, ResizeLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("resize");
-  config.layerConfig.set_size(64);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "resize", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, NCELayer) {
-  TestConfig config;
-  size_t numClasses = 4;
-  config.layerConfig.set_type("nce");
-  config.layerConfig.set_size(1);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_num_classes(numClasses);
-  config.biasSize = numClasses;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
-  config.inputDefs.push_back(
-      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto withWeight : {false, true}) {
-    if (withWeight) {
-      config.inputDefs.push_back(
-          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
-      config.layerConfig.add_inputs();
-    }
-
-    for (auto isIdLabel : {false, true}) {
-      config.inputDefs[1] = {
-          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
-          "label",
-          /* dim= */ numClasses,
-          /* paraSize= */ 0};
-
-      for (auto withDist : {false, true}) {
-        config.layerConfig.clear_neg_sampling_dist();
-        if (withDist) {
-          double sum = 0;
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = rand();  // NOLINT use rand_r
-            config.layerConfig.add_neg_sampling_dist(p);
-            sum += p;
-          }
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = config.layerConfig.neg_sampling_dist(i) / sum;
-            config.layerConfig.set_neg_sampling_dist(i, p);
-          }
-        }
-        LOG(INFO) << "NCELayer "
-                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
-                  << " withDist=" << withDist;
-        // Not support GPU now
-        testLayerGrad(config,
-                      "nce",
-                      100,
-                      /* trans= */ false,
-                      /* useGpu */ false);
-      }
-    }
-  }
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gated_recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, GruStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gru_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, LstmStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstm_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, BatchNormalizationLayer) {
-  testBatchNormLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
-  testBatchNormLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNormLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-TEST(Operator, conv) {
-  TestConfig config;
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 8;
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("conv");
-  ConvConfig* conv = operatorConf.mutable_conv_conf();
-  operatorConf.set_num_filters(NUM_FILTERS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              NUM_FILTERS);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_1",
-       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
-}
-
-TEST(Layer, FeatureMapExpandLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("featmap_expand");
-  const int CHANNELS = 10;
-  const int INPUT_SIZE = 100;
-  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
-  config.layerConfig.set_num_filters(CHANNELS);
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              /* dim= */ INPUT_SIZE,
-                              /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "featmap_expand",
-                  /*batch_size*/ 100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(Layer, MultiplexLayer) {
-  TestConfig config;
-  const int LAYER_SIZE = 100;
-  config.layerConfig.set_type("multiplex");
-  config.layerConfig.set_size(LAYER_SIZE);
-
-  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
+  PadConfig* pad = input->mutable_pad_conf();
+  ImageConfig* image = pad->mutable_image_conf();
+
+  image->set_channels(c);
+  image->set_img_size(h);
+  image->set_img_size_y(w);
+  pad->add_pad_c(1);
+  pad->add_pad_c(2);
+  pad->add_pad_h(2);
+  pad->add_pad_h(3);
+  pad->add_pad_w(3);
+  pad->add_pad_w(5);
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
+    testLayerGrad(config, "pad", 10, false, useGpu);
   }
 }
 
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 3a9d339976..0456404832 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -255,6 +255,13 @@ message PriorBoxConfig {
   repeated float variance = 4;
 }
 
+message PadConfig {
+  required ImageConfig image_conf = 1;
+  repeated uint32 pad_c = 2;
+  repeated uint32 pad_h = 3;
+  repeated uint32 pad_w = 4;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -271,6 +278,7 @@ message LayerInputConfig {
   optional MaxOutConfig maxout_conf = 11;
   optional SppConfig spp_conf = 12;
   optional PriorBoxConfig priorbox_conf = 13;
+  optional PadConfig pad_conf = 14;
 }
 
 message LayerConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 674b5ac58b..6e5922166d 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -493,6 +493,7 @@ class Input(Cfg):
             block_expand=None,
             maxout=None,
             spp=None,
+            pad=None,
             format=None,
             nnz=None,
             is_static=None,
@@ -844,6 +845,12 @@ class SpatialPyramidPool(Cfg):
         self.add_keys(locals())
 
 
+@config_class
+class Pad(Cfg):
+    def __init__(self, channels, pad_c, pad_h, pad_w):
+        self.add_keys(locals())
+
+
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -1842,6 +1849,25 @@ class SpatialPyramidPoolLayer(LayerBase):
             self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
 
 
+@config_layer('pad')
+class PadLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(PadLayer, self).__init__(name, 'pad', 0, inputs=inputs, **xargs)
+        pad = self.inputs[0].pad
+        self.config.inputs[0].pad_conf.pad_c.extend(pad.pad_c)
+        self.config.inputs[0].pad_conf.pad_h.extend(pad.pad_h)
+        self.config.inputs[0].pad_conf.pad_w.extend(pad.pad_w)
+
+        input_layer = self.get_input_layer(0)
+        image_conf = self.config.inputs[0].pad_conf.image_conf
+        parse_image(pad, input_layer.name, image_conf)
+        out_ch = pad.channels + pad.pad_c[0] + pad.pad_c[1]
+        out_h = image_conf.img_size_y + pad.pad_h[0] + pad.pad_h[1]
+        out_w = image_conf.img_size + pad.pad_w[0] + pad.pad_w[1]
+        self.set_cnn_layer(name, out_h, out_w, out_ch)
+        self.config.size = out_ch * out_h * out_w
+
+
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
     layer_type = 'batch_norm'
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 9b6e5774bc..56c335a050 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -170,6 +170,7 @@ class LayerType(object):
     BLOCK_EXPAND = "blockexpand"
     MAXOUT = "maxout"
     SPP_LAYER = "spp"
+    PAD_LAYER = "pad"
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -3488,9 +3489,6 @@ def conv_projection(input,
                     groups=1,
                     param_attr=None):
     """
-    ConvProjection with a layer as input.
-    It performs element-wise multiplication with weight.
-
     Different from img_conv_layer and conv_op, conv_projection is an Projection,
     which can be used in mixed_layer and conat_layer. It use cudnn to implement
     conv and only support GPU mode.
@@ -3499,7 +3497,7 @@ def conv_projection(input,
 
     .. code-block:: python
 
-       proj = conv_projection(img=input1,
+       proj = conv_projection(input=input1,
                               filter_size=3,
                               num_filters=64,
                               num_channels=64)
@@ -3582,6 +3580,84 @@ def conv_projection(input,
     return proj
 
 
+@wrap_name_default("pad")
+@layer_support()
+def pad_layer(input,
+              pad_c=None,
+              pad_h=None,
+              pad_w=None,
+              name=None,
+              layer_attr=None):
+    """
+    This operation pads zeros to the input data according to pad_c,pad_h
+    and pad_w. pad_c, pad_h, pad_w specifies the which dimension and size
+    of padding. And the input data shape is NCHW.
+
+    For example, pad_c=[2,3] means padding 2 zeros before the
+    input data and 3 zeros after the input data in channel dimension.
+    pad_h means padding zeros in height dimension. pad_w means padding zeros
+    in width dimension.
+
+    .. code-block:: python
+
+       pad = pad_layer(input=ipt,
+                       pad_c=[4,4],
+                       pad_h=[0,0],
+                       pad_w=[2,2])
+
+    :param input: layer's input.
+    :type input: LayerOutput
+    :param pad_c: padding size in channel dimension.
+    :type pad_c: list|None
+    :param pad_h: padding size in height dimension.
+    :type pad_h: list|None
+    :param pad_w: padding size in width dimension.
+    :type pad_w: list|None
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param name: layer name.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if pad_c is not None:
+        assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
+    else:
+        pad_c = [0, 0]
+
+    if pad_h is not None:
+        assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
+    else:
+        pad_h = [0, 0]
+
+    if pad_w is not None:
+        assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
+    else:
+        pad_w = [0, 0]
+
+    assert input.num_filters is not None
+    in_ch = input.num_filters
+    out_ch = in_ch + pad_c[0] + pad_c[1]
+
+    l = Layer(
+        name=name,
+        type=LayerType.PAD_LAYER,
+        inputs=Input(
+            input.name,
+            pad=Pad(
+                channels=in_ch,
+                pad_c=pad_c,
+                pad_h=pad_h,
+                pad_w=pad_w, )),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        layer_type=LayerType.PAD_LAYER,
+        parents=[input],
+        num_filters=out_ch,
+        size=l.config.size)
+
+
 @wrap_name_default()
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
new file mode 100644
index 0000000000..bb5f13410d
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -0,0 +1,21 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2304, height=48, width=42)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=1,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+pool = img_pool_layer(
+    input=conv, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
+
+pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+
+outputs(pad)

From 6b61a096e13d04e1927c0760e96f2474df0085c7 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Sat, 7 Jan 2017 16:01:44 +0800
Subject: [PATCH 06/88] Optional padding mode, namely ceil or floor, ceil by
 default.

---
 paddle/function/PadOp.cpp                     |   16 +-
 paddle/function/PadOpTest.cpp                 |   48 +-
 paddle/gserver/tests/test_LayerGrad.cpp       | 3093 ++++++++---------
 python/paddle/trainer/config_parser.py        |   36 +-
 .../paddle/trainer_config_helpers/layers.py   |   21 +-
 5 files changed, 1614 insertions(+), 1600 deletions(-)

diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index e10011da2a..2dfe03dcf6 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -73,10 +73,6 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
   }
 }
 
-/**
- * \param inputs[0] input value.
- * \param outputs[0] output value.
- */
 template <DeviceType Device>
 class PadFunc : public FunctionBase {
 public:
@@ -89,6 +85,10 @@ public:
     padw1_ = config.get<int>("padw1");
   }
 
+  /**
+   * \param inputs[0] input value.
+   * \param outputs[0] output value.
+   */
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
@@ -124,10 +124,6 @@ private:
   int padw1_;
 };
 
-/**
- * \param inputs[0] input grad.
- * \param outputs[0] output grad.
- */
 template <DeviceType Device>
 class PadGradFunc : public FunctionBase {
 public:
@@ -140,6 +136,10 @@ public:
     padw1_ = config.get<int>("padw1");
   }
 
+  /**
+   * \param inputs[0] output grad.
+   * \param inouts[0] input grad.
+   */
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index ee2834d793..c6d573c1d9 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -43,28 +43,30 @@ TEST(Pad, real) {
   }
 }
 
-// TEST(PadGrad, real) {
-//  for (size_t numSamples : {5, 32}) {
-//    for (size_t channels : {1, 5, 32}) {
-//      for (size_t imgSizeH : {5, 33, 100}) {
-//        for (size_t imgSizeW : {5, 32, 96}) {
-//          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-//                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-//
-//          FunctionCompare compare("PadGrad",
-//                                  FuncConfig()
-//                                     .set("padc0", 2).set("padc1", 3)
-//                                     .set("padh0", 1).set("padh1", 2)
-//                                     .set("padw0", 3).set("padw1", 2));
-//          Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
-//          Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-//          compare.cmpWithArg({Tensor(nullptr, inDims)},
-//                             {Tensor(nullptr, outDims)},
-//                             {});
-//        }
-//      }
-//    }
-//  }
-//}
+TEST(PadGrad, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+
+          FunctionCompare compare("PadGrad",
+                                  FuncConfig()
+                                      .set("padc0", 2)
+                                      .set("padc1", 3)
+                                      .set("padh0", 1)
+                                      .set("padh1", 2)
+                                      .set("padw0", 3)
+                                      .set("padw1", 2));
+          Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
+          Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.cmpWithArg(
+              {Tensor(nullptr, inDims)}, {}, {Tensor(nullptr, outDims)});
+        }
+      }
+    }
+  }
+}
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 2be52c17ad..0560bb3ed9 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -32,1554 +32,1551 @@ DECLARE_double(checkgrad_eps);
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_bool(prev_batch_state);
 
-// TEST(Operator, dot_mul) {
-//   TestConfig config;
-//   config.layerConfig.set_size(10);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-//   operatorConf.set_type("dot_mul");
-//   operatorConf.set_dotmul_scale(-1);
-//
-//   testOperatorGrad(config, operatorConf, 100, false, false);
-// }
-//
-// TEST(Projection, context) {
-//   for (auto contextStart : {-5, -3, -1, 0, 3}) {
-//     for (auto contextLength : {1, 2, 5, 7}) {
-//       for (auto batchSize : {1, 2, 5, 20, 50}) {
-//         for (auto trainablePadding : {false, true}) {
-//           LOG(INFO) << " contextStart=" << contextStart
-//                     << " contextLength=" << contextLength
-//                     << " batchSize=" << batchSize
-//                     << " trainablePadding=" << trainablePadding;
-//           ProjectionConfig conf;
-//           conf.set_type("context");
-//           conf.set_input_size(10);
-//           conf.set_context_start(contextStart);
-//           conf.set_context_length(contextLength);
-//           conf.set_trainable_padding(trainablePadding);
-//           conf.set_output_size(conf.context_length() * conf.input_size());
-//           int pad =
-//               std::max(0, -conf.context_start()) +
-//               std::max(0, conf.context_start() + conf.context_length() - 1);
-//           for (auto useGpu : {false, true}) {
-//             testProjectionGrad(
-//                 conf,
-//                 INPUT_SEQUENCE_DATA,
-//                 trainablePadding ? conf.input_size() * pad : 0,
-//                 batchSize,
-//                 useGpu,
-//                 contextStart + contextLength <= 1);  // = testState
-//           }
-//         }
-//       }
-//     }
-//   }
-// }
-//
-// TEST(Projection, trans_fc) {
-//   ProjectionConfig conf;
-//   conf.set_type("trans_fc");
-//   conf.set_input_size(50);
-//   conf.set_output_size(20);
-//   for (auto useGpu : {false, true}) {
-//     testProjectionGrad(conf,
-//                        INPUT_DATA,
-//                        /* parameterSize */ 1000,
-//                        /* batchSize */ 100,
-//                        useGpu);
-//   }
-// }
-//
-// TEST(Projection, fc) {
-//   ProjectionConfig conf;
-//   conf.set_type("fc");
-//   conf.set_input_size(10);
-//   conf.set_output_size(20);
-//   for (auto useGpu : {false, true}) {
-//     testProjectionGrad(conf,
-//                        INPUT_DATA,
-//                        /* parameterSize */ 200,
-//                        /* batchSize */ 100,
-//                        useGpu);
-//   }
-// }
-//
-// TEST(Projection, dot_mul) {
-//   ProjectionConfig conf;
-//   conf.set_type("dot_mul");
-//   conf.set_input_size(20);
-//   conf.set_output_size(20);
-//   for (auto useGpu : {false, true}) {
-//     testProjectionGrad(conf,
-//                        INPUT_DATA,
-//                        /* parameterSize */ 20,
-//                        /* batchSize */ 100,
-//                        useGpu);
-//   }
-// }
-//
-// TEST(Projection, table) {
-//   ProjectionConfig conf;
-//   conf.set_type("table");
-//   conf.set_input_size(10);
-//   conf.set_output_size(20);
-//   for (auto useGpu : {false, true}) {
-//     testProjectionGrad(conf,
-//                        INPUT_LABEL,
-//                        /* parameterSize */ 200,
-//                        /* batchSize */ 100,
-//                        useGpu);
-//   }
-// }
-//
-// TEST(Projection, identity) {
-//   ProjectionConfig conf;
-//   conf.set_type("identity");
-//   conf.set_input_size(10);
-//   conf.set_output_size(10);
-//   for (auto useGpu : {false, true}) {
-//     testProjectionGrad(conf,
-//                        INPUT_DATA,
-//                        /* parameterSize */ 0,
-//                        /* batchSize */ 100,
-//                        useGpu);
-//   }
-// }
-//
-// TEST(Projection, scaling) {
-//   ProjectionConfig conf;
-//   conf.set_type("scaling");
-//   conf.set_input_size(10);
-//   conf.set_output_size(10);
-//   for (auto useGpu : {false}) {
-//     testProjectionGrad(conf,
-//                        INPUT_DATA,
-//                        /* parameterSize */ 1,
-//                        /* batchSize */ 100,
-//                        useGpu);
-//   }
-// }
-//
-// void testProjectionConv(size_t groups) {
-//   const int NUM_FILTERS = 18;
-//   const int FILTER_SIZE = 2;
-//   const int FILTER_SIZE_Y = 3;
-//   const int CHANNELS = 3;
-//   const int IMAGE_SIZE = 16;
-//
-//   ProjectionConfig conf;
-//   conf.set_type("conv");
-//   conf.set_num_filters(NUM_FILTERS);
-//
-//   ConvConfig* conv = conf.mutable_conv_conf();
-//   conv->set_filter_size(FILTER_SIZE);
-//   conv->set_filter_size_y(FILTER_SIZE_Y);
-//   conv->set_channels(CHANNELS);
-//   conv->set_padding(0);
-//   conv->set_padding_y(1);
-//   conv->set_stride(2);
-//   conv->set_stride_y(2);
-//   conv->set_groups(groups);
-//   conv->set_filter_channels(conv->channels() / conv->groups());
-//   conv->set_img_size(IMAGE_SIZE);
-//   int output_x = outputSize(conv->img_size(),
-//                             conv->filter_size(),
-//                             conv->padding(),
-//                             conv->stride(),
-//                             /* caffeMode */ true);
-//   int output_y = outputSize(conv->img_size(),
-//                             conv->filter_size_y(),
-//                             conv->padding_y(),
-//                             conv->stride_y(),
-//                             /* caffeMode */ true);
-//   conv->set_output_x(output_x);
-//   conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
-//   conf.set_output_size(output_x * output_y * NUM_FILTERS);
-//
-//   testProjectionGrad(conf,
-//                      INPUT_DATA,
-//                      /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE
-//                      *
-//                          FILTER_SIZE_Y / groups,
-//                      /* batchSize */ 100,
-//                      true,
-//                      false,
-//                      NUM_FILTERS,
-//                      true);
-// }
-//
-// #ifndef PADDLE_ONLY_CPU
-// TEST(Projection, conv) {
-//   testProjectionConv(1);
-//   testProjectionConv(3);
-// }
-// #endif
-//
-// TEST(Layer, BilinearInterpLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("bilinear_interp");
-//   config.biasSize = 0;
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-//
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-//   ImageConfig* image = bilinear->mutable_image_conf();
-//   image->set_img_size(32);
-//   image->set_img_size_y(32);
-//   image->set_channels(4);
-//
-//   for (auto useGpu : {false, true}) {
-//     for (auto outSize : {32, 64}) {
-//       bilinear->set_out_size_x(outSize);
-//       bilinear->set_out_size_y(outSize);
-//       testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
-//     }
-//   }
-// }
-//
-// TEST(Layer, concat) {
-//   TestConfig config;
-//   config.biasSize = 0;
-//   config.layerConfig.set_type("concat");
-//   config.layerConfig.set_size(15);
-//   config.layerConfig.set_active_type("sigmoid");
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "concat", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, AddtoLayer) {
-//   TestConfig config;
-//   config.biasSize = 0;
-//   config.layerConfig.set_type("addto");
-//   config.layerConfig.set_size(10);
-//   config.layerConfig.set_active_type("sigmoid");
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "addto", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, CRFLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("crf");
-//   config.layerConfig.set_size(10);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-//   config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   // Not support GPU now
-//   testLayerGrad(config,
-//                 "crf",
-//                 100,
-//                 /* trans */ false,
-//                 /* useGpu */ false,
-//                 false /*useWeight*/,
-//                 0.03 /*epsilon*/);
-// }
-//
-// TEST(Layer, CTCLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("ctc");
-//   config.layerConfig.set_norm_by_times(false);
-//   config.layerConfig.set_size(10);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-//   config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "ctc", 100, /* trans */ false, /* useGpu */
-//     useGpu);
-//   }
-// }
-//
-// TEST(Layer, cosSimLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("cos");
-//   config.layerConfig.set_size(1);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "cos", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, CosSimVecMatLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("cos_vm");
-//   config.layerConfig.set_size(5);  // output size
-//   config.layerConfig.set_cos_scale(2.0);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "cos_vm", 100, false, useGpu);
-//   }
-// }
-//
-// void testConvLayer(const string& type, bool trans, bool useGpu) {
-//   TestConfig config;
-//   config.biasSize = 16;
-//   config.layerConfig.set_type(type);
-//   config.layerConfig.set_num_filters(16);
-//   config.layerConfig.set_partial_sum(1);
-//   config.layerConfig.set_shared_biases(true);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   ConvConfig* conv = input->mutable_conv_conf();
-//   conv->set_filter_size(2);
-//   conv->set_filter_size_y(3);
-//   conv->set_channels(3);
-//   conv->set_padding(0);
-//   conv->set_padding_y(1);
-//   conv->set_stride(2);
-//   conv->set_stride_y(2);
-//   conv->set_groups(1);
-//   conv->set_filter_channels(conv->channels() / conv->groups());
-//   conv->set_img_size(16);
-//   conv->set_img_size_y(8);
-//   conv->set_output_x(outputSize(conv->img_size(),
-//                                 conv->filter_size(),
-//                                 conv->padding(),
-//                                 conv->stride(),
-//                                 /* caffeMode */ true));
-//   conv->set_output_y(outputSize(conv->img_size_y(),
-//                                 conv->filter_size_y(),
-//                                 conv->padding_y(),
-//                                 conv->stride_y(),
-//                                 /* caffeMode */ true));
-//   config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-//                               config.layerConfig.num_filters());
-//
-//   testLayerGrad(config, "conv", 100, trans, useGpu);
-//   // Use small batch_size and useWeight=true to test biasGrad
-//   testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
-// }
-//
-// TEST(Layer, convLayer) {
-//   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-// #ifndef PADDLE_ONLY_CPU
-//   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
-//   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
-// #endif
-// }
-//
-// void testConvTransLayer(const string& type, bool trans, bool useGpu) {
-//   TestConfig config;
-//   config.biasSize = 3;
-//   config.layerConfig.set_type(type);
-//   config.layerConfig.set_num_filters(3);
-//   config.layerConfig.set_partial_sum(1);
-//   config.layerConfig.set_shared_biases(true);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   ConvConfig* conv = input->mutable_conv_conf();
-//   conv->set_filter_size(2);
-//   conv->set_filter_size_y(3);
-//   conv->set_channels(16);
-//   conv->set_padding(0);
-//   conv->set_padding_y(1);
-//   conv->set_stride(2);
-//   conv->set_stride_y(2);
-//   conv->set_groups(1);
-//   conv->set_filter_channels(3 / conv->groups());
-//   conv->set_img_size(16);
-//   conv->set_output_x(outputSize(conv->img_size(),
-//                                 conv->filter_size(),
-//                                 conv->padding(),
-//                                 conv->stride(),
-//                                 /* caffeMode */ true));
-//
-//   config.layerConfig.set_size(conv->img_size() * conv->img_size() *
-//                               config.layerConfig.num_filters());
-//
-//   testLayerGrad(config, "convTrans", 100, trans, useGpu);
-//   // Use small batch_size and useWeight=true to test biasGrad
-//   testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
-// }
-//
-// TEST(Layer, convTransLayer) {
-//   for (auto useGpu : {false, true}) {
-//     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
-//   }
-// }
-//
-// TEST(Layer, blockExpandLayer) {
-//   TestConfig config;
-//   config.biasSize = 0;
-//   config.layerConfig.set_type("blockexpand");
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
-//   blockExpand->set_img_size_x(64);
-//   blockExpand->set_img_size_y(32);
-//   blockExpand->set_channels(3);
-//   blockExpand->set_padding_x(0);
-//   blockExpand->set_padding_y(0);
-//   blockExpand->set_block_x(4);
-//   blockExpand->set_block_y(32);
-//   blockExpand->set_stride_x(2);
-//   blockExpand->set_stride_y(2);
-//   blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
-//                                        blockExpand->block_x(),
-//                                        blockExpand->padding_x(),
-//                                        blockExpand->stride_x(),
-//                                        /* caffeMode */ false));
-//   blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
-//                                        blockExpand->block_y(),
-//                                        blockExpand->padding_y(),
-//                                        blockExpand->stride_y(),
-//                                        /* caffeMode */ false));
-//   config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y()
-//   *
-//                               blockExpand->channels());
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "blockexpand", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, maxoutLayer) {
-//   TestConfig config;
-//   config.biasSize = 0;
-//   config.layerConfig.set_type("maxout");
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   MaxOutConfig* maxout = input->mutable_maxout_conf();
-//   ImageConfig* image = maxout->mutable_image_conf();
-//
-//   image->set_img_size(32);
-//   image->set_img_size_y(32);
-//   image->set_channels(4);
-//   maxout->set_groups(2);
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "maxout", 10, false, useGpu);
-//   }
-// }
-// void testFcLayer(string format, size_t nnz) {
-//   TestConfig config;
-//   config.biasSize = 4096;
-//   config.layerConfig.set_type("fc");
-//   config.layerConfig.set_size(4096);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_drop_rate(0.1);
-//
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
-//   config.layerConfig.add_inputs();
-//
-//   LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-//             << config.inputDefs[0].sparse.format;
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config,
-//                   "fc",
-//                   100,
-//                   /* trans */ false,
-//                   useGpu,
-//                   /* weight */ true);
-//   }
-// }
-//
-// TEST(Layer, fcLayer) {
-//   testFcLayer("", 4096 * 4096 * 2);
-//   testFcLayer("csc", 4096 * 40);
-//   testFcLayer("csr", 4096 * 40);
-// }
-//
-// TEST(Layer, SelectiveFullyConnectedLayer) {
-//   TestConfig config;
-//   size_t nin = 16;
-//   size_t nout = 256;
-//   config.layerConfig.set_type("selective_fc");
-//   config.layerConfig.set_size(nout);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_has_selected_colums(true);
-//   config.layerConfig.set_selective_fc_pass_generation(false);
-//   config.biasSize = nout;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back(
-//       {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr",
-//       true)});
-//   config.layerConfig.add_inputs();
-//
-//   testLayerGrad(config,
-//                 "selective_fc",
-//                 100,
-//                 /* trans= */ false,
-//                 /* useGup= */ false,
-//                 false);
-// #ifndef PADDLE_ONLY_CPU
-//   testLayerGrad(config,
-//                 "selective_fc",
-//                 100,
-//                 /* trans= */ false,
-//                 /* useGup= */ true,
-//                 false);
-// #endif
-// }
-//
-// TEST(Layer, DataNormLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("data_norm");
-//   config.layerConfig.set_size(20);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
-//   config.inputDefs.back().isStatic = true;
-//   config.layerConfig.add_inputs();
-//
-//   for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
-//     config.layerConfig.set_data_norm_strategy(strategy);
-//     // The parameters are static, so not support GPU now
-//     testLayerGrad(config,
-//                   "data_norm",
-//                   200,
-//                   /* trans */ false,
-//                   /* useGpu */ false);
-//   }
-// }
-//
-// TEST(Layer, hsigmoidLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("hsigmoid");
-//   config.layerConfig.set_num_classes(5);
-//   config.layerConfig.set_size(1);
-//   config.biasSize = config.layerConfig.num_classes() - 1;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
-//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   // Not support GPU now
-//   testLayerGrad(config, "hsigmoid", 100, /* trans */ false, /* useGpu */
-//   false);
-// }
-//
-// TEST(Layer, multi_cross) {
-//   TestConfig config;
-//   config.layerConfig.set_type("multi-class-cross-entropy");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(
-//         config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, multi_binary_label_sparse_mat) {
-//   TestConfig config;
-//   config.layerConfig.set_type("multi_binary_label_cross_entropy");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50,
-//   0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config,
-//                   "multi_binary_label_cross_entropy",
-//                   100,
-//                   /* trans */ false,
-//                   useGpu);
-//   }
-// }
-//
-// TEST(layer, multi_binary_label_id) {
-//   TestConfig config;
-//   config.layerConfig.set_type("multi_binary_label_cross_entropy");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config,
-//                   "multi_binary_label_cross_entropy",
-//                   100,
-//                   /* trans */ false,
-//                   useGpu);
-//   }
-// }
-//
-// TEST(Layer, multi_cross_with_selfnorm) {
-//   TestConfig config;
-//   config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
-//   config.layerConfig.set_softmax_selfnorm_alpha(0.1);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   // Not support GPU now
-//   testLayerGrad(config,
-//                 "multi_class_cross_entropy_with_selfnorm",
-//                 100,
-//                 /* trans */ false,
-//                 /* useGpu */ false);
-// }
-//
-// TEST(Layer, multi_cross_soft) {
-//   TestConfig config;
-//   config.layerConfig.set_type("soft_binary_class_cross_entropy");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config,
-//                   "soft_binary_class_cross_entropy",
-//                   100,
-//                   /* trans */ false,
-//                   useGpu);
-//   }
-// }
-//
-// TEST(Layer, square_error) {
-//   TestConfig config;
-//   config.layerConfig.set_type("square_error");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, sparse_square_error) {
-//   TestConfig config;
-//   config.layerConfig.set_type("square_error");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50,
-//   0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   // "GpuSparseMatrix" as label is not supported
-//   testLayerGrad(config,
-//                 "square_error",
-//                 100,
-//                 /* trans */ false,
-//                 /* useGpu */ false);
-// }
-//
-// TEST(Layer, sparse_float_square_error) {
-//   TestConfig config;
-//   config.layerConfig.set_type("square_error");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-//   config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50,
-//   0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   // "GpuSparseMatrix" as label is not supported
-//   testLayerGrad(config,
-//                 "square_error",
-//                 100,
-//                 /* trans */ false,
-//                 /* useGpu */ false);
-// }
-//
-// TEST(Layer, square_error_weighted) {
-//   TestConfig config;
-//   config.layerConfig.set_type("square_error");
-//   config.biasSize = 0;
-//   config.testAccumulate = false;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, huber_two_class) {
-//   TestConfig config;
-//   config.layerConfig.set_type("huber");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
-//   }
-// }
-//
-// void testExpandLayer(string trans_type, bool hasSubseq) {
-//   TestConfig config;
-//   config.layerConfig.set_type("expand");
-//
-//   config.inputDefs.push_back(
-//       {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
-//        "layer_0",
-//        10,
-//        0});
-//   config.inputDefs.push_back(
-//       {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-//        "layer_1",
-//        10,
-//        0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.set_trans_type(trans_type);
-//   LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "expand", 30, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, ExpandLayer) {
-//   testExpandLayer("non-seq", false);  // non-seq expand to seq
-//   testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
-//   testExpandLayer("seq", true);       // seq expand to hasSubseq
-// }
-//
-// void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
-//   TestConfig config;
-//   config.layerConfig.set_type(layer_type);
-//   config.layerConfig.set_size(10);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back(
-//       {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-//        "layer_0",
-//        10,
-//        0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.set_trans_type(trans_type);
-//
-//   auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
-//     for (auto useGpu : {false, true}) {
-//       testLayerGrad(config, layer_type, 100, false, useGpu);
-//     }
-//   };
-//
-//   if (layer_type == "average") {
-//     for (auto strategy : {"average", "sum", "squarerootn"}) {
-//       LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-//                 << " average_strategy=" << strategy;
-//       config.layerConfig.set_average_strategy(strategy);
-//       testDegradeLayerGrad(config, layer_type);
-//     }
-//   } else {
-//     LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
-//     testDegradeLayerGrad(config, layer_type);
-//   }
-// }
-//
-// TEST(Layer, MaxLayer) {
-//   testDegradeLayer(false, "max", "non-seq");  // seq max to non-seq
-//   testDegradeLayer(true, "max", "non-seq");   // hasSubseq max to non-seq
-//   testDegradeLayer(true, "max", "seq");       // hasSubseq max to seq
-// }
-//
-// TEST(Layer, SequenceLastInstanceLayer) {
-//   testDegradeLayer(false,
-//                    "seqlastins",
-//                    "non-seq");  // seq seqlastins to non-seq
-//   testDegradeLayer(true,
-//                    "seqlastins",
-//                    "non-seq");  // hasSubseq seqlastins to non-seq
-//   testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to
-//   seq
-// }
-//
-// TEST(Layer, AverageLayer) {
-//   testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
-//   testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to
-//   non-seq
-//   testDegradeLayer(true, "average", "seq");      // hasSubseq average to seq
-// }
-//
-// TEST(Layer, SequenceConcatLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("seqconcat");
-//   config.layerConfig.set_size(10);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "seqconcat", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, SequenceReshapeLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("seqreshape");
-//   config.layerConfig.set_size(10);
-//
-//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "seqreshape", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, ConvShiftLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("conv_shift");
-//   config.layerConfig.set_size(10);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   // Not support GPU now
-//   testLayerGrad(config, "conv_shift", 100, false, false);
-// }
-//
-// TEST(Layer, PowerLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("power");
-//   config.layerConfig.set_size(10);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "power", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, ConvexCombinationLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("convex_comb");
-//   config.layerConfig.set_size(20);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "convex_comb", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, InterpolationLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("interpolation");
-//   config.layerConfig.set_size(10);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "interpolation", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, OuterProdLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("out_prod");
-//   config.layerConfig.set_size(100);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "out_prod", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, SlopeInterceptLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("slope_intercept");
-//   config.layerConfig.set_size(10);
-//   config.layerConfig.set_slope(1.0);
-//   config.layerConfig.set_intercept(0.1);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "slope_intercept", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, ScalingLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("scaling");
-//   config.layerConfig.set_size(10);
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.layerConfig.add_inputs();
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "scaling", 100, false, useGpu);
-//   }
-// }
-//
-// void testNormLayer(const string& normType, bool trans, bool useGpu) {
-//   TestConfig config;
-//   config.layerConfig.set_type("norm");
-//   config.layerConfig.set_active_type("relu");
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   NormConfig* norm = input->mutable_norm_conf();
-//   norm->set_norm_type(normType);
-//   norm->set_channels(16);
-//   norm->set_size(5);
-//   norm->set_scale(0.001);
-//   norm->set_pow(0.75);
-//   norm->set_blocked(0);
-//   norm->set_img_size(14);
-//   norm->set_img_size_y(7);
-//   norm->set_output_x(norm->img_size());
-//   norm->set_output_y(norm->img_size_y());
-//   if (norm->norm_type() == "cmrnorm" ||
-//       norm->norm_type() == "cmrnorm-projection") {
-//     norm->set_scale(norm->scale() / norm->size());
-//   } else {
-//     norm->set_scale(norm->scale() / (norm->size() * norm->size()));
-//   }
-//
-//   config.layerConfig.set_size(norm->output_x() * norm->output_y() *
-//                               norm->channels());
-//   config.biasSize = 0;
-//
-//   testLayerGrad(config, "norm", 100, trans, useGpu);
-// }
-//
-// TEST(Layer, NormLayer) {
-//   testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */
-//   true);
-//   testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */
-//   false);
-// }
-//
-// void setPoolConfig(TestConfig* config,
-//                    PoolConfig* pool,
-//                    const string& poolType) {
-//   (*config).biasSize = 0;
-//   (*config).layerConfig.set_type("pool");
-//   (*config).layerConfig.set_num_filters(16);
-//
-//   int kw = 3, kh = 3;
-//   int pw = 0, ph = 0;
-//   int sw = 2, sh = 2;
-//   pool->set_pool_type(poolType);
-//   pool->set_channels(16);
-//   pool->set_size_x(kw);
-//   pool->set_size_y(kh);
-//   pool->set_start(0);
-//   pool->set_padding(pw);
-//   pool->set_padding_y(ph);
-//   pool->set_stride(sw);
-//   pool->set_stride_y(sh);
-//
-//   int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-//   int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-//   pool->set_output_x(ow);
-//   pool->set_output_y(oh);
-// }
-//
-// void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
-//   TestConfig config;
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   PoolConfig* pool = input->mutable_pool_conf();
-//
-//   pool->set_img_size(14);
-//   pool->set_img_size_y(14);
-//   setPoolConfig(&config, pool, poolType);
-//   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-//                               pool->channels());
-//
-//   testLayerGrad(config, "pool", 100, trans, useGpu);
-// }
-//
-// #ifndef PADDLE_ONLY_CPU
-// void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
-//   TestConfig config;
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   PoolConfig* pool = input->mutable_pool_conf();
-//
-//   pool->set_size_y(4);
-//   pool->set_stride_y(3);
-//   pool->set_img_size(10);
-//   pool->set_img_size_y(20);
-//   setPoolConfig(&config, pool, poolType);
-//   pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
-//                          ((float)pool->stride_y()) +
-//                      1.5);
-//   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-//                               pool->channels());
-//
-//   testLayerGrad(config, "pool", 100, trans, useGpu);
-// }
-// #endif
-//
-// TEST(Layer, PoolLayer) {
-//   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
-//   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
-//
-// #ifndef PADDLE_ONLY_CPU
-//   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
-//   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
-//   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-//   testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-//   testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-//   testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-// #endif
-// }
-//
-// void testSppLayer(const string& poolType,
-//                   const int pyramidHeight,
-//                   bool trans,
-//                   bool useGpu) {
-//   TestConfig config;
-//   config.layerConfig.set_type("spp");
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   SppConfig* sppConfig = input->mutable_spp_conf();
-//   sppConfig->set_pool_type(poolType);
-//   sppConfig->set_pyramid_height(pyramidHeight);
-//   ImageConfig* imageConfig = sppConfig->mutable_image_conf();
-//   imageConfig->set_channels(16);
-//   imageConfig->set_img_size(10);
-//   imageConfig->set_img_size_y(20);
-//   int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-//   config.layerConfig.set_size(outputSize * imageConfig->channels());
-//   testLayerGrad(config, "spp", 100, trans, useGpu);
-// }
-//
-// TEST(Layer, SpatialPyramidPoolLayer) {
-//   for (auto useGpu : {false, true}) {
-//     for (auto pyramidHeight : {1, 2, 3}) {
-//       testSppLayer("avg-projection", pyramidHeight, false, useGpu);
-//       testSppLayer("max-projection", pyramidHeight, false, useGpu);
-//     }
-//   }
-// }
-//
-// TEST(Layer, rankCostLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("rank-cost");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "rank-cost", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, sumCostLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("sum_cost");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "sum_cost", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, weightedRankCostLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("rank-cost");
-//   config.biasSize = 0;
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-//   config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, TensorLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("tensor");
-//   config.layerConfig.set_size(10);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.biasSize = config.layerConfig.size();
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "tensor", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, RecurrentLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("recurrent");
-//   config.layerConfig.set_size(4);
-//   config.layerConfig.set_active_type("tanh");
-//   config.biasSize = 4;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     for (auto reversed : {false, true}) {
-//       config.layerConfig.set_reversed(reversed);
-//       config.testState = !reversed;
-//       testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
-//     }
-//   }
-// }
-//
-// TEST(Layer, LstmLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("lstmemory");
-//   config.layerConfig.set_size(4);
-//   config.layerConfig.set_active_type("tanh");
-//   config.layerConfig.set_active_state_type("sigmoid");
-//   config.layerConfig.set_active_gate_type("sigmoid");
-//   config.biasSize = 28;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     for (auto reversed : {false, true}) {
-//       config.layerConfig.set_reversed(reversed);
-//       config.testState = !reversed;
-//       testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
-//     }
-//   }
-//   for (auto useGpu : {true}) {
-//     config.testBatchState = true;
-//     config.layerConfig.set_reversed(false);
-//     testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, MDLstmLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("mdlstmemory");
-//   config.layerConfig.set_size(4);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_active_state_type("sigmoid");
-//   config.layerConfig.set_active_gate_type("sigmoid");
-//   config.biasSize = 4 * 9;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_directions(true);
-//   config.layerConfig.add_directions(true);
-//
-//   for (auto useGpu : {false, true}) {
-//     for (int i = 0; i < 2; i++) {
-//       for (int j = 0; j < 2; j++) {
-//         config.layerConfig.set_directions(0, bool(i));
-//         config.layerConfig.set_directions(1, bool(j));
-//         testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
-//       }
-//     }
-//   }
-// }
-//
-// TEST(Layer, ParameterReluLayer) {
-//   auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
-//     TestConfig config;
-//     config.layerConfig.set_type("prelu");
-//     config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
-//     config.layerConfig.add_inputs();
-//     config.layerConfig.set_size(inputSize);
-//     config.layerConfig.set_partial_sum(inputSize /
-//                                        channels);  // size of feature map
-//     for (auto useGpu : {false, true}) {
-//       testLayerGrad(config, "prelu", 100, false, useGpu);
-//     }
-//   };
-//
-//   testParameterReluLayer(192, 1);
-//   testParameterReluLayer(192, 3);
-//   testParameterReluLayer(192, 192);
-// }
-//
-// TEST(Layer, ResizeLayer) {
-//   TestConfig config;
-//   config.biasSize = 0;
-//   config.layerConfig.set_type("resize");
-//   config.layerConfig.set_size(64);
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "resize", 100, false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, NCELayer) {
-//   TestConfig config;
-//   size_t numClasses = 4;
-//   config.layerConfig.set_type("nce");
-//   config.layerConfig.set_size(1);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_num_classes(numClasses);
-//   config.biasSize = numClasses;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 *
-//       numClasses});
-//   config.inputDefs.push_back(
-//       {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto withWeight : {false, true}) {
-//     if (withWeight) {
-//       config.inputDefs.push_back(
-//           {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
-//       config.layerConfig.add_inputs();
-//     }
-//
-//     for (auto isIdLabel : {false, true}) {
-//       config.inputDefs[1] = {
-//           isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
-//           "label",
-//           /* dim= */ numClasses,
-//           /* paraSize= */ 0};
-//
-//       for (auto withDist : {false, true}) {
-//         config.layerConfig.clear_neg_sampling_dist();
-//         if (withDist) {
-//           double sum = 0;
-//           for (size_t i = 0; i < numClasses; ++i) {
-//             real p = rand();  // NOLINT use rand_r
-//             config.layerConfig.add_neg_sampling_dist(p);
-//             sum += p;
-//           }
-//           for (size_t i = 0; i < numClasses; ++i) {
-//             real p = config.layerConfig.neg_sampling_dist(i) / sum;
-//             config.layerConfig.set_neg_sampling_dist(i, p);
-//           }
-//         }
-//         LOG(INFO) << "NCELayer "
-//                   << " isIdLabel=" << isIdLabel << " withWeight=" <<
-//                   withWeight
-//                   << " withDist=" << withDist;
-//         // Not support GPU now
-//         testLayerGrad(config,
-//                       "nce",
-//                       100,
-//                       /* trans= */ false,
-//                       /* useGpu */ false);
-//       }
-//     }
-//   }
-// }
-//
-// TEST(Layer, GatedRecurrentLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("gated_recurrent");
-//   config.layerConfig.set_size(4);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_active_gate_type("sigmoid");
-//   config.biasSize = 12;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     for (auto reversed : {false, true}) {
-//       config.layerConfig.set_reversed(reversed);
-//       config.testState = !reversed;
-//       testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false,
-//       useGpu);
-//     }
-//   }
-// }
-//
-// TEST(Layer, GruStepLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("gru_step");
-//   config.layerConfig.set_size(4);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_active_gate_type("sigmoid");
-//   config.biasSize = 12;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
-//   }
-// }
-//
-// TEST(Layer, LstmStepLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("lstm_step");
-//   config.layerConfig.set_size(4);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.layerConfig.set_active_state_type("sigmoid");
-//   config.layerConfig.set_active_gate_type("sigmoid");
-//   config.biasSize = 12;
-//   config.testAccumulate = false;
-//
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
-//   }
-// }
-//
-// void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
-//   TestConfig config;
-//   const int CHANNELS = 10;
-//   const int IMG_SIZE = 16;
-//   const int IMG_SIZE_Y = 8;
-//   size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
-//   config.layerConfig.set_type(type);
-//   config.layerConfig.set_size(size);
-//   config.layerConfig.set_active_type("sigmoid");
-//   config.biasSize = CHANNELS;
-//   config.inputDefs.push_back({INPUT_DATA,
-//                               "layer_0",
-//                               /* dim= */ size,
-//                               /* paraSize= */ CHANNELS});
-//
-//   config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1,
-//   CHANNELS});
-//   config.inputDefs.back().isStatic = true;
-//   config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1,
-//   CHANNELS});
-//   config.inputDefs.back().isStatic = true;
-//
-//   LayerInputConfig* input = config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   ImageConfig* img_conf = input->mutable_image_conf();
-//   img_conf->set_channels(CHANNELS);
-//   img_conf->set_img_size(IMG_SIZE);
-//   img_conf->set_img_size_y(IMG_SIZE_Y);
-//
-//   testLayerGrad(config,
-//                 "batch_norm",
-//                 64,
-//                 /* trans= */ trans,
-//                 useGpu,
-//                 /* useWeight */ true);
-// }
-//
-// TEST(Layer, BatchNormalizationLayer) {
-//   testBatchNormLayer("batch_norm", false, false);
-// #ifndef PADDLE_ONLY_CPU
-//   testBatchNormLayer("batch_norm", false, true);
-//   if (hl_get_cudnn_lib_version() >= int(4000)) {
-//     testBatchNormLayer("cudnn_batch_norm", false, true);
-//   }
-// #endif
-// }
-//
-// TEST(Operator, conv) {
-//   TestConfig config;
-//   const int NUM_FILTERS = 16;
-//   const int FILTER_SIZE = 2;
-//   const int FILTER_SIZE_Y = 3;
-//   const int CHANNELS = 3;
-//   const int IMAGE_SIZE = 16;
-//   const int IMAGE_SIZE_Y = 8;
-//   OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-//   operatorConf.set_type("conv");
-//   ConvConfig* conv = operatorConf.mutable_conv_conf();
-//   operatorConf.set_num_filters(NUM_FILTERS);
-//   conv->set_filter_size(FILTER_SIZE);
-//   conv->set_filter_size_y(FILTER_SIZE_Y);
-//   conv->set_channels(CHANNELS);
-//   conv->set_padding(0);
-//   conv->set_padding_y(1);
-//   conv->set_stride(2);
-//   conv->set_stride_y(2);
-//   conv->set_groups(1);
-//   conv->set_filter_channels(conv->channels() / conv->groups());
-//   conv->set_img_size(IMAGE_SIZE);
-//   conv->set_img_size_y(IMAGE_SIZE_Y);
-//   conv->set_output_x(outputSize(conv->img_size(),
-//                                 conv->filter_size(),
-//                                 conv->padding(),
-//                                 conv->stride(),
-//                                 /*  caffeMode */ true));
-//   conv->set_output_y(outputSize(conv->img_size_y(),
-//                                 conv->filter_size_y(),
-//                                 conv->padding_y(),
-//                                 conv->stride_y(),
-//                                 /*  caffeMode */ true));
-//   config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-//                               NUM_FILTERS);
-//
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
-//   config.inputDefs.push_back(
-//       {INPUT_DATA,
-//        "layer_1",
-//        FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
-//        0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
-// }
-//
-// TEST(Layer, FeatureMapExpandLayer) {
-//   TestConfig config;
-//   config.layerConfig.set_type("featmap_expand");
-//   const int CHANNELS = 10;
-//   const int INPUT_SIZE = 100;
-//   config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
-//   config.layerConfig.set_num_filters(CHANNELS);
-//   config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-//                               "layer_0",
-//                               /* dim= */ INPUT_SIZE,
-//                               /* paraSize= */ 0});
-//   config.layerConfig.add_inputs();
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config,
-//                   "featmap_expand",
-//                   /*batch_size*/ 100,
-//                   /* trans= */ false,
-//                   useGpu,
-//                   /* useWeight */ true);
-//   }
-// }
-//
-// TEST(Layer, MultiplexLayer) {
-//   TestConfig config;
-//   const int LAYER_SIZE = 100;
-//   config.layerConfig.set_type("multiplex");
-//   config.layerConfig.set_size(LAYER_SIZE);
-//
-//   config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-//   config.inputDefs.push_back(
-//       {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//   config.layerConfig.add_inputs();
-//
-//   for (auto useGpu : {false, true}) {
-//     testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
-//   }
-// }
-//
+TEST(Operator, dot_mul) {
+  TestConfig config;
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+  operatorConf.set_type("dot_mul");
+  operatorConf.set_dotmul_scale(-1);
+
+  testOperatorGrad(config, operatorConf, 100, false, false);
+}
+
+TEST(Projection, context) {
+  for (auto contextStart : {-5, -3, -1, 0, 3}) {
+    for (auto contextLength : {1, 2, 5, 7}) {
+      for (auto batchSize : {1, 2, 5, 20, 50}) {
+        for (auto trainablePadding : {false, true}) {
+          LOG(INFO) << " contextStart=" << contextStart
+                    << " contextLength=" << contextLength
+                    << " batchSize=" << batchSize
+                    << " trainablePadding=" << trainablePadding;
+          ProjectionConfig conf;
+          conf.set_type("context");
+          conf.set_input_size(10);
+          conf.set_context_start(contextStart);
+          conf.set_context_length(contextLength);
+          conf.set_trainable_padding(trainablePadding);
+          conf.set_output_size(conf.context_length() * conf.input_size());
+          int pad =
+              std::max(0, -conf.context_start()) +
+              std::max(0, conf.context_start() + conf.context_length() - 1);
+          for (auto useGpu : {false, true}) {
+            testProjectionGrad(
+                conf,
+                INPUT_SEQUENCE_DATA,
+                trainablePadding ? conf.input_size() * pad : 0,
+                batchSize,
+                useGpu,
+                contextStart + contextLength <= 1);  // = testState
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Projection, trans_fc) {
+  ProjectionConfig conf;
+  conf.set_type("trans_fc");
+  conf.set_input_size(50);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 1000,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, fc) {
+  ProjectionConfig conf;
+  conf.set_type("fc");
+  conf.set_input_size(10);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 200,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, dot_mul) {
+  ProjectionConfig conf;
+  conf.set_type("dot_mul");
+  conf.set_input_size(20);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 20,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, table) {
+  ProjectionConfig conf;
+  conf.set_type("table");
+  conf.set_input_size(10);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_LABEL,
+                       /* parameterSize */ 200,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, identity) {
+  ProjectionConfig conf;
+  conf.set_type("identity");
+  conf.set_input_size(10);
+  conf.set_output_size(10);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, scaling) {
+  ProjectionConfig conf;
+  conf.set_type("scaling");
+  conf.set_input_size(10);
+  conf.set_output_size(10);
+  for (auto useGpu : {false}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 1,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+void testProjectionConv(size_t groups) {
+  // Build a "conv" projection (18 filters, filter_size 2 x filter_size_y 3,
+  // stride 2) over a 16 x 16 image with 3 channels, split into `groups`
+  // groups, and pass it to testProjectionGrad.
+  const int kNumFilters = 18;
+  const int kFilterSizeX = 2;
+  const int kFilterSizeY = 3;
+  const int kChannels = 3;
+  const int kImageSize = 16;
+
+  ProjectionConfig projConf;
+  projConf.set_num_filters(kNumFilters);
+  projConf.set_type("conv");
+  ConvConfig* conv = projConf.mutable_conv_conf();
+  conv->set_img_size(kImageSize);
+  conv->set_channels(kChannels);
+  conv->set_groups(groups);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_filter_size(kFilterSizeX);
+  conv->set_filter_size_y(kFilterSizeY);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+
+  // img_size() is the input extent for both output dimensions.
+  const int outX = outputSize(conv->img_size(), conv->filter_size(),
+                              conv->padding(), conv->stride(),
+                              /* caffeMode */ true);
+  const int outY = outputSize(conv->img_size(), conv->filter_size_y(),
+                              conv->padding_y(), conv->stride_y(),
+                              /* caffeMode */ true);
+  conv->set_output_x(outX);
+  projConf.set_input_size(kImageSize * kImageSize * kChannels);
+  projConf.set_output_size(outX * outY * kNumFilters);
+
+  testProjectionGrad(projConf,
+                     INPUT_DATA,
+                     /* parameterSize */ kNumFilters * kChannels *
+                         kFilterSizeX * kFilterSizeY / groups,
+                     /* batchSize */ 100,
+                     true,
+                     false,
+                     kNumFilters,
+                     true);
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(Projection, conv) {
+  // Run the conv projection helper with 1 group, then with 3 groups.
+  for (size_t groups : {1, 3}) testProjectionConv(groups);
+}
+#endif
+
+TEST(Layer, BilinearInterpLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("bilinear_interp");
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+  // Input image: 32 x 32 with 4 channels (32 * 32 * 4 == 4096).
+  LayerInputConfig* inputConf = cfg.layerConfig.add_inputs();
+  BilinearInterpConfig* interp = inputConf->mutable_bilinear_interp_conf();
+  ImageConfig* imageConf = interp->mutable_image_conf();
+  imageConf->set_channels(4);
+  imageConf->set_img_size(32);
+  imageConf->set_img_size_y(32);
+  // Square outputs of side 32 and 64, each with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    for (int side : {32, 64}) {
+      interp->set_out_size_y(side);
+      interp->set_out_size_x(side);
+      testLayerGrad(cfg, "bilinear_interp", 10, false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, concat) {
+  TestConfig cfg;
+  cfg.layerConfig.set_type("concat");
+  cfg.layerConfig.set_active_type("sigmoid");
+  cfg.layerConfig.set_size(15);
+  cfg.biasSize = 0;
+  // Two INPUT_DATA inputs of width 5 and 10; the layer size is 15.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "concat", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, AddtoLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_type("addto");
+  cfg.layerConfig.set_active_type("sigmoid");
+  cfg.layerConfig.set_size(10);
+  cfg.biasSize = 0;
+  // Two INPUT_DATA inputs, each of width 10, matching the layer size.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "addto", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, CRFLayer) {
+  // Run testLayerGrad for the "crf" layer with useGpu = false only.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("crf");
+  cfg.layerConfig.set_size(10);
+
+  // Input 0 is the sequence data (dim 10, paraSize 120); input 1 is the
+  // sequence label.
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+
+  // Not support GPU now
+  testLayerGrad(cfg, "crf", 100,
+                /* trans */ false, /* useGpu */ false,
+                /* useWeight */ false,
+                /* epsilon */ 0.03);
+}
+
+TEST(Layer, CTCLayer) {
+  // Run testLayerGrad for the "ctc" layer (norm_by_times disabled) with
+  // useGpu = false and true.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("ctc");
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_norm_by_times(false);
+
+  // Input 0 carries the sequence data, input 1 the sequence labels.
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(
+        cfg, "ctc", 100, /* trans */ false, /* useGpu */ useGpu);
+  }
+}
+
+TEST(Layer, cosSimLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_size(1);
+  cfg.layerConfig.set_type("cos");
+  // Two INPUT_DATA inputs of width 50; the layer output size is 1.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "cos", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, CosSimVecMatLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_cos_scale(2.0);
+  cfg.layerConfig.set_size(5);  // output size
+  cfg.layerConfig.set_type("cos_vm");
+  // Input 0 has width 20 and input 1 has width 100 (5 * 20).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "cos_vm", 100, false, useGpu);
+  }
+}
+
+void testConvLayer(const string& type, bool trans, bool useGpu) {
+  // Configure a convolution layer of the given `type` with 16 filters and
+  // shared biases over a 3-channel input (img_size 16, img_size_y 8), then
+  // run testLayerGrad twice.
+  TestConfig cfg;
+  cfg.layerConfig.set_type(type);
+  cfg.layerConfig.set_num_filters(16);
+  cfg.layerConfig.set_shared_biases(true);
+  cfg.layerConfig.set_partial_sum(1);
+  cfg.biasSize = 16;
+
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
+  ConvConfig* conv = cfg.layerConfig.add_inputs()->mutable_conv_conf();
+  conv->set_img_size(16);
+  conv->set_img_size_y(8);
+  conv->set_channels(3);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  const auto outX = outputSize(conv->img_size(), conv->filter_size(),
+                               conv->padding(), conv->stride(),
+                               /* caffeMode */ true);
+  const auto outY = outputSize(conv->img_size_y(), conv->filter_size_y(),
+                               conv->padding_y(), conv->stride_y(),
+                               /* caffeMode */ true);
+  conv->set_output_x(outX);
+  conv->set_output_y(outY);
+  cfg.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                           cfg.layerConfig.num_filters());
+
+  testLayerGrad(cfg, "conv", 100, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(cfg, "conv", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, convLayer) {
+  // CPU run is unconditional; GPU runs need PADDLE_ONLY_CPU to be undefined.
+  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  for (auto type : {"exconv", "cudnn_conv"}) testConvLayer(type, false, true);
+#endif
+}
+
+void testConvTransLayer(const string& type, bool trans, bool useGpu) {
+  // Configure a conv-trans layer of the given `type` (3 filters, shared
+  // biases, 16-channel input) and run testLayerGrad twice.
+  TestConfig cfg;
+  cfg.layerConfig.set_type(type);
+  cfg.layerConfig.set_num_filters(3);
+  cfg.layerConfig.set_shared_biases(true);
+  cfg.layerConfig.set_partial_sum(1);
+  cfg.biasSize = 3;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288});
+  ConvConfig* conv = cfg.layerConfig.add_inputs()->mutable_conv_conf();
+  conv->set_img_size(16);
+  conv->set_channels(16);
+  conv->set_groups(1);
+  conv->set_filter_channels(3 / conv->groups());
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  const auto outX = outputSize(conv->img_size(), conv->filter_size(),
+                               conv->padding(), conv->stride(),
+                               /* caffeMode */ true);
+  conv->set_output_x(outX);
+
+  // The layer size is derived from img_size(), not from output_x().
+  cfg.layerConfig.set_size(conv->img_size() * conv->img_size() *
+                           cfg.layerConfig.num_filters());
+
+  testLayerGrad(cfg, "convTrans", 100, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(cfg, "convTrans", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, convTransLayer) {
+  // Run the "exconvt" layer type with useGpu = false, then useGpu = true.
+  testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ false);
+  testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ true);
+}
+
+TEST(Layer, blockExpandLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_type("blockexpand");
+  cfg.biasSize = 0;
+  // Input: 64 x 32 image with 3 channels (64 * 32 * 3 == 6144).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
+  LayerInputConfig* inputConf = cfg.layerConfig.add_inputs();
+  BlockExpandConfig* expand = inputConf->mutable_block_expand_conf();
+  expand->set_channels(3);
+  expand->set_img_size_x(64);
+  expand->set_img_size_y(32);
+  expand->set_block_x(4);
+  expand->set_block_y(32);
+  expand->set_padding_x(0);
+  expand->set_padding_y(0);
+  expand->set_stride_x(2);
+  expand->set_stride_y(2);
+  // Output extents along x and y, both computed with caffeMode == false.
+  const auto outX = outputSize(expand->img_size_x(), expand->block_x(),
+                               expand->padding_x(), expand->stride_x(),
+                               /* caffeMode */ false);
+  const auto outY = outputSize(expand->img_size_y(), expand->block_y(),
+                               expand->padding_y(), expand->stride_y(),
+                               /* caffeMode */ false);
+  expand->set_output_x(outX);
+  expand->set_output_y(outY);
+  // Layer size is block_x * block_y * channels.
+  cfg.layerConfig.set_size(expand->block_x() * expand->block_y() *
+                           expand->channels());
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "blockexpand", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, maxoutLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_type("maxout");
+  cfg.biasSize = 0;
+  // Input: 32 x 32 image with 4 channels (32 * 32 * 4 == 4096).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+  LayerInputConfig* inputConf = cfg.layerConfig.add_inputs();
+  MaxOutConfig* maxoutConf = inputConf->mutable_maxout_conf();
+  maxoutConf->set_groups(2);
+
+  ImageConfig* imageConf = maxoutConf->mutable_image_conf();
+  imageConf->set_channels(4);
+  imageConf->set_img_size(32);
+  imageConf->set_img_size_y(32);
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "maxout", 10, false, useGpu);
+  }
+}
+void testFcLayer(string format, size_t nnz) {
+  // Configure a 4096-wide sigmoid "fc" layer with drop rate 0.1 over an
+  // 8192-wide input that uses ParaSparse(format) and paraSize nnz.
+  TestConfig cfg;
+  cfg.layerConfig.set_type("fc");
+  cfg.layerConfig.set_size(4096);
+  cfg.layerConfig.set_drop_rate(0.1);
+  cfg.layerConfig.set_active_type("sigmoid");
+  cfg.biasSize = 4096;
+
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+
+  const auto& sparseDef = cfg.inputDefs[0].sparse;
+  LOG(INFO) << sparseDef.sparse << " " << sparseDef.format;
+
+  // Weighted run (sixth argument true) with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "fc", 100,
+                  /* trans */ false, useGpu,
+                  /* weight */ true);
+  }
+}
+
+TEST(Layer, fcLayer) {
+  testFcLayer("", 4096 * 4096 * 2);  // empty format string
+  const size_t kSparseNnz = 4096 * 40;
+  for (const char* fmt : {"csc", "csr"}) testFcLayer(fmt, kSparseNnz);
+}
+
+TEST(Layer, SelectiveFullyConnectedLayer) {
+  // Run testLayerGrad for "selective_fc" with a 16-wide data input, 256
+  // outputs and an INPUT_SPARSE_NON_VALUE_DATA input named "index".
+  const size_t nin = 16;
+  const size_t nout = 256;
+  TestConfig cfg;
+  cfg.biasSize = nout;
+  cfg.layerConfig.set_type("selective_fc");
+  cfg.layerConfig.set_size(nout);
+  cfg.layerConfig.set_active_type("sigmoid");
+  cfg.layerConfig.set_selective_fc_pass_generation(false);
+  cfg.layerConfig.set_has_selected_colums(true);
+
+  cfg.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
+  cfg.inputDefs.push_back(
+      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+
+  // The useGpu = false run is unconditional.
+  testLayerGrad(cfg,
+                "selective_fc",
+                100,
+                /* trans= */ false, /* useGpu= */ false, false);
+#ifndef PADDLE_ONLY_CPU
+  // The useGpu = true run exists only when PADDLE_ONLY_CPU is not defined.
+  testLayerGrad(cfg,
+                "selective_fc",
+                100,
+                /* trans= */ false, /* useGpu= */ true, false);
+#endif
+}
+
+TEST(Layer, DataNormLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_size(20);
+  cfg.layerConfig.set_type("data_norm");
+
+  // Single 20-wide input with paraSize 100, marked static.
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
+  cfg.inputDefs.back().isStatic = true;
+
+  // Repeat the check once per normalization strategy.
+  const char* const kStrategies[] = {"z-score", "min-max", "decimal-scaling"};
+  for (const char* strategy : kStrategies) {
+    cfg.layerConfig.set_data_norm_strategy(strategy);
+    // The parameters are static, so not support GPU now
+    testLayerGrad(cfg, "data_norm", 200,
+                  /* trans */ false, /* useGpu */ false);
+  }
+}
+
+TEST(Layer, hsigmoidLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_size(1);
+  cfg.layerConfig.set_type("hsigmoid");
+  cfg.layerConfig.set_num_classes(5);
+  // The bias has one entry fewer than there are classes.
+  cfg.biasSize = cfg.layerConfig.num_classes() - 1;
+
+  // Input 0 is the 50-wide data (paraSize 200); input 1 is the label.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
+  cfg.layerConfig.add_inputs();
+
+  // Not support GPU now
+  testLayerGrad(cfg,
+                "hsigmoid",
+                100, /* trans */ false, /* useGpu */ false);
+}
+
+TEST(Layer, multi_cross) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("multi-class-cross-entropy");
+  // Input 0 is the 50-wide data; input 1 is the label.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(
+        cfg, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, multi_binary_label_sparse_mat) {
+  // Run testLayerGrad for "multi_binary_label_cross_entropy" where the
+  // second input is INPUT_SPARSE_NON_VALUE_DATA.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("multi_binary_label_cross_entropy");
+
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
+  cfg.layerConfig.add_inputs();
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg,
+                  "multi_binary_label_cross_entropy",
+                  100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(layer, multi_binary_label_id) {
+  // Run testLayerGrad for "multi_binary_label_cross_entropy" where the
+  // second input is an INPUT_LABEL.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("multi_binary_label_cross_entropy");
+
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg,
+                  "multi_binary_label_cross_entropy",
+                  100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, multi_cross_with_selfnorm) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_softmax_selfnorm_alpha(0.1);
+  cfg.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
+
+  // Input 0 is the 50-wide data; input 1 is the label.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+
+  // Not support GPU now
+  testLayerGrad(cfg,
+                "multi_class_cross_entropy_with_selfnorm",
+                100,
+                /* trans */ false, /* useGpu */ false);
+}
+
+TEST(Layer, multi_cross_soft) {
+  // Run testLayerGrad for "soft_binary_class_cross_entropy"; both the data
+  // and the INPUT_DATA_TARGET input are 10 wide.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("soft_binary_class_cross_entropy");
+
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg,
+                  "soft_binary_class_cross_entropy",
+                  100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, square_error) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("square_error");
+  // Input 0 is the 10-wide data; input 1 is the 10-wide target.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "square_error", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, sparse_square_error) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("square_error");
+
+  // The second input (the label) is INPUT_SPARSE_NON_VALUE_DATA.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
+  cfg.layerConfig.add_inputs();
+
+  // "GpuSparseMatrix" as label is not supported
+  testLayerGrad(cfg,
+                "square_error",
+                100,
+                /* trans */ false, /* useGpu */ false);
+}
+
+TEST(Layer, sparse_float_square_error) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("square_error");
+
+  // The second input (the label) is INPUT_SPARSE_FLOAT_VALUE_DATA.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
+  cfg.layerConfig.add_inputs();
+
+  // "GpuSparseMatrix" as label is not supported
+  testLayerGrad(cfg,
+                "square_error",
+                100,
+                /* trans */ false, /* useGpu */ false);
+}
+
+TEST(Layer, square_error_weighted) {
+  TestConfig cfg;
+  cfg.testAccumulate = false;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("square_error");
+  // Inputs: 10-wide data, 10-wide target, and a third 1-wide target input.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "square_error", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, huber_two_class) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("huber");
+  // Input 0 is 1-wide data; input 1 is an INPUT_LABEL with dim 2.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "huber", 100, /* trans */ false, useGpu);
+  }
+}
+
+void testExpandLayer(string trans_type, bool hasSubseq) {
+  // Run testLayerGrad for the "expand" layer. Input 0 is INPUT_DENSE_DIM_DATA
+  // when trans_type is "non-seq" and INPUT_SEQUENCE_DATA otherwise; input 1
+  // is INPUT_HASSUB_SEQUENCE_DATA only when hasSubseq is set.
+  const auto srcType =
+      trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA;
+  const auto refType =
+      hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA;
+
+  TestConfig cfg;
+  cfg.layerConfig.set_type("expand");
+  cfg.layerConfig.set_trans_type(trans_type);
+  cfg.inputDefs.push_back({srcType, "layer_0", 10, 0});
+  cfg.inputDefs.push_back({refType, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+
+  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "expand", 30, false, useGpu);
+  }
+}
+
+TEST(Layer, ExpandLayer) {
+  // Covered cases: non-seq -> seq, non-seq -> hasSubseq, seq -> hasSubseq.
+  for (bool hasSubseq : {false, true}) testExpandLayer("non-seq", hasSubseq);
+  testExpandLayer("seq", /* hasSubseq= */ true);
+}
+
+void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
+  // Run testLayerGrad for `layer_type` with the given trans_type; for
+  // "average" the run is repeated once per average strategy.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type(layer_type);
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_trans_type(trans_type);
+
+  const auto seqType =
+      hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA;
+  cfg.inputDefs.push_back({seqType, "layer_0", 10, 0});
+  cfg.layerConfig.add_inputs();
+
+  // Runs the check for the current `cfg` with useGpu = false, then true.
+  auto runOnBothDevices = [&cfg, &layer_type]() {
+    for (bool useGpu : {false, true}) {
+      testLayerGrad(cfg, layer_type, 100, false, useGpu);
+    }
+  };
+  if (layer_type != "average") {
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
+    runOnBothDevices();
+    return;
+  }
+  for (const char* strategy : {"average", "sum", "squarerootn"}) {
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+              << " average_strategy=" << strategy;
+    cfg.layerConfig.set_average_strategy(strategy);
+    runOnBothDevices();
+  }
+}
+
+TEST(Layer, MaxLayer) {
+  // Covered cases: seq -> non-seq, hasSubseq -> non-seq, hasSubseq -> seq.
+  for (bool sub : {false, true}) testDegradeLayer(sub, "max", "non-seq");
+  testDegradeLayer(/* hasSubseq= */ true, "max", "seq");
+}
+
+TEST(Layer, SequenceLastInstanceLayer) {
+  // Run the "seqlastins" layer through testDegradeLayer for three
+  // combinations of hasSubseq and trans_type.
+  // seq seqlastins to non-seq
+  testDegradeLayer(false, "seqlastins", "non-seq");
+  // hasSubseq seqlastins to non-seq
+  testDegradeLayer(true, "seqlastins", "non-seq");
+  // hasSubseq seqlastins to seq
+  testDegradeLayer(true, "seqlastins", "seq");
+}
+
+TEST(Layer, AverageLayer) {
+  // Run the "average" layer through testDegradeLayer for three cases.
+  testDegradeLayer(false, "average", "non-seq");  // seq to non-seq
+  testDegradeLayer(true, "average", "non-seq");   // hasSubseq to non-seq
+  testDegradeLayer(true, "average", "seq");       // hasSubseq to seq
+}
+
+TEST(Layer, SequenceConcatLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_type("seqconcat");
+  // Two INPUT_SEQUENCE_DATA inputs, each of width 10.
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "seqconcat", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, SequenceReshapeLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_type("seqreshape");
+  // One 100-wide INPUT_SEQUENCE_DATA input; the layer size is 10.
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "seqreshape", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ConvShiftLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_type("conv_shift");
+  // Input 0 is 10 wide (same as the layer); input 1 is 3 wide.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
+  cfg.layerConfig.add_inputs();
+
+  // Not support GPU now
+  testLayerGrad(cfg, "conv_shift", 100, false, /* useGpu */ false);
+}
+
+TEST(Layer, PowerLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_type("power");
+  // Input 0 is 1 wide; input 1 is 10 wide (same as the layer).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "power", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ConvexCombinationLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_size(20);
+  cfg.layerConfig.set_type("convex_comb");
+  // Input 0 is 5 wide and input 1 is 100 wide (5 * 20).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "convex_comb", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, InterpolationLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_type("interpolation");
+  // Input 0 is 1 wide; inputs 1 and 2 are 10 wide (same as the layer).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "interpolation", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, OuterProdLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_size(100);
+  cfg.layerConfig.set_type("out_prod");
+  // Two 10-wide inputs; the layer size is 100 (10 * 10).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "out_prod", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, SlopeInterceptLayer) {
+  TestConfig cfg;
+  cfg.layerConfig.set_type("slope_intercept");
+  cfg.layerConfig.set_intercept(0.1);
+  cfg.layerConfig.set_slope(1.0);
+  cfg.layerConfig.set_size(10);
+  // One 10-wide input; slope is 1.0 and intercept is 0.1.
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "slope_intercept", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ScalingLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_size(10);
+  cfg.layerConfig.set_type("scaling");
+  // Input 0 is 1 wide; input 1 is 10 wide (same as the layer).
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "scaling", 100, false, useGpu);
+  }
+}
+
+void testNormLayer(const string& normType, bool trans, bool useGpu) {
+  // Configure a relu "norm" layer of the given norm type over a 16-channel
+  // input (img_size 14, img_size_y 7) and run testLayerGrad on it.
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("norm");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
+  NormConfig* norm = cfg.layerConfig.add_inputs()->mutable_norm_conf();
+  norm->set_norm_type(normType);
+  norm->set_channels(16);
+  norm->set_img_size(14);
+  norm->set_img_size_y(7);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  norm->set_size(5);
+  norm->set_pow(0.75);
+  norm->set_blocked(0);
+  norm->set_scale(0.001);
+
+  // The scale is divided by size() for the two cmrnorm types and by size()
+  // squared for every other norm type.
+  const bool isCmrNorm = norm->norm_type() == "cmrnorm" ||
+                         norm->norm_type() == "cmrnorm-projection";
+  const auto divisor = isCmrNorm ? norm->size() : norm->size() * norm->size();
+  norm->set_scale(norm->scale() / divisor);
+
+  cfg.layerConfig.set_size(norm->output_x() * norm->output_y() *
+                           norm->channels());
+  testLayerGrad(cfg, "norm", 100, trans, useGpu);
+}
+
+TEST(Layer, NormLayer) {
+  // Check the "cmrnorm-projection" norm type, first with useGpu = true and
+  // then with useGpu = false.
+  const char* const kNormType = "cmrnorm-projection";
+  for (bool useGpu : {true, false}) {
+    testNormLayer(kNormType, /* trans= */ false, /* useGpu= */ useGpu);
+  }
+}
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  // Fill in 3x3 pooling (stride 2, padding 0, 16 channels). The caller must
+  // already have set pool->img_size() and img_size_y(), which are read here.
+  config->biasSize = 0;
+  config->layerConfig.set_type("pool");
+  config->layerConfig.set_num_filters(16);
+  const int kx = 3, ky = 3;  // window size
+  const int px = 0, py = 0;  // padding
+  const int sx = 2, sy = 2;  // stride
+  pool->set_pool_type(poolType);
+  pool->set_channels(16);
+  pool->set_start(0);
+  pool->set_size_x(kx);
+  pool->set_size_y(ky);
+  pool->set_padding(px);
+  pool->set_padding_y(py);
+  pool->set_stride(sx);
+  pool->set_stride_y(sy);
+  const int outX = outputSize(pool->img_size(), kx, px, sx, false);
+  const int outY = outputSize(pool->img_size_y(), ky, py, sy, false);
+  pool->set_output_x(outX);
+  pool->set_output_y(outY);
+}
+
+void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+  // Pool a 14 x 14 input (3136 == 14 * 14 * 16 values) with `poolType`.
+  TestConfig cfg;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
+  PoolConfig* pool = cfg.layerConfig.add_inputs()->mutable_pool_conf();
+
+  // The image size has to be in place before setPoolConfig reads it.
+  pool->set_img_size_y(14);
+  pool->set_img_size(14);
+  setPoolConfig(&cfg, pool, poolType);
+  cfg.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                           pool->channels());
+  testLayerGrad(cfg, "pool", 100, trans, useGpu);
+}
+
+#ifndef PADDLE_ONLY_CPU
+void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
+  // Pool a non-square 10 x 20 input (3200 == 10 * 20 * 16 values).
+  TestConfig cfg;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+  PoolConfig* pool = cfg.layerConfig.add_inputs()->mutable_pool_conf();
+
+  // NOTE(review): setPoolConfig below sets size_y and stride_y again, so
+  // the two values assigned here do not survive that call.
+  pool->set_size_y(4);
+  pool->set_stride_y(3);
+  pool->set_img_size(10);
+  pool->set_img_size_y(20);
+  setPoolConfig(&cfg, pool, poolType);
+  const auto extentY = pool->img_size_y() - pool->start() - pool->size_y();
+  pool->set_output_y(extentY / static_cast<float>(pool->stride_y()) + 1.5);
+  cfg.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                           pool->channels());
+  testLayerGrad(cfg, "pool", 100, trans, useGpu);
+}
+#endif
+
+TEST(Layer, PoolLayer) {
+  // useGpu = false runs are unconditional; the rest need a GPU build.
+  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  for (auto type : {"avg-projection", "max-projection", "cudnn-max-pool",
+                    "cudnn-avg-pool"}) {
+    testPoolLayer(type, /* trans= */ false, /* useGpu= */ true);
+  }
+  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void testSppLayer(const string& poolType,
+                  const int pyramidHeight,
+                  bool trans,
+                  bool useGpu) {
+  // Run testLayerGrad for an "spp" layer on a 16-channel 10 x 20 input.
+  TestConfig cfg;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+  cfg.layerConfig.set_type("spp");
+  SppConfig* spp = cfg.layerConfig.add_inputs()->mutable_spp_conf();
+  spp->set_pyramid_height(pyramidHeight);
+  spp->set_pool_type(poolType);
+  ImageConfig* image = spp->mutable_image_conf();
+  image->set_img_size(10);
+  image->set_img_size_y(20);
+  image->set_channels(16);
+  const int bins = (std::pow(4, spp->pyramid_height()) - 1) / (4 - 1);
+  cfg.layerConfig.set_size(bins * image->channels());
+  testLayerGrad(cfg, "spp", 100, trans, useGpu);
+}
+
+TEST(Layer, SpatialPyramidPoolLayer) {
+  for (bool useGpu : {false, true}) {
+    for (int height = 1; height <= 3; ++height) {  // pyramid heights 1..3
+      testSppLayer("avg-projection", height, /* trans= */ false, useGpu);
+      testSppLayer("max-projection", height, /* trans= */ false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, rankCostLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("rank-cost");
+  // Two 1-wide data inputs followed by one 1-wide target input.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "rank-cost", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, sumCostLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("sum_cost");
+  // A single 1-wide data input.
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "sum_cost", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, weightedRankCostLayer) {
+  TestConfig cfg;
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("rank-cost");
+  // Layer type is "rank-cost"; the fourth input is an extra 1-wide target.
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+  cfg.layerConfig.add_inputs();
+  cfg.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
+  cfg.layerConfig.add_inputs();
+  // Run testLayerGrad with useGpu = false and true.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(cfg, "weighted-rank-cost", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, TensorLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("tensor");
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_active_type("sigmoid");
+  config.biasSize = config.layerConfig.size();
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "tensor", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, RecurrentLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("recurrent");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("tanh");
+  config.biasSize = 4;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto reversed : {false, true}) {
+      config.layerConfig.set_reversed(reversed);
+      config.testState = !reversed;
+      testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, LstmLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("lstmemory");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("tanh");
+  config.layerConfig.set_active_state_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 28;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto reversed : {false, true}) {
+      config.layerConfig.set_reversed(reversed);
+      config.testState = !reversed;
+      testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+    }
+  }
+  for (auto useGpu : {true}) {
+    config.testBatchState = true;
+    config.layerConfig.set_reversed(false);
+    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, MDLstmLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("mdlstmemory");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_state_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 4 * 9;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_directions(true);
+  config.layerConfig.add_directions(true);
+
+  for (auto useGpu : {false, true}) {
+    for (int i = 0; i < 2; i++) {
+      for (int j = 0; j < 2; j++) {
+        config.layerConfig.set_directions(0, bool(i));
+        config.layerConfig.set_directions(1, bool(j));
+        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
+      }
+    }
+  }
+}
+
+TEST(Layer, ParameterReluLayer) {
+  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
+    TestConfig config;
+    config.layerConfig.set_type("prelu");
+    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_size(inputSize);
+    config.layerConfig.set_partial_sum(inputSize /
+                                       channels);  // size of feature map
+    for (auto useGpu : {false, true}) {
+      testLayerGrad(config, "prelu", 100, false, useGpu);
+    }
+  };
+
+  testParameterReluLayer(192, 1);
+  testParameterReluLayer(192, 3);
+  testParameterReluLayer(192, 192);
+}
+
+TEST(Layer, ResizeLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("resize");
+  config.layerConfig.set_size(64);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "resize", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, NCELayer) {
+  TestConfig config;
+  size_t numClasses = 4;
+  config.layerConfig.set_type("nce");
+  config.layerConfig.set_size(1);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_num_classes(numClasses);
+  config.biasSize = numClasses;
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
+  config.inputDefs.push_back(
+      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto withWeight : {false, true}) {
+    if (withWeight) {
+      config.inputDefs.push_back(
+          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
+      config.layerConfig.add_inputs();
+    }
+
+    for (auto isIdLabel : {false, true}) {
+      config.inputDefs[1] = {
+          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
+          "label",
+          /* dim= */ numClasses,
+          /* paraSize= */ 0};
+
+      for (auto withDist : {false, true}) {
+        config.layerConfig.clear_neg_sampling_dist();
+        if (withDist) {
+          double sum = 0;
+          for (size_t i = 0; i < numClasses; ++i) {
+            real p = rand();  // NOLINT use rand_r
+            config.layerConfig.add_neg_sampling_dist(p);
+            sum += p;
+          }
+          for (size_t i = 0; i < numClasses; ++i) {
+            real p = config.layerConfig.neg_sampling_dist(i) / sum;
+            config.layerConfig.set_neg_sampling_dist(i, p);
+          }
+        }
+        LOG(INFO) << "NCELayer "
+                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
+                  << " withDist=" << withDist;
+        // Not support GPU now
+        testLayerGrad(config,
+                      "nce",
+                      100,
+                      /* trans= */ false,
+                      /* useGpu */ false);
+      }
+    }
+  }
+}
+
+TEST(Layer, GatedRecurrentLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("gated_recurrent");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 12;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto reversed : {false, true}) {
+      config.layerConfig.set_reversed(reversed);
+      config.testState = !reversed;
+      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, GruStepLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("gru_step");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 12;
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, LstmStepLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("lstm_step");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_state_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 12;
+  config.testAccumulate = false;
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
+  }
+}
+
+void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
+  TestConfig config;
+  const int CHANNELS = 10;
+  const int IMG_SIZE = 16;
+  const int IMG_SIZE_Y = 8;
+  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  config.biasSize = CHANNELS;
+  config.inputDefs.push_back({INPUT_DATA,
+                              "layer_0",
+                              /* dim= */ size,
+                              /* paraSize= */ CHANNELS});
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+  img_conf->set_img_size_y(IMG_SIZE_Y);
+
+  testLayerGrad(config,
+                "batch_norm",
+                64,
+                /* trans= */ trans,
+                useGpu,
+                /* useWeight */ true);
+}
+
+TEST(Layer, BatchNormalizationLayer) {
+  testBatchNormLayer("batch_norm", false, false);
+#ifndef PADDLE_ONLY_CPU
+  testBatchNormLayer("batch_norm", false, true);
+  if (hl_get_cudnn_lib_version() >= int(4000)) {
+    testBatchNormLayer("cudnn_batch_norm", false, true);
+  }
+#endif
+}
+
+TEST(Operator, conv) {
+  TestConfig config;
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 2;
+  const int FILTER_SIZE_Y = 3;
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 16;
+  const int IMAGE_SIZE_Y = 8;
+  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+  operatorConf.set_type("conv");
+  ConvConfig* conv = operatorConf.mutable_conv_conf();
+  operatorConf.set_num_filters(NUM_FILTERS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_channels(CHANNELS);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              NUM_FILTERS);
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_1",
+       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
+       0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
+}
+
+TEST(Layer, FeatureMapExpandLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("featmap_expand");
+  const int CHANNELS = 10;
+  const int INPUT_SIZE = 100;
+  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
+  config.layerConfig.set_num_filters(CHANNELS);
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              /* dim= */ INPUT_SIZE,
+                              /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "featmap_expand",
+                  /*batch_size*/ 100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(Layer, MultiplexLayer) {
+  TestConfig config;
+  const int LAYER_SIZE = 100;
+  config.layerConfig.set_type("multiplex");
+  config.layerConfig.set_size(LAYER_SIZE);
+
+  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
+  }
+}
+
 TEST(Layer, PadLayer) {
   TestConfig config;
   config.biasSize = 0;
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 6e5922166d..c80ed02744 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1109,7 +1109,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
         'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
@@ -1134,10 +1134,10 @@ def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
     pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
                                          pool_conf.padding, pool_conf.stride,
-                                         False)
+                                         not ceil_mode)
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
-                                         pool_conf.stride_y, False)
+                                         pool_conf.stride_y, not ceil_mode)
 
 
 def parse_spp(spp, input_layer_name, spp_conf):
@@ -1810,9 +1810,8 @@ class ConvTransLayer(ConvTransLayerBase):
 
 @config_layer('norm')
 class NormLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(NormLayer, self).__init__(
-            name, 'norm', 0, inputs=inputs, device=device)
+    def __init__(self, name, inputs, **xargs):
+        super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
@@ -1824,23 +1823,22 @@ class NormLayer(LayerBase):
 
 @config_layer('pool')
 class PoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(PoolLayer, self).__init__(
-            name, 'pool', 0, inputs=inputs, device=device)
+    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+        super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf)
+                       pool_conf, ceil_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
 
 @config_layer('spp')
 class SpatialPyramidPoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(SpatialPyramidPoolLayer, self).__init__(
-            name, 'spp', 0, inputs=inputs, device=device)
+            name, 'spp', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             spp_conf = self.config.inputs[input_index].spp_conf
@@ -1877,7 +1875,6 @@ class BatchNormLayer(LayerBase):
                  inputs,
                  active_type="linear",
                  bias=True,
-                 device=None,
                  use_global_stats=True,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
@@ -1919,7 +1916,6 @@ class BatchNormLayer(LayerBase):
             0,
             active_type=active_type,
             inputs=inputs,
-            device=device,
             **xargs)
 
         if use_global_stats is not None:
@@ -1953,9 +1949,9 @@ class BatchNormLayer(LayerBase):
 
 @config_layer('trans')
 class TransLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(TransLayer, self).__init__(
-            name, 'trans', 0, inputs=inputs, device=device)
+            name, 'trans', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
             'TransLayer must have one and only one input')
@@ -1964,9 +1960,9 @@ class TransLayer(LayerBase):
 
 @config_layer('resize')
 class ResizeLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
+    def __init__(self, name, size, inputs, **xargs):
         super(ResizeLayer, self).__init__(
-            name, 'resize', size=size, inputs=inputs, device=device)
+            name, 'resize', size=size, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
             'ResizeLayer must have one and only one input')
@@ -1974,9 +1970,9 @@ class ResizeLayer(LayerBase):
 
 @config_layer('blockexpand')
 class BlockExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(BlockExpandLayer, self).__init__(
-            name, 'blockexpand', 0, inputs=inputs, device=device)
+            name, 'blockexpand', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             parse_block_expand(
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 56c335a050..3f88727b63 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1980,7 +1980,8 @@ def img_pool_layer(input,
                    layer_attr=None,
                    pool_size_y=None,
                    stride_y=None,
-                   padding_y=None):
+                   padding_y=None,
+                   ceil_mode=True):
     """
     Image pooling Layer.
 
@@ -2011,6 +2012,23 @@ def img_pool_layer(input,
     :type stride_y: int|None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Whether to use ceil mode to calculate output height and width.
+                      Default is True. If set to False, floor mode is used instead.
+
+                      - ceil_mode=True:
+
+                      ..  math::
+
+                          w = 1 + int(ceil((input_width + 2 * padding - pool_size) / float(stride)))
+                          h = 1 + int(ceil((input_height + 2 * padding_y - pool_size_y) / float(stride_y)))
+
+                      - ceil_mode=False:
+
+                      ..  math::
+
+                          w = 1 + int(floor((input_width + 2 * padding - pool_size) / float(stride)))
+                          h = 1 + int(floor((input_height + 2 * padding_y - pool_size_y) / float(stride_y)))
+    :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2048,6 +2066,7 @@ def img_pool_layer(input,
                     stride_y=stride_y,
                     padding_y=padding_y))
         ],
+        ceil_mode=ceil_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,

From 74f76e61d885dcc0478086c54ad330875b92bbd4 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Mon, 9 Jan 2017 10:30:39 +0800
Subject: [PATCH 07/88] update code and fix bug

---
 paddle/function/PadOp.cpp               | 12 ++++++------
 paddle/function/PadOpTest.cpp           |  2 +-
 paddle/gserver/tests/test_LayerGrad.cpp |  8 +++-----
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index 2dfe03dcf6..ff13f27e94 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -92,9 +92,9 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(1, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(0UL, inouts.size());
 
     size_t num = inputs[0].dims_[0];
     size_t inC = inputs[0].dims_[1];
@@ -143,9 +143,9 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(1, inputs.size());
-    CHECK_EQ(0, outputs.size());
-    CHECK_EQ(1, inouts.size());
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(0UL, outputs.size());
+    CHECK_EQ(1UL, inouts.size());
 
     size_t n = inouts[0].dims_[0];
     size_t inC = inouts[0].dims_[1];
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index c6d573c1d9..be989807d1 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -62,7 +62,7 @@ TEST(PadGrad, real) {
           Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
           Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
           compare.cmpWithArg(
-              {Tensor(nullptr, inDims)}, {}, {Tensor(nullptr, outDims)});
+              {Tensor(nullptr, outDims)}, {}, {Tensor(nullptr, inDims)});
         }
       }
     }
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 0560bb3ed9..8c8e876bd6 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -850,15 +850,13 @@ TEST(Layer, SequenceLastInstanceLayer) {
   testDegradeLayer(true,
                    "seqlastins",
                    "non-seq");  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to
-  seq
+  testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to seq
 }
 
 TEST(Layer, AverageLayer) {
   testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
-  testDegradeLayer(true, "average", "non-seq");   // hasSubseq average to
-  non -
-      seq testDegradeLayer(true, "average", "seq");  // hasSubseq average to seq
+  testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to non-seq
+  testDegradeLayer(true, "average", "seq");      // hasSubseq average to seq
 }
 
 TEST(Layer, SequenceConcatLayer) {

From 93fef79e1f5936fbe45a4474cf20cf2298c5c49d Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Mon, 9 Jan 2017 12:40:49 +0800
Subject: [PATCH 08/88] fix build

---
 paddle/gserver/layers/PadLayer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
index 62c50af32d..9069d382ef 100644
--- a/paddle/gserver/layers/PadLayer.cpp
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -33,9 +33,9 @@ bool PadLayer::init(const LayerMap& layerMap,
                                               : img_conf.img_size());
   inDims_.push_back(img_conf.img_size());
 
-  CHECK_EQ(2UL, pad_conf.pad_c_size());
-  CHECK_EQ(2UL, pad_conf.pad_h_size());
-  CHECK_EQ(2UL, pad_conf.pad_w_size());
+  CHECK_EQ(2, pad_conf.pad_c_size());
+  CHECK_EQ(2, pad_conf.pad_h_size());
+  CHECK_EQ(2, pad_conf.pad_w_size());
   padc_.push_back(pad_conf.pad_c(0));
   padc_.push_back(pad_conf.pad_c(1));
   padh_.push_back(pad_conf.pad_h(0));
@@ -76,7 +76,7 @@ void PadLayer::setOutDims(int batchSize) {
 }
 
 void PadLayer::setTensorDim(int batchSize) {
-  CHECK_EQ(inputLayers_.size(), 1UL);
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
   inDims_[0] = batchSize;
   int h = inputLayers_[0]->getOutput().getFrameHeight();
   if (h != 0) inDims_[2];

From 2e47c9d828ea48b775572384260cc806674663aa Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 11 Jan 2017 17:44:18 +0800
Subject: [PATCH 09/88] Fix bug in DenseScanner of DataProviderConverter.

---
 paddle/py_paddle/dataprovider_converter.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 981d10afda..21d1cb75f4 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -34,6 +34,10 @@ class IScanner(object):
 
 
 class DenseScanner(IScanner):
+    """
+    :type __mat__: numpy.ndarray
+    """
+
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
@@ -47,6 +51,8 @@ class DenseScanner(IScanner):
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
         assert isinstance(self.input_type, dp2.InputType)
+        if self.__mat__.dtype != numpy.float32:
+            self.__mat__ = self.__mat__.astype(numpy.float32)
         m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
         argument.setSlotValue(self.pos, m)
 

From 2629d43ff7c798f1b3cd3be3883449a4b2877c35 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 12 Jan 2017 14:49:19 +0800
Subject: [PATCH 10/88] New FunctionTest

---
 paddle/function/FunctionTest.h | 138 ++++++++++++++++++++++++++++++---
 1 file changed, 126 insertions(+), 12 deletions(-)

diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 32131037f6..2847188fd6 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -15,9 +15,33 @@ limitations under the License. */
 #include "Function.h"
 #include "paddle/math/Vector.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 
 namespace paddle {
 
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // Here the input and output can not contain real data,
+ *  // only contains the argument type and shape.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // According to the type and shape of the arguments (inputs_/outputs_),
+ *  // automatically initializes the arguments required by the cpu and gpu
+ *  // functions (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
+ *  // Calls the CPU and GPU Functions to calculate the results.
+ *  // Compares the CPU and GPU calculation results for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
@@ -27,6 +51,32 @@ public:
     gpu->init(config);
   }
 
+  void addInputs(const BufferArg& input) { inputs.push_back(input); }
+
+  void addOutputs(const BufferArg& output) { outputs.push_back(output); }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    prepareArgs();
+
+    // function calculate
+    cpu->calc(cpuInputs, cpuOutputs);
+    gpu->calc(gpuInputs, gpuOutputs);
+
+    // check outputs and inouts
+    auto checkArgs = [=](const BufferArgs& cpuArgs, const BufferArgs& gpuArgs) {
+      for (size_t i = 0; i < cpuArgs.size(); i++) {
+        auto cpu = cpuArgs[i];
+        auto gpu = gpuArgs[i];
+        CpuVector cpuVector(cpu.shape().getElements(), (real*)cpu.getData());
+        GpuVector gpuVector(cpu.shape().getElements(), (real*)gpu.getData());
+
+        autotest::TensorCheckErr(cpuVector, gpuVector);
+      }
+    };
+    checkArgs(cpuOutputs, gpuOutputs);
+  }
+#if 0
   void cmpWithArg(const Arguments& inputs,
                   const Arguments& outputs,
                   const Arguments& inouts) {
@@ -64,11 +114,10 @@ public:
     };
     initArgs(cpuInputs, gpuInputs, inputs);
     initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
 
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
+    cpu->calc(cpuInputs, cpuOutputs);
+    gpu->calc(gpuInputs, gpuOutputs);
 
     // check outputs and inouts
     auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
@@ -86,24 +135,89 @@ public:
       }
     };
     checkArgs(cpuOutputs, gpuOutputs);
-    checkArgs(cpuInouts, gpuInouts);
   }
+#endif
 
   std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
 
   std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
 
+protected:
+  void prepareArgs() {
+    // TODO, if inputs has data
+  }
+
+  void createArg(BufferArgs& cpuArgs, BufferArgs& gpuArgs, BufferArg& arg) {
+    size_t size = arg.shape().getElements() * sizeOfValuType(arg.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuArgs.emplace_back(
+        BufferArg(cpuMemory_.back()->getBuf()), arg.valueType(), arg.shape());
+    gpuArgs.emplace_back(
+        BufferArg(gpuMemory_.back()->getBuf()), arg.valueType(), arg.shape());
+  }
+
+  void createArg(BufferArgs& cpuArgs, BufferArgs& gpuArgs, SequenceArg& arg) {
+    size_t batchSize = arg.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    TensorShape seqsId({numSeqs + 1});
+    void* cpuBuffer = cpuMemory_.back()->getBuf();
+    void* gpuBuffer = gpuMemory_.back()->getBuf();
+
+    size_t size = arg.shape().getElements() * sizeOfValuType(arg.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuArgs.emplace_back(SequenceArg(cpuMemory_.back()->getBuf(),
+                                     arg.valueType(),
+                                     arg.shape(),
+                                     SequenceIdArg(cpuBuffer, seqsId)));
+    gpuArgs.emplace_back(SequenceArg(gpuMemory_.back()->getBuf(),
+                                     arg.valueType(),
+                                     arg.shape(),
+                                     SequenceIdArg(gpuBuffer, seqsId)));
+  }
+
+  // only init cpu argument, gpu argument copy from cpu argument.
+  void initArg(BufferArg& arg) {
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = arg.data();
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < numSeqs; ++i) {
+      int len = uniformRandom(
+                    std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
+                1;
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
+
 protected:
   std::shared_ptr<FunctionBase> cpu;
   std::shared_ptr<FunctionBase> gpu;
-  std::vector<CpuMemHandlePtr> cpuMemory;
-  std::vector<GpuMemHandlePtr> gpuMemory;
-  Arguments cpuInputs;
-  Arguments cpuOutputs;
-  Arguments cpuInouts;
-  Arguments gpuInputs;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
+  // inputs and outputs
+  BufferArgs inputs;
+  BufferArgs outputs;
+  BufferArgs cpuInputs_;
+  BufferArgs cpuOutputs_;
+  BufferArgs gpuInputs_;
+  BufferArgs gpuOutputs_;
 };
 
 }  // namespace paddle

From fdf194aeaf6df02fde1165737def451a5fec8e73 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 12 Jan 2017 18:03:18 +0800
Subject: [PATCH 11/88] move a test case from BufferArgTest.cpp to
 FunctionTest.cpp

---
 paddle/function/BufferArgTest.cpp | 53 -------------------------------
 paddle/function/FunctionTest.cpp  | 52 ++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
index b345597435..1744f37780 100644
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "BufferArg.h"
 #include <gtest/gtest.h>
-#include "Function.h"
 #include "paddle/math/MemoryHandle.h"
-#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -37,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) {
   EXPECT_EQ(buffer.numSeqs(), 9);
 }
 
-TEST(BufferTest, asArgument) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  VectorPtr vector = Vector::create(100, false);
-  CpuSparseMatrix sparse(200, 300, 50);
-
-  // prepare arguments
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  argments.addArg(*vector);
-  argments.addArg(sparse);
-
-  // function
-  auto function = [=](const BufferArgs& inputs) {
-    EXPECT_EQ(inputs.size(), 3);
-
-    // check inputs[0]
-    EXPECT_EQ(inputs[0].shape().ndims(), 2);
-    EXPECT_EQ(inputs[0].shape()[0], 100);
-    EXPECT_EQ(inputs[0].shape()[1], 200);
-    EXPECT_EQ(inputs[0].data(), matrix->getData());
-
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
-              matrix->getHeight());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
-              matrix->getWidth());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-
-    // check inputs[1]
-    EXPECT_EQ(inputs[1].shape().ndims(), 1);
-    EXPECT_EQ(inputs[1].shape()[0], 100);
-    EXPECT_EQ(inputs[1].data(), vector->getData());
-    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-
-    // check inputs[2]
-    EXPECT_EQ(inputs[2].shape().ndims(), 2);
-    EXPECT_EQ(inputs[2].shape()[0], 200);
-    EXPECT_EQ(inputs[2].shape()[1], 300);
-    EXPECT_EQ(inputs[2].data(), sparse.getData());
-    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
-    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
-  };
-
-  // call function
-  function(argments);
-}
-
 }  // namespace paddle
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 7ce908320a..6e44c2f5db 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "Function.h"
 #include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -56,4 +57,55 @@ TEST(Function, BufferArgs) {
   Function<DEVICE_TYPE_GPU>(gpuArgments);
 }
 
+TEST(BufferArgs, asArgument) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  VectorPtr vector = Vector::create(100, false);
+  CpuSparseMatrix sparse(200, 300, 50);
+
+  // prepare arguments
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  argments.addArg(*vector);
+  argments.addArg(sparse);
+
+  // function
+  auto function = [=](const BufferArgs& inputs) {
+    EXPECT_EQ(inputs.size(), 3);
+
+    // check inputs[0]
+    EXPECT_EQ(inputs[0].shape().ndims(), 2);
+    EXPECT_EQ(inputs[0].shape()[0], 100);
+    EXPECT_EQ(inputs[0].shape()[1], 200);
+    EXPECT_EQ(inputs[0].data(), matrix->getData());
+
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
+              matrix->getHeight());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
+              matrix->getWidth());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+
+    // check inputs[1]
+    EXPECT_EQ(inputs[1].shape().ndims(), 1);
+    EXPECT_EQ(inputs[1].shape()[0], 100);
+    EXPECT_EQ(inputs[1].data(), vector->getData());
+    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+
+    // check inputs[2]
+    EXPECT_EQ(inputs[2].shape().ndims(), 2);
+    EXPECT_EQ(inputs[2].shape()[0], 200);
+    EXPECT_EQ(inputs[2].shape()[1], 300);
+    EXPECT_EQ(inputs[2].data(), sparse.getData());
+    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
+    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
+  };
+
+  // call function
+  function(argments);
+}
+
 }  // namespace paddle

From bff19f57d193f00240ff52419b2c43a7df662453 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 12 Jan 2017 20:15:59 +0800
Subject: [PATCH 12/88] Add a CheckBufferArg. It is used to check the
 consistency between the BufferArg type argument received by Function and the
 original type argument.

---
 paddle/function/FunctionTest.cpp | 114 ++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 39 deletions(-)

diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 6e44c2f5db..eb05ca9a21 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -57,55 +57,91 @@ TEST(Function, BufferArgs) {
   Function<DEVICE_TYPE_GPU>(gpuArgments);
 }
 
-TEST(BufferArgs, asArgument) {
+/**
+ * Some tests case are used to check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  EXPECT_EQ(inputs.size(), check.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);
+  }
+}
+
+TEST(Arguments, Matrix) {
   MatrixPtr matrix = Matrix::create(100, 200);
-  VectorPtr vector = Vector::create(100, false);
-  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.data(), matrix->getData());
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
 
-  // prepare arguments
   BufferArgs argments;
   argments.addArg(*matrix);
-  argments.addArg(*vector);
-  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.data(), vector->getData());
 
-  // function
-  auto function = [=](const BufferArgs& inputs) {
-    EXPECT_EQ(inputs.size(), 3);
-
-    // check inputs[0]
-    EXPECT_EQ(inputs[0].shape().ndims(), 2);
-    EXPECT_EQ(inputs[0].shape()[0], 100);
-    EXPECT_EQ(inputs[0].shape()[1], 200);
-    EXPECT_EQ(inputs[0].data(), matrix->getData());
-
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
-              matrix->getHeight());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
-              matrix->getWidth());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-
-    // check inputs[1]
-    EXPECT_EQ(inputs[1].shape().ndims(), 1);
-    EXPECT_EQ(inputs[1].shape()[0], 100);
-    EXPECT_EQ(inputs[1].data(), vector->getData());
-    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
     EXPECT_EQ(inVector.getSize(), vector->getSize());
     EXPECT_EQ(inVector.getData(), vector->getData());
+  };
 
-    // check inputs[2]
-    EXPECT_EQ(inputs[2].shape().ndims(), 2);
-    EXPECT_EQ(inputs[2].shape()[0], 200);
-    EXPECT_EQ(inputs[2].shape()[1], 300);
-    EXPECT_EQ(inputs[2].data(), sparse.getData());
-    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
-    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
   };
 
-  // call function
-  function(argments);
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
 }
 
 }  // namespace paddle

From edad8a6913a10ab83fd9463c6dce92f93cb5e315 Mon Sep 17 00:00:00 2001
From: zhanghaichao <zhanghaichao@baidu.com>
Date: Thu, 12 Jan 2017 11:02:17 -0800
Subject: [PATCH 13/88] Fix AggregateLevel constant name in hierarchical layer doc

---
 doc/howto/deep_model/rnn/hierarchical_layer_cn.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 943b1d4bb8..4b328fc9d3 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -32,7 +32,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
         
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
 
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
@@ -54,7 +54,7 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
         last = last_seq(input=layer,
                         agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列

From 86fa8c05280e18c6fc4a569931d9f50fd9467546 Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Thu, 5 Jan 2017 11:05:18 -0800
Subject: [PATCH 14/88] Wei Xu's comments, set up right inouts.

---
 paddle/function/ContextProjectionOp.cpp     | 124 +++++++++++++-------
 paddle/function/ContextProjectionOp.h       |  20 ++--
 paddle/function/ContextProjectionOpGpu.cu   |  25 ++--
 paddle/function/ContextProjectionOpTest.cpp |  34 +++---
 4 files changed, 124 insertions(+), 79 deletions(-)

diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index cb448562eb..8803ea7896 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -70,10 +70,11 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 }
 
 /**
+ * \param outputs[0] output value.
+ *
  * \param inputs[0] input value.
  * \param inputs[1] input weight.
  * \param inputs[2] input sequence.
- * \param outputs[0] output value.
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -123,7 +124,8 @@ private:
 };
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+<<<<<<< HEAD
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
                                                 CpuMatrix& in_grad_mat,
                                                 CpuMatrix& w_grad_mat,
                                                 const CpuIVector& seq_vec,
@@ -176,10 +178,10 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
 }
 
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] weight grad.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * \param inputs[0]     input sequence.
+ * \param inputs[1]     output grad.
+ * \param inouts[0]     input grad.
+ * \param inouts[1]     weight grad.
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -192,6 +194,7 @@ public:
     total_pad_ = config.get<size_t>("total_pad");
   }
 
+<<<<<<< HEAD
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ((size_t)3, inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
@@ -210,6 +213,42 @@ public:
     CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
 
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+=======
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(0, outputs.size());
+    CHECK_EQ(2, inouts.size());
+
+    CHECK(inputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(inputs[0].dims_.size(), 1);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inouts[0].dims_.size(), 2);
+    CHECK_EQ(inouts[1].dims_.size(), 2);
+
+    /// dim of input grad == dim of weight grad
+    CHECK_EQ(inouts[0].dims_[1], inouts[1].dims_[1]);
+    /// input grad and output grad have the same batch_size
+    CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(inputs[1].dims_[1], inputs[0].dims_[1] * context_length_);
+
+    typename SequenceT<Device>::type seq_vec(
+        inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
+    const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    auto in_grad_mat =
+        !inouts[0].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
+    auto w_grad_mat =
+        !inouts[1].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inouts[1].getData(), inouts[1].dims_[0], inouts[1].dims_[1]);
+>>>>>>> Wei Xu's comments, set up right inouts.
 
     auto out_grad_mat = outputs[0].matrix<Device>();
     auto in_grad_mat =
@@ -240,9 +279,9 @@ private:
 
 #if 0
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * \param inouts[0]    input grad.
+ * \param inputs[0]    input sequence.
+ * \param inputs[1]    output grad.
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -255,23 +294,24 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(0, outputs.size());
+    CHECK_EQ(1, inouts.size());
+
+    CHECK(inouts[0].getData() && inputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(inputs[0].dims_.size(), 1);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inouts[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
+    /// input and output grad have the same batch_size
+    CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
 
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
     typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+        inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
+    const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
 
     ContextProjectionBackwardData<Device>(out_grad_mat.get(),
                                           in_grad_mat.get(),
@@ -286,9 +326,9 @@ private:
 };
 
 /**
- * \param inputs[0] weight grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * \param inouts[0]    weight grad.
+ * \param inputs[0]    input sequence.
+ * \param inputs[1]    output grad.
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -303,22 +343,22 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
-
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(0, outputs.size());
+    CHECK_EQ(1, inouts.size());
+
+    CHECK(inouts[0].getData() && inputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(inputs[0].dims_.size(), 1);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inouts[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
+
     typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+        inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
+    const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
 
     ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
                                             w_grad_mat.get(),
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index a558df5e07..8e956c6c6f 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -21,14 +21,14 @@ namespace paddle {
 /**
  * \brief   Context Projection Forward.
  *
- * \param[out]  outputs           output data.
- * \param[in]   input             input data.
- * \param[in]   weight            input weight.
- * \param[in]   sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
+ * \param[in/out]  outputs           output data.
+ * \param[in]      input             input data.
+ * \param[in]      weight            input weight.
+ * \param[in]      sequence          input data.
+ * \param[in]      context_length    consecutive rows for concatenation.
+ * \param[in]      context_start     context start position.
+ * \param[in]      begin_pad         begining pad position.
+ * \param[in]      is_padding        whether padding 0 or not.
  *
  */
 template <DeviceType DType>
@@ -68,7 +68,7 @@ void ContextProjectionBackward(
 
 template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& in_grad,
     const typename Tensor<int, DType>::Vector& sequence,
     size_t context_length,
@@ -76,7 +76,7 @@ void ContextProjectionBackwardData(
 
 template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& w_grad,
     const typename Tensor<int, DType>::Vector& seq_vec,
     size_t context_length,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 6a4a01a651..6194ad8e74 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
                                 begin_pad);
 }
 
-__global__ void KeContextProjectionBackwardData(real* out_grad,
+__global__ void KeContextProjectionBackwardData(const real* out_grad,
                                                 const int* sequence,
                                                 real* in_grad,
-                                                int input_dim,
+                                                size_t input_dim,
                                                 int context_length,
                                                 int context_start) {
   int idx = threadIdx.x;
@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
   real value = 0;
 
   int instances = seq_end - seq_start + context_length - 1;
-  out_grad += seq_start * input_dim * context_length;
+  auto out = const_cast<real*>(out_grad);
+  out += seq_start * input_dim * context_length;
   in_grad += seq_start * input_dim;
   for (int k = 0; k <= input_dim / block_size; k++) {
     if (idx < input_dim) {
@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
         int outx = (i - context_length) < 0 ? i : (context_length - 1);
         int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
         real* output_r =
-          out_grad + outy * input_dim * context_length + outx * input_dim;
+          out + outy * input_dim * context_length + outx * input_dim;
         for (int j = outy; j < seq_end - seq_start; j++) {
           value += output_r[idx];
           if (j - outy == outx) break;
@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
  * @param[in]   context_start    context start.
  *
  */
-void hl_context_projection_backward_data(real* out_grad,
+void hl_context_projection_backward_data(const real* out_grad,
                                          const int* sequence,
                                          real* input_grad,
                                          size_t num_sequences,
@@ -216,7 +217,8 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+<<<<<<< HEAD
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                     GpuMatrix& in_grad,
                                                     const GpuIVector& sequence,
                                                     size_t context_length,
@@ -231,7 +233,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
 }
 
 template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,
                                                   int num_sequences,
@@ -254,7 +256,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
       int seq_end = sequence[seqId+1];
-      output_r = out_grad + seq_start * w_dim * context_length;
+      output_r = const_cast<real*>(out_grad) 
+                    + seq_start * w_dim * context_length;
 
       if (context_start < 0) {
         if (padId + context_start < 0) {
@@ -318,7 +321,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
  * beginning.
  *
  */
-void hl_context_projection_backward_weight(real* out_grad,
+void hl_context_projection_backward_weight(const real* out_grad,
                                            const int* sequence,
                                            real* w_grad,
                                            size_t num_sequences,
@@ -346,7 +349,7 @@ void hl_context_projection_backward_weight(real* out_grad,
 
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix& out_grad,
+        const GpuMatrix& out_grad,
         GpuMatrix& w_grad,
         const GpuIVector& seq_vec,
         size_t context_length,
@@ -365,7 +368,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 GpuMatrix& in_grad,
                                                 GpuMatrix& w_grad,
                                                 const GpuIVector& sequence,
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 6223d2fd23..169c1dd505 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -62,16 +62,18 @@ void testMatrixProjectionForward(int context_start,
               Dims{pad, input_dim}),
        Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
               Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
+      {},
+      {Tensor(cpu_out.getData(),
+              Dims{batch_size, input_dim * context_length})});
   compare.getGpuFunction()->calc(
       {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
        Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
               Dims{pad, input_dim}),
        Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
               Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
+      {},
+      {Tensor(gpu_out.getData(),
+              Dims{batch_size, input_dim * context_length})});
 
   autotest::TensorCheckEqual(cpu_out, gpu_out);
 }
@@ -118,24 +120,24 @@ void testMatrixProjectionBackward(int context_start,
   }
 
   compare.getCpuFunction()->calc(
+      {Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()}),
+       Tensor(cpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {},
       {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
        Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
+              Dims{pad, input_dim})});
 
   compare.getGpuFunction()->calc(
+      {Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()}),
+       Tensor(gpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {},
       {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
        Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
+              Dims{pad, input_dim})});
 
   autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
   if (is_padding) {

From df66957ec32f38b45b3e7274ef8f99699391854f Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Thu, 5 Jan 2017 11:14:51 -0800
Subject: [PATCH 15/88] Clean up the code a little.

---
 paddle/function/ContextProjectionOp.cpp   | 2 +-
 paddle/function/ContextProjectionOpGpu.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 8803ea7896..f1e42cad72 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -232,7 +232,7 @@ public:
     /// input grad and output grad have the same batch_size
     CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
     /// dim of output = dim of input * context_length
-    CHECK_EQ(inputs[1].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
 
     typename SequenceT<Device>::type seq_vec(
         inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 6194ad8e74..c5a636dce8 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -256,7 +256,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
       int seq_end = sequence[seqId+1];
-      output_r = const_cast<real*>(out_grad) 
+      output_r = const_cast<real*>(out_grad)
                     + seq_start * w_dim * context_length;
 
       if (context_start < 0) {

From 1482ec430a918cc5f9b44c3acf9d60d895c05b26 Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Sat, 7 Jan 2017 13:57:31 -0800
Subject: [PATCH 16/88] Add some comments.

---
 paddle/function/ContextProjectionOp.cpp | 88 ++++++++++++-------------
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index f1e42cad72..75c09108b1 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -18,6 +18,10 @@ limitations under the License. */
 
 namespace paddle {
 
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ *
+ */
 template <>
 void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
                                                const CpuMatrix& input_mat,
@@ -70,11 +74,29 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 }
 
 /**
- * \param outputs[0] output value.
+ * Paddle Function for Context Projection Forward.
+ * Calculate the value for the output layer with context projection.
+ *
+ * What is Context Projection?
+ * For example, assumed input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_lenth is 3, the output (y) is:
  *
- * \param inputs[0] input value.
- * \param inputs[1] input weight.
- * \param inputs[2] input sequence.
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0] output value.
+ * \param inputs[0]  input value.
+ * \param inputs[1]  input weight.
+ * \param inputs[2]  input sequence.
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -123,6 +145,10 @@ private:
   size_t begin_pad_;
 };
 
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ *
+ */
 template <>
 <<<<<<< HEAD
 void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
@@ -178,10 +204,13 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
 }
 
 /**
- * \param inputs[0]     input sequence.
- * \param inputs[1]     output grad.
- * \param inouts[0]     input grad.
- * \param inouts[1]     weight grad.
+ * Context Projection Backward Function.
+ * Update the weight gradient and input layer gradient with backprop
+ *
+ * \param inputs[0]      input sequence.
+ * \param inputs[1]      output grad.
+ * \param inouts[0]      input grad.
+ * \param inouts[1]      weight grad.
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -194,7 +223,6 @@ public:
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-<<<<<<< HEAD
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ((size_t)3, inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
@@ -213,42 +241,6 @@ public:
     CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
 
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-=======
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, inputs.size());
-    CHECK_EQ(0, outputs.size());
-    CHECK_EQ(2, inouts.size());
-
-    CHECK(inputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(inputs[0].dims_.size(), 1);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inouts[0].dims_.size(), 2);
-    CHECK_EQ(inouts[1].dims_.size(), 2);
-
-    /// dim of input grad == dim of weight grad
-    CHECK_EQ(inouts[0].dims_[1], inouts[1].dims_[1]);
-    /// input grad and output grad have the same batch_size
-    CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
-
-    typename SequenceT<Device>::type seq_vec(
-        inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
-    const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    auto in_grad_mat =
-        !inouts[0].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
-    auto w_grad_mat =
-        !inouts[1].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inouts[1].getData(), inouts[1].dims_[0], inouts[1].dims_[1]);
->>>>>>> Wei Xu's comments, set up right inouts.
 
     auto out_grad_mat = outputs[0].matrix<Device>();
     auto in_grad_mat =
@@ -279,6 +271,9 @@ private:
 
 #if 0
 /**
+ * Context Projection Backward Data Function.
+ * Update gradient of the input layer with backprop.
+ *
  * \param inouts[0]    input grad.
  * \param inputs[0]    input sequence.
  * \param inputs[1]    output grad.
@@ -326,6 +321,9 @@ private:
 };
 
 /**
+ * Context Projection Backward Weight Function.
+ * Update weight gradient with backprop.
+ *
  * \param inouts[0]    weight grad.
  * \param inputs[0]    input sequence.
  * \param inputs[1]    output grad.

From 23ac0b78cb472e2f5007531427e142d553831e91 Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Tue, 10 Jan 2017 16:13:41 -0800
Subject: [PATCH 17/88] merge Daoyuan's FuncArgs, pass the ContextProjection
 test.

---
 paddle/function/CMakeLists.txt              |   2 +-
 paddle/function/ContextProjectionOp.cpp     | 181 ++++----------------
 paddle/function/ContextProjectionOp.h       |   2 +-
 paddle/function/ContextProjectionOpGpu.cu   |   1 -
 paddle/function/ContextProjectionOpTest.cpp |  75 ++++----
 paddle/function/FunctionTest.h              |  72 ++------
 paddle/gserver/layers/ContextProjection.cpp |  15 +-
 7 files changed, 101 insertions(+), 247 deletions(-)

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 75a2acc55e..39733479cc 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -24,7 +24,7 @@ if(WITH_TESTING)
     add_simple_unittest(TensorTypeTest)
     add_simple_unittest(BufferArgTest)
     add_simple_unittest(FunctionTest)
-    # add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(ContextProjectionOpTest)
 endif()
 endif()
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 75c09108b1..42b78eacfd 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -125,11 +125,11 @@ public:
 
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     auto out_mat = outputs[0].matrix<Device>();
-    auto in_mat = inputs[0].matrix<Device>();
-    auto w_mat = !inputs[1].data()
-                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                     : inputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
+    const auto in_mat = inputs[0].matrix<Device>();
+    const auto w_mat =
+        !inputs[1].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[1].matrix<Device>();
+    const auto seq_vec = inputs[2].vector<int, Device>();
     ContextProjectionForward<Device>(out_mat,
                                      in_mat,
                                      w_mat,
@@ -150,7 +150,6 @@ private:
  *
  */
 template <>
-<<<<<<< HEAD
 void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
                                                 CpuMatrix& in_grad_mat,
                                                 CpuMatrix& w_grad_mat,
@@ -174,7 +173,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
           MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
@@ -185,8 +185,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
         int64_t pad_size =
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat =
-              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
           MatrixPtr sub = w_grad_mat.subMatrix(
               begin_pad + context_start + j - pad_size, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
@@ -197,7 +197,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
       if (end <= begin) continue;
       if (!in_grad_mat) continue;
       MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
       src->addAtOffset(*dst, j * input_dim);
     }
   }
@@ -207,10 +208,10 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
  * Context Projection Backward Function.
  * Update the weight gradient and input layer gradient with backprop
  *
- * \param inputs[0]      input sequence.
- * \param inputs[1]      output grad.
- * \param inouts[0]      input grad.
- * \param inouts[1]      weight grad.
+ * \param inputs[0]       input sequence.
+ * \param inputs[1]       output layer grad.
+ * \param outputs[0]      input layer grad.
+ * \param outputs[1]      weight grad.
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -224,32 +225,34 @@ public:
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
+    CHECK_EQ((size_t)2, inputs.size());
+    CHECK_EQ((size_t)2, outputs.size());
 
-    CHECK(outputs[0].data() && inputs[2].data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK(inputs[0].data() && inputs[1].data());
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)1);
     CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
 
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
-    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
+    /// dim of input grad == dim of weight
+    CHECK_EQ(outputs[0].shape()[1], outputs[1].shape()[1]);
+    /// input and output grad has the same batch_size
+    CHECK_EQ(outputs[0].shape()[0], inputs[1].shape()[0]);
+    /// dim of output val = dim of input grad * context_length
+    CHECK_EQ(inputs[1].shape()[1], outputs[0].shape()[1] * context_length_);
 
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
 
-    auto out_grad_mat = outputs[0].matrix<Device>();
+    const auto seq_vec = inputs[0].vector<int, Device>();
+    const auto out_grad_mat = inputs[1].matrix<Device>();
     auto in_grad_mat =
-        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[0].matrix<Device>();
-    auto w_grad_mat = !inputs[1].data()
+        !outputs[0].data()
+            ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+            : outputs[0].matrix<Device>();
+    auto w_grad_mat = !outputs[1].data()
                           ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
+                          : outputs[1].matrix<Device>();
     ContextProjectionBackward<Device>(out_grad_mat,
                                       in_grad_mat,
                                       w_grad_mat,
@@ -269,112 +272,6 @@ private:
   size_t total_pad_;
 };
 
-#if 0
-/**
- * Context Projection Backward Data Function.
- * Update gradient of the input layer with backprop.
- *
- * \param inouts[0]    input grad.
- * \param inputs[0]    input sequence.
- * \param inputs[1]    output grad.
- */
-template <DeviceType Device>
-class ContextProjectionBackwardDataFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-  }
-
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, inputs.size());
-    CHECK_EQ(0, outputs.size());
-    CHECK_EQ(1, inouts.size());
-
-    CHECK(inouts[0].getData() && inputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(inputs[0].dims_.size(), 1);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inouts[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
-    /// input and output grad have the same batch_size
-    CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
-
-    typename SequenceT<Device>::type seq_vec(
-        inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
-    const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
-
-    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
-                                          in_grad_mat.get(),
-                                          seq_vec,
-                                          context_length_,
-                                          context_start_);
-  }
-
-private:
-  size_t context_length_;
-  int context_start_;
-};
-
-/**
- * Context Projection Backward Weight Function.
- * Update weight gradient with backprop.
- *
- * \param inouts[0]    weight grad.
- * \param inputs[0]    input sequence.
- * \param inputs[1]    output grad.
- */
-template <DeviceType Device>
-class ContextProjectionBackwardWeightFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, inputs.size());
-    CHECK_EQ(0, outputs.size());
-    CHECK_EQ(1, inouts.size());
-
-    CHECK(inouts[0].getData() && inputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(inputs[0].dims_.size(), 1);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inouts[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
-
-    typename SequenceT<Device>::type seq_vec(
-        inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
-    const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
-
-    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
-                                            w_grad_mat.get(),
-                                            seq_vec,
-                                            context_length_,
-                                            context_start_,
-                                            total_pad_,
-                                            begin_pad_);
-  }
-
-private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  size_t total_pad_;
-};
-#endif
-
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     CPU,
                     ContextProjectionForwardFunc);
@@ -388,13 +285,5 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     GPU,
                     ContextProjectionBackwardFunc);
-#if 0
-REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
-                    GPU,
-                    ContextProjectionBackwardDataFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
-                    GPU,
-                    ContextProjectionBackwardWeightFunc);
-#endif
 #endif
 }  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index 8e956c6c6f..2bdd47e4e9 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -56,7 +56,7 @@ void ContextProjectionForward(
  */
 template <DeviceType DType>
 void ContextProjectionBackward(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& in_grad,
     typename Tensor<real, DType>::Matrix& w_grad,
     const typename Tensor<int, DType>::Vector& seq_vec,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index c5a636dce8..1a5b404240 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -217,7 +217,6 @@ void hl_context_projection_backward_data(const real* out_grad,
 }
 
 template <>
-<<<<<<< HEAD
 void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                     GpuMatrix& in_grad,
                                                     const GpuIVector& sequence,
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 169c1dd505..c8d5b4f278 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -56,24 +56,25 @@ void testMatrixProjectionForward(int context_start,
   cpu_out.randomizeUniform();
   gpu_out.copyFrom(cpu_out);
 
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {},
-      {Tensor(cpu_out.getData(),
-              Dims{batch_size, input_dim * context_length})});
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {},
-      {Tensor(gpu_out.getData(),
-              Dims{batch_size, input_dim * context_length})});
+  BufferArgs cpu_inputs;
+  BufferArgs cpu_outputs;
+  cpu_inputs.addArg(cpu_in);
+  cpu_inputs.addArg(cpu_weight ? *cpu_weight
+                               : CpuMatrix(nullptr, 0, input_dim));
+  cpu_inputs.addArg(*cpu_seq);
+  cpu_outputs.addArg(cpu_out, ADD_TO);
+
+  compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
+
+  BufferArgs gpu_inputs;
+  BufferArgs gpu_outputs;
+  gpu_inputs.addArg(gpu_in);
+  gpu_inputs.addArg(gpu_weight ? *gpu_weight
+                               : GpuMatrix(nullptr, 0, input_dim));
+  gpu_inputs.addArg(*gpu_seq);
+  gpu_outputs.addArg(gpu_out, ADD_TO);
+
+  compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
 
   autotest::TensorCheckEqual(cpu_out, gpu_out);
 }
@@ -119,25 +120,25 @@ void testMatrixProjectionBackward(int context_start,
     gpu_w_grad->copyFrom(*cpu_w_grad);
   }
 
-  compare.getCpuFunction()->calc(
-      {Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()}),
-       Tensor(cpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {},
-      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim})});
-
-  compare.getGpuFunction()->calc(
-      {Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()}),
-       Tensor(gpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {},
-      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim})});
+  BufferArgs cpu_inputs;
+  BufferArgs cpu_outputs;
+  cpu_inputs.addArg(*cpu_seq);
+  cpu_inputs.addArg(cpu_out_grad);
+  cpu_outputs.addArg(cpu_in_grad, ADD_TO);
+  cpu_outputs.addArg(
+      cpu_w_grad ? *cpu_w_grad : CpuMatrix(nullptr, 0, input_dim), ADD_TO);
+
+  compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
+
+  BufferArgs gpu_inputs;
+  BufferArgs gpu_outputs;
+  gpu_inputs.addArg(*gpu_seq);
+  gpu_inputs.addArg(gpu_out_grad);
+  gpu_outputs.addArg(gpu_in_grad, ADD_TO);
+  gpu_outputs.addArg(
+      gpu_w_grad ? *gpu_w_grad : GpuMatrix(nullptr, 0, input_dim), ADD_TO);
+
+  compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
 
   autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
   if (is_padding) {
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 32131037f6..da4c0f4f07 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -27,66 +27,28 @@ public:
     gpu->init(config);
   }
 
-  void cmpWithArg(const Arguments& inputs,
-                  const Arguments& outputs,
-                  const Arguments& inouts) {
+  void cmpWithArg(const BufferArgs& inputs,
+                  const BufferArgs& outputs,
+                  const BufferArgs& inouts) {
     // init cpu and gpu arguments
     auto initArgs = [=](
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (const auto arg : inArgs) {
-        size_t size = sizeof(real);
-        for (const auto dim : arg.dims_) {
-          size *= dim;
-        }
-        if (arg.getData()) {
-          // todo(tianbing), waste unnecessary mem here
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          // already init outside
-        } else {
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(
-              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-          gpuArgs.emplace_back(
-              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-          // will use an api to refactor this code.
-          CpuVector cpuVector(size / sizeof(real),
-                              (real*)cpuArgs.back().getData());
-          GpuVector gpuVector(size / sizeof(real),
-                              (real*)gpuArgs.back().getData());
-          cpuVector.uniform(0.001, 1);
-          gpuVector.copyFrom(cpuVector);
-        }
-      }
+        BufferArgs& cpuArgs, BufferArgs& gpuArgs, const BufferArgs& inArgs) {
+      /// leave it empty to pass the compile of ContextProjectionTest
+      /// Daoyuan is working on FunctionTest
+      /// and I will further merge with it
     };
     initArgs(cpuInputs, gpuInputs, inputs);
     initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
 
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
+    cpu->calc(cpuInputs, cpuOutputs);
+    gpu->calc(gpuInputs, gpuOutputs);
 
     // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
-          size *= dim;
-        }
-        CpuVector cpuVector(size, (real*)cpu.getData());
-        GpuVector gpuVector(size, (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
-      }
+    auto checkArgs = [=](const BufferArgs& cpuArgs, const BufferArgs& gpuArgs) {
+      /// leave it open
     };
     checkArgs(cpuOutputs, gpuOutputs);
-    checkArgs(cpuInouts, gpuInouts);
   }
 
   std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
@@ -98,12 +60,12 @@ protected:
   std::shared_ptr<FunctionBase> gpu;
   std::vector<CpuMemHandlePtr> cpuMemory;
   std::vector<GpuMemHandlePtr> gpuMemory;
-  Arguments cpuInputs;
-  Arguments cpuOutputs;
-  Arguments cpuInouts;
-  Arguments gpuInputs;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
+  BufferArgs cpuInputs;
+  BufferArgs cpuOutputs;
+  BufferArgs cpuInouts;
+  BufferArgs gpuInputs;
+  BufferArgs gpuOutputs;
+  BufferArgs gpuInouts;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index ebcc87cbf4..def7c15ca8 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -166,13 +166,16 @@ void ContextProjection::backward(const UpdateCallback& callback) {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(CpuMatrix(
-      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                          w_ptr ? w_ptr->getHeight() : 0,
-                          input_dim));
   inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->grad, ADD_TO);
+  inputs.addArg(*out_->grad);
+  outputs.addArg(
+      CpuMatrix(
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
   backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {

From 9edfd2003153dd122b01300c3973249be1abd8c1 Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Tue, 10 Jan 2017 20:10:16 -0800
Subject: [PATCH 18/88] use Daoyuan's SequenceArg.

---
 paddle/function/BufferArg.h                 |  3 ++-
 paddle/function/ContextProjectionOp.cpp     | 26 ++++++++++-----------
 paddle/function/ContextProjectionOpTest.cpp |  6 ++---
 paddle/function/Function.cpp                |  6 +++++
 paddle/function/Function.h                  |  4 ++++
 paddle/gserver/layers/ContextProjection.cpp |  3 +--
 6 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 12352ba29e..f3a4350e12 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -190,7 +190,7 @@ private:
   size_t numSeqs_;
 };
 
-// sequence data
+// sequence data {seqId(vec), buf(matrix)}
 class SequenceArg : public BufferArg {
 public:
   SequenceArg(void* buf,
@@ -210,6 +210,7 @@ public:
 
   void* getIdBuf() const { return startPositions_.data(); }
   size_t numSeqs() const { return startPositions_.numSeqs(); }
+  const SequenceIdArg& getSequenceIds() const { return startPositions_; }
 
 private:
   SequenceIdArg startPositions_;
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 42b78eacfd..177708d00f 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 
 namespace paddle {
-
 /**
  * Context Projection Forward with CPU Matrix Device.
  *
@@ -208,10 +207,10 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
  * Context Projection Backward Function.
  * Update the weight gradient and input layer gradient with backprop
  *
- * \param inputs[0]       input sequence.
- * \param inputs[1]       output layer grad.
- * \param outputs[0]      input layer grad.
- * \param outputs[1]      weight grad.
+ * \param inputs[0].seq          input sequence.
+ * \param inputs[0].matrix       output layer grad.
+ * \param outputs[0]             input layer grad.
+ * \param outputs[1]             weight grad.
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -225,27 +224,28 @@ public:
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)2, inputs.size());
+    CHECK_EQ((size_t)1, inputs.size());
     CHECK_EQ((size_t)2, outputs.size());
 
-    CHECK(inputs[0].data() && inputs[1].data());
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)1);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    const auto seqArg = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(seqArg.data() && inputs[0].data());
+    CHECK_EQ(seqArg.shape().ndims(), (size_t)2);
+    CHECK_EQ(seqArg.getSequenceIds().shape().ndims(), (size_t)1);
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
 
     /// dim of input grad == dim of weight
     CHECK_EQ(outputs[0].shape()[1], outputs[1].shape()[1]);
     /// input and output grad has the same batch_size
-    CHECK_EQ(outputs[0].shape()[0], inputs[1].shape()[0]);
+    CHECK_EQ(outputs[0].shape()[0], seqArg.shape()[0]);
     /// dim of output val = dim of input grad * context_length
-    CHECK_EQ(inputs[1].shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(seqArg.shape()[1], outputs[0].shape()[1] * context_length_);
 
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     CHECK_EQ(outputs[1].getArgType(), ADD_TO);
 
-    const auto seq_vec = inputs[0].vector<int, Device>();
-    const auto out_grad_mat = inputs[1].matrix<Device>();
+    const auto seq_vec = seqArg.getSequenceIds().vector<int, Device>();
+    const auto out_grad_mat = seqArg.matrix<Device>();
     auto in_grad_mat =
         !outputs[0].data()
             ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index c8d5b4f278..50ca204005 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -122,8 +122,7 @@ void testMatrixProjectionBackward(int context_start,
 
   BufferArgs cpu_inputs;
   BufferArgs cpu_outputs;
-  cpu_inputs.addArg(*cpu_seq);
-  cpu_inputs.addArg(cpu_out_grad);
+  cpu_inputs.addArg(cpu_out_grad, *cpu_seq);
   cpu_outputs.addArg(cpu_in_grad, ADD_TO);
   cpu_outputs.addArg(
       cpu_w_grad ? *cpu_w_grad : CpuMatrix(nullptr, 0, input_dim), ADD_TO);
@@ -132,8 +131,7 @@ void testMatrixProjectionBackward(int context_start,
 
   BufferArgs gpu_inputs;
   BufferArgs gpu_outputs;
-  gpu_inputs.addArg(*gpu_seq);
-  gpu_inputs.addArg(gpu_out_grad);
+  gpu_inputs.addArg(gpu_out_grad, *gpu_seq);
   gpu_outputs.addArg(gpu_in_grad, ADD_TO);
   gpu_outputs.addArg(
       gpu_w_grad ? *gpu_w_grad : GpuMatrix(nullptr, 0, input_dim), ADD_TO);
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index dbe3a4e9f6..3b65908465 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -90,6 +90,12 @@ void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
   args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
 }
 
+void BufferArgs::addArg(const Matrix& matrix,
+                        const IVector& vector,
+                        ArgType argType) {
+  args_.push_back(std::make_shared<SequenceArg>(matrix, vector, argType));
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 
 }  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 249f8f9cfa..c15045143b 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -77,6 +77,10 @@ public:
   void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
   void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
 
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
+
   // get argument
   const BufferArg& operator[](size_t num) const {
     CHECK_LT(num, args_.size());
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index def7c15ca8..17fd36ef56 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -166,8 +166,7 @@ void ContextProjection::backward(const UpdateCallback& callback) {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  inputs.addArg(*out_->grad);
+  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
   outputs.addArg(
       CpuMatrix(
           in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),

From 8560ce69ff8ed6d201e0c31d561aaa6aab7cb5b2 Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Wed, 11 Jan 2017 16:55:32 -0800
Subject: [PATCH 19/88] Daoyuan's comments about SequenceArg.

---
 paddle/function/ContextProjectionOp.cpp     | 163 ++++++++++++++++----
 paddle/function/ContextProjectionOpTest.cpp |  20 +--
 paddle/gserver/layers/ContextProjection.cpp |  15 +-
 3 files changed, 151 insertions(+), 47 deletions(-)

diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 177708d00f..ec697a381f 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -74,7 +74,7 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 
 /**
  * Paddle Function for Context Projection Forward.
- * Calculate the value for the output layer with context projection.
+ * Calculate the output sequence after context projection.
  *
  * What is Context Projection?
  * For example, assumed input (x) has 4 words and the dimension of each word
@@ -92,10 +92,12 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
  *       c1, c2, d1, d2, 0,  0]
  * @endcode
  *
- * \param outputs[0] output value.
- * \param inputs[0]  input value.
- * \param inputs[1]  input weight.
- * \param inputs[2]  input sequence.
+ * \param outputs[0].matrix   output value, n * (d * l)
+ * \param outputs[0].vector   input sequence, n * 1
+ * \param inputs[0].matrix    input value, n * d
+ * \param inputs[0].vector    input sequence, n * 1
+ * \param inputs[1].matrix    input weight, pad * d
+ * \param inputs[1].vector    input sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -107,28 +109,40 @@ public:
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
+    CHECK(1 == inputs.size() || 2 == inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
 
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto w_seqs = inputs.size() <= 1
+                            ? nullptr
+                            : dynamic_cast<const SequenceArg*>(&inputs[1]);
+    auto out_seqs = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(out_seqs.data() && val_seqs.data() &&
+          val_seqs.getSequenceIds().data());
+    CHECK_EQ(out_seqs.shape().ndims(), (size_t)2);
+    CHECK_EQ(val_seqs.shape().ndims(), (size_t)2);
+    CHECK_EQ(val_seqs.getSequenceIds().shape().ndims(), (size_t)1);
+    if (w_seqs) {
+      CHECK_EQ(w_seqs->shape().ndims(), (size_t)2);
+      CHECK_EQ(w_seqs->getSequenceIds().shape().ndims(), (size_t)1);
+    }
     /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(out_seqs.shape()[1], val_seqs.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(val_seqs.shape()[0], out_seqs.shape()[0]);
+    /// dim of input == dim of weight
+    if (w_seqs) {
+      CHECK_EQ(val_seqs.shape()[1], w_seqs->shape()[1]);
+    }
 
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    auto out_mat = outputs[0].matrix<Device>();
-    const auto in_mat = inputs[0].matrix<Device>();
+    CHECK_EQ(out_seqs.getArgType(), ADD_TO);
+    auto out_mat = out_seqs.matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
     const auto w_mat =
-        !inputs[1].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[1].matrix<Device>();
-    const auto seq_vec = inputs[2].vector<int, Device>();
+        w_seqs ? w_seqs->matrix<Device>()
+               : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceIds().vector<int, Device>();
     ContextProjectionForward<Device>(out_mat,
                                      in_mat,
                                      w_mat,
@@ -227,25 +241,25 @@ public:
     CHECK_EQ((size_t)1, inputs.size());
     CHECK_EQ((size_t)2, outputs.size());
 
-    const auto seqArg = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(seqArg.data() && inputs[0].data());
-    CHECK_EQ(seqArg.shape().ndims(), (size_t)2);
-    CHECK_EQ(seqArg.getSequenceIds().shape().ndims(), (size_t)1);
+    const auto seq_arg = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(seq_arg.data() && inputs[0].data());
+    CHECK_EQ(seq_arg.shape().ndims(), (size_t)2);
+    CHECK_EQ(seq_arg.getSequenceIds().shape().ndims(), (size_t)1);
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
     CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
 
     /// dim of input grad == dim of weight
     CHECK_EQ(outputs[0].shape()[1], outputs[1].shape()[1]);
     /// input and output grad has the same batch_size
-    CHECK_EQ(outputs[0].shape()[0], seqArg.shape()[0]);
+    CHECK_EQ(outputs[0].shape()[0], seq_arg.shape()[0]);
     /// dim of output val = dim of input grad * context_length
-    CHECK_EQ(seqArg.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(seq_arg.shape()[1], outputs[0].shape()[1] * context_length_);
 
     CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     CHECK_EQ(outputs[1].getArgType(), ADD_TO);
 
-    const auto seq_vec = seqArg.getSequenceIds().vector<int, Device>();
-    const auto out_grad_mat = seqArg.matrix<Device>();
+    const auto seq_vec = seq_arg.getSequenceIds().vector<int, Device>();
+    const auto out_grad_mat = seq_arg.matrix<Device>();
     auto in_grad_mat =
         !outputs[0].data()
             ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
@@ -272,6 +286,91 @@ private:
   size_t total_pad_;
 };
 
+/**
+ * \param inputs[0].matrix      input grad, n*d
+ * \param inputs[0].vector      input sequence, n*1
+ * \param outputs[0]            output grad, n*(d*l)
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    const auto in_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seqs.data() && outputs[0].data() &&
+          in_seqs.getSequenceIds().data());
+    CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seqs.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seqs.getSequenceIds().shape().ndims()), 1);
+    CHECK_EQ(outputs[0].shape().ndims(),
+             in_seqs.shape().ndims() * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(in_seqs.shape()[0], outputs[0].shape()[0]);
+    const auto out_grad_mat = outputs[0].matrix<Device>();
+    auto in_grad_mat = in_seqs.matrix<Device>();
+    const auto seq_vec = in_seqs.getSequenceIds().vector<int, Device>();
+
+    ContextProjectionBackwardData<Device>(
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+};
+
+/**
+ * \param inputs[0].matrix    weight grad, pad * d
+ * \param inputs[0].vecotr    input sequence, n * 1
+ * \param outputs[0]          output grad, n * (d * l)
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+
+    const auto in_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seqs.data() && in_seqs.getSequenceIds().data() &&
+          outputs[0].data());
+    CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seqs.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seqs.getSequenceIds().shape().ndims()), 1);
+    CHECK_EQ(in_seqs.shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(outputs[0].shape()[1], in_seqs.shape()[1] * context_length_);
+    const auto out_grad_mat = outputs[0].matrix<Device>();
+    auto w_grad_mat = inputs[0].matrix<Device>();
+    const auto seq_vec = in_seqs.getSequenceIds().vector<int, Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
+                                            seq_vec,
+                                            context_length_,
+                                            context_start_,
+                                            total_pad_,
+                                            begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     CPU,
                     ContextProjectionForwardFunc);
@@ -285,5 +384,11 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     GPU,
                     ContextProjectionBackwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
 #endif
 }  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 50ca204005..bd0c06c5f6 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -58,21 +58,21 @@ void testMatrixProjectionForward(int context_start,
 
   BufferArgs cpu_inputs;
   BufferArgs cpu_outputs;
-  cpu_inputs.addArg(cpu_in);
-  cpu_inputs.addArg(cpu_weight ? *cpu_weight
-                               : CpuMatrix(nullptr, 0, input_dim));
-  cpu_inputs.addArg(*cpu_seq);
-  cpu_outputs.addArg(cpu_out, ADD_TO);
+  cpu_inputs.addArg(cpu_in, *cpu_seq);
+  if (cpu_weight) {
+    cpu_inputs.addArg(*cpu_weight, *cpu_seq);
+  }
+  cpu_outputs.addArg(cpu_out, *cpu_seq, ADD_TO);
 
   compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
 
   BufferArgs gpu_inputs;
   BufferArgs gpu_outputs;
-  gpu_inputs.addArg(gpu_in);
-  gpu_inputs.addArg(gpu_weight ? *gpu_weight
-                               : GpuMatrix(nullptr, 0, input_dim));
-  gpu_inputs.addArg(*gpu_seq);
-  gpu_outputs.addArg(gpu_out, ADD_TO);
+  gpu_inputs.addArg(gpu_in, *gpu_seq);
+  if (gpu_weight) {
+    gpu_inputs.addArg(*gpu_weight, *gpu_seq);
+  }
+  gpu_outputs.addArg(gpu_out, *gpu_seq, ADD_TO);
 
   compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
 
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 17fd36ef56..edcef17ad4 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -118,16 +118,15 @@ void ContextProjection::forward() {
   /// first use state_, otherwise use weight_(padding false === w nullptr)
   auto w_ptr =
       state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  auto start_pos = in_->sequenceStartPositions;
-
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(*in_->value);
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                          w_ptr ? w_ptr->getHeight() : 0,
-                          input_dim));
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->value, ADD_TO);
+  inputs.addArg(*in_->value, *start_pos);
+  if (w_ptr) {
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
+                  *start_pos);
+  }
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
   forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {

From e9794214cbca438b1b467d614c6398ec09ab1d0b Mon Sep 17 00:00:00 2001
From: xutianbing <xutianbing@baidu.com>
Date: Thu, 12 Jan 2017 13:26:10 -0800
Subject: [PATCH 20/88] Address further comments.

---
 paddle/function/BufferArg.cpp               |  12 +-
 paddle/function/BufferArg.h                 |  30 +++-
 paddle/function/ContextProjectionOp.cpp     | 169 +++++++++++---------
 paddle/function/ContextProjectionOpTest.cpp |   4 +-
 paddle/gserver/layers/ContextProjection.cpp |   1 +
 5 files changed, 126 insertions(+), 90 deletions(-)

diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
index fde48a73b6..5d595deb12 100644
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -20,23 +20,27 @@ limitations under the License. */
 namespace paddle {
 
 const SequenceArg& BufferArg::sequence() const {
-  // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
   return dynamic_cast<const SequenceArg&>(*this);
 }
 
 const SparseMatrixArg& BufferArg::sparse() const {
-  // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);
   return dynamic_cast<const SparseMatrixArg&>(*this);
 }
 
 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+  bufferType_ = TENSOR_SPARSE;
+}
 
 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+  bufferType_ = TENSOR_SPARSE;
+}
 
 }  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index f3a4350e12..440a924a7a 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -23,10 +23,11 @@ limitations under the License. */
 namespace paddle {
 
 enum BufferType {
-  TENSOR_NORMAL = 0,
-  TENSOR_SEQUENCE_ID = 1,
-  TENSOR_SEQUENCE_DATA = 2,
-  TENSOR_SPARSE = 3
+  TENSOR_UNKNOWN = 0,
+  TENSOR_NORMAL = 1,
+  TENSOR_SEQUENCE_ID = 2,
+  TENSOR_SEQUENCE_DATA = 3,
+  TENSOR_SPARSE = 4
 };
 
 enum SparseDataType {
@@ -86,6 +87,7 @@ public:
         valueType_(DataType<real>::value),
         shape_(2),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
   }
@@ -98,6 +100,7 @@ public:
         valueType_(DataType<real>::value),
         shape_(shape),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
@@ -107,6 +110,7 @@ public:
         valueType_(DataType<real>::value),
         shape_(1),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, vector.getSize());
   }
 
@@ -116,6 +120,7 @@ public:
         valueType_(VALUE_TYPE_INT32),
         shape_(1),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, vector.getSize());
   }
 
@@ -150,6 +155,8 @@ public:
   ValueType valueType() const { return valueType_; }
   BufferType bufferType() const { return bufferType_; }
   const TensorShape& shape() const { return shape_; }
+  bool isSparse() const { return (TENSOR_SPARSE == bufferType_); }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
 
   const SequenceArg& sequence() const;
   const SparseMatrixArg& sparse() const;
@@ -158,8 +165,8 @@ protected:
   void* buf_;
   ValueType valueType_;
   TensorShape shape_;
-  BufferType bufferType_;
-  ArgType argType_ = UNSPECIFIED;
+  BufferType bufferType_{TENSOR_UNKNOWN};
+  ArgType argType_{UNSPECIFIED};
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
@@ -174,11 +181,13 @@ public:
                 const TensorShape& shape,
                 ArgType argType = UNSPECIFIED)
       : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
     CHECK_EQ(shape_.ndims(), (size_t)1);
     numSeqs_ = shape_[0] - 1;
   }
 
   SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
     numSeqs_ = shape_[0] - 1;
   }
 
@@ -199,12 +208,16 @@ public:
               const SequenceIdArg& startPositions,
               ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {}
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
 
   SequenceArg(const Matrix& matrix,
               const IVector& vector,
               ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {}
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
 
   ~SequenceArg() {}
 
@@ -236,6 +249,7 @@ public:
         nnz_(nnz),
         format_(format),
         type_(type) {
+    bufferType_ = TENSOR_SPARSE;
     CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
     CHECK_EQ(shape_.ndims(), (size_t)2);
     CHECK_EQ(row_.shape().ndims(), (size_t)1);
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index ec697a381f..2ef53cd6d9 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -74,9 +74,9 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 
 /**
  * Paddle Function for Context Projection Forward.
- * Calculate the output sequence after context projection.
+ * Calculate the output layer value sequence after context projection.
  *
- * What is Context Projection?
+ * What is Context Projection for a sequence?
  * For example, assumed input (x) has 4 words and the dimension of each word
  * representation is 2. If we use zero to pad instead of learned weight to pad,
  * and the context_lenth is 3, the output (y) is:
@@ -92,12 +92,11 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
  *       c1, c2, d1, d2, 0,  0]
  * @endcode
  *
- * \param outputs[0].matrix   output value, n * (d * l)
- * \param outputs[0].vector   input sequence, n * 1
- * \param inputs[0].matrix    input value, n * d
- * \param inputs[0].vector    input sequence, n * 1
- * \param inputs[1].matrix    input weight, pad * d
- * \param inputs[1].vector    input sequence, n * 1
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -111,37 +110,35 @@ public:
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK(1 == inputs.size() || 2 == inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
-
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
     const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto w_seqs = inputs.size() <= 1
-                            ? nullptr
-                            : dynamic_cast<const SequenceArg*>(&inputs[1]);
-    auto out_seqs = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
 
-    CHECK(out_seqs.data() && val_seqs.data() &&
+    CHECK(out_seq.data() && val_seqs.data() &&
           val_seqs.getSequenceIds().data());
-    CHECK_EQ(out_seqs.shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
     CHECK_EQ(val_seqs.shape().ndims(), (size_t)2);
     CHECK_EQ(val_seqs.getSequenceIds().shape().ndims(), (size_t)1);
-    if (w_seqs) {
-      CHECK_EQ(w_seqs->shape().ndims(), (size_t)2);
-      CHECK_EQ(w_seqs->getSequenceIds().shape().ndims(), (size_t)1);
+    if (2 == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
     }
     /// dim of output = dim of input * context_length
-    CHECK_EQ(out_seqs.shape()[1], val_seqs.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(val_seqs.shape()[0], out_seqs.shape()[0]);
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
     /// dim of input == dim of weight
-    if (w_seqs) {
-      CHECK_EQ(val_seqs.shape()[1], w_seqs->shape()[1]);
+    if (2 == inputs.size()) {
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
     }
 
-    CHECK_EQ(out_seqs.getArgType(), ADD_TO);
-    auto out_mat = out_seqs.matrix<Device>();
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    auto out_mat = out_seq.matrix<Device>();
     const auto in_mat = val_seqs.matrix<Device>();
     const auto w_mat =
-        w_seqs ? w_seqs->matrix<Device>()
-               : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+        (2 == inputs.size())
+            ? inputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
     const auto seq_vec = val_seqs.getSequenceIds().vector<int, Device>();
     ContextProjectionForward<Device>(out_mat,
                                      in_mat,
@@ -221,10 +218,11 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
  * Context Projection Backward Function.
  * Update the weight gradient and input layer gradient with backprop
  *
- * \param inputs[0].seq          input sequence.
- * \param inputs[0].matrix       output layer grad.
- * \param outputs[0]             input layer grad.
- * \param outputs[1]             weight grad.
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -240,30 +238,31 @@ public:
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ((size_t)1, inputs.size());
     CHECK_EQ((size_t)2, outputs.size());
-
-    const auto seq_arg = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(seq_arg.data() && inputs[0].data());
-    CHECK_EQ(seq_arg.shape().ndims(), (size_t)2);
-    CHECK_EQ(seq_arg.getSequenceIds().shape().ndims(), (size_t)1);
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceIds().data());
+    CHECK_EQ(in_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(in_seq.getSequenceIds().shape().ndims(), (size_t)1);
+    CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.getSequenceIds().shape().ndims(), (size_t)1);
     CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
 
     /// dim of input grad == dim of weight
-    CHECK_EQ(outputs[0].shape()[1], outputs[1].shape()[1]);
+    CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
     /// input and output grad has the same batch_size
-    CHECK_EQ(outputs[0].shape()[0], seq_arg.shape()[0]);
-    /// dim of output val = dim of input grad * context_length
-    CHECK_EQ(seq_arg.shape()[1], outputs[0].shape()[1] * context_length_);
-
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
     CHECK_EQ(outputs[1].getArgType(), ADD_TO);
 
-    const auto seq_vec = seq_arg.getSequenceIds().vector<int, Device>();
-    const auto out_grad_mat = seq_arg.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceIds().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
     auto in_grad_mat =
-        !outputs[0].data()
-            ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-            : outputs[0].matrix<Device>();
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                        : out_seq.matrix<Device>();
     auto w_grad_mat = !outputs[1].data()
                           ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
                           : outputs[1].matrix<Device>();
@@ -287,9 +286,15 @@ private:
 };
 
 /**
- * \param inputs[0].matrix      input grad, n*d
- * \param inputs[0].vector      input sequence, n*1
- * \param outputs[0]            output grad, n*(d*l)
+ * Context Projection Backward Data Function
+ * Update input layer grad
+ * input:  sequence of output layer grad
+ * output: sequence of input layer grad
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start positon sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -302,19 +307,24 @@ public:
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1, static_cast<int>(inputs.size()));
     CHECK_EQ(1, static_cast<int>(outputs.size()));
-    const auto in_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(in_seqs.data() && outputs[0].data() &&
-          in_seqs.getSequenceIds().data());
-    CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(in_seqs.shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(in_seqs.getSequenceIds().shape().ndims()), 1);
-    CHECK_EQ(outputs[0].shape().ndims(),
-             in_seqs.shape().ndims() * context_length_);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceIds().data());
+    CHECK_EQ(static_cast<int>(out_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceIds().shape().ndims()), 1);
+    /// output layer grad dim == input layer grad dim * context_length_
+    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(in_seqs.shape()[0], outputs[0].shape()[0]);
-    const auto out_grad_mat = outputs[0].matrix<Device>();
-    auto in_grad_mat = in_seqs.matrix<Device>();
-    const auto seq_vec = in_seqs.getSequenceIds().vector<int, Device>();
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceIds().vector<int, Device>();
+    auto in_grad_mat = out_seq.matrix<Device>();
 
     ContextProjectionBackwardData<Device>(
         out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
@@ -326,9 +336,14 @@ private:
 };
 
 /**
- * \param inputs[0].matrix    weight grad, pad * d
- * \param inputs[0].vecotr    input sequence, n * 1
- * \param outputs[0]          output grad, n * (d * l)
+ * Context Projection Backward Weight Function
+ * Update weight grad by backprop
+ * input:  sequence of output layer grad
+ * output: weight grad
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vecotr             start positon sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -343,18 +358,20 @@ public:
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1, static_cast<int>(inputs.size()));
     CHECK_EQ(1, static_cast<int>(outputs.size()));
-
-    const auto in_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(in_seqs.data() && in_seqs.getSequenceIds().data() &&
-          outputs[0].data());
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceIds().data() && outputs[0].data());
     CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(in_seqs.shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(in_seqs.getSequenceIds().shape().ndims()), 1);
-    CHECK_EQ(in_seqs.shape()[0], outputs[0].shape()[0]);
-    CHECK_EQ(outputs[0].shape()[1], in_seqs.shape()[1] * context_length_);
-    const auto out_grad_mat = outputs[0].matrix<Device>();
-    auto w_grad_mat = inputs[0].matrix<Device>();
-    const auto seq_vec = in_seqs.getSequenceIds().vector<int, Device>();
+    CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceIds().shape().ndims()), 1);
+    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
+    /// output layer grad dim == weight dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    const auto seq_vec = in_seq.getSequenceIds().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
     ContextProjectionBackwardWeight<Device>(out_grad_mat,
                                             w_grad_mat,
                                             seq_vec,
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index bd0c06c5f6..c9db2ff800 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -123,7 +123,7 @@ void testMatrixProjectionBackward(int context_start,
   BufferArgs cpu_inputs;
   BufferArgs cpu_outputs;
   cpu_inputs.addArg(cpu_out_grad, *cpu_seq);
-  cpu_outputs.addArg(cpu_in_grad, ADD_TO);
+  cpu_outputs.addArg(cpu_in_grad, *cpu_seq, ADD_TO);
   cpu_outputs.addArg(
       cpu_w_grad ? *cpu_w_grad : CpuMatrix(nullptr, 0, input_dim), ADD_TO);
 
@@ -132,7 +132,7 @@ void testMatrixProjectionBackward(int context_start,
   BufferArgs gpu_inputs;
   BufferArgs gpu_outputs;
   gpu_inputs.addArg(gpu_out_grad, *gpu_seq);
-  gpu_outputs.addArg(gpu_in_grad, ADD_TO);
+  gpu_outputs.addArg(gpu_in_grad, *gpu_seq, ADD_TO);
   gpu_outputs.addArg(
       gpu_w_grad ? *gpu_w_grad : GpuMatrix(nullptr, 0, input_dim), ADD_TO);
 
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index edcef17ad4..d7042af1c2 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -169,6 +169,7 @@ void ContextProjection::backward(const UpdateCallback& callback) {
   outputs.addArg(
       CpuMatrix(
           in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      *in_->sequenceStartPositions->getVector(useGpu_),
       ADD_TO);
   outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
                            w_ptr ? w_ptr->getHeight() : 0,

From 143ff015c66fb70db0c89de77bbd8cb60bb082ce Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Thu, 5 Jan 2017 14:14:51 -0800
Subject: [PATCH 21/88] add permission AWSKeyManagementServicePowerUser for aws
 kms key creation

Without the permission command `aws kms --region=us-west-1 create-key --description="kube-aws assets"` will fail with:
An error occurred (AccessDeniedException) when calling the CreateKey operation:
---
 doc/howto/usage/k8s/k8s_aws_en.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index b04bfba590..a82243995a 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -31,6 +31,7 @@ the user group:
 - IAMUserSSHKeys
 - IAMFullAccess
 - NetworkAdministrator
+- AWSKeyManagementServicePowerUser
 
 
 By the time we write this tutorial, we noticed that Chinese AWS users

From 2778a65b0ff9aa825d1ccaf6907c7364427058f1 Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Thu, 12 Jan 2017 17:19:05 -0800
Subject: [PATCH 22/88] first pass change for k8s aws en tutorial

---
 doc/howto/usage/k8s/k8s_aws_en.md | 108 +++++++++++++++---------------
 1 file changed, 55 insertions(+), 53 deletions(-)

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index a82243995a..c776ba9eb9 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -2,15 +2,9 @@
 
 ## Create AWS Account and IAM Account
 
-To use AWS, we need to sign up an AWS account on Amazon's Web site.
-An AWS account allows us to login to the AWS Console Web interface to
-create IAM users and user groups. Usually, we create a user group with
-privileges required to run PaddlePaddle, and we create users for
-those who are going to run PaddlePaddle and add these users into the
-group. IAM users can identify themselves using password and tokens,
-where passwords allows users to log in to the AWS Console, and tokens
-make it easy for users to submit and inspect jobs from the command
-line.
+An AWS account allows us to manage AWS from the Web Console. Amazon IAM enables us to manage AWS from the command line interface.
+
+We need to create an IAM user with sufficient privileges to create a Kubernetes cluster on AWS.
 
 To sign up an AWS account, please
 follow
@@ -19,8 +13,7 @@ To create users and user groups under an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
 
-Please be aware that this tutorial needs the following privileges in
-the user group:
+Please be aware that this tutorial needs the following privileges for the IAM user:
 
 - AmazonEC2FullAccess
 - AmazonS3FullAccess
@@ -47,9 +40,11 @@ it.
 Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
 
 
-###Download kube-aws and kubectl
+### Download kube-aws and kubectl
+
+#### kube-aws
 
-####kube-aws
+[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
 
 Import the CoreOS Application Signing Public Key:
 
@@ -89,24 +84,22 @@ mv ${PLATFORM}/kube-aws /usr/local/bin
 ```
 
 
-####kubectl
+#### kubectl
+
+[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
 
 Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
 
 Extract the tarball and then concate the kubernetes binaries directory into PATH:
 
 ```
-export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
-
+export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH # The exact path depends on your platform
 ```
 
-User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
-
-
-###Configure AWS Credentials
 
-First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+### Configure AWS Credentials
 
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
 
 And then configure your AWS account information:
 
@@ -127,33 +120,35 @@ Default output format: json
 
 ```
 
-Test that your credentials work by describing any instances you may already have running on your account:
+Verify that your credentials work by describing any instances you may already have running on your account:
 
 ```
 aws ec2 describe-instances
 ```
 
-###Define Cluster Parameters
+### Define Cluster Parameters
 
-####EC2 key pair
+#### EC2 key pair
 
 The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
 
-After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+Follow the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair.
 
-####KMS key
+After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region.
+
+#### KMS key
 
 Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
 
 You can create a KMS key in the AWS console, or with the aws command line tool:
 
 ```
-$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+$ aws kms --region=us-west-1 create-key --description="kube-aws assets"
 {
     "KeyMetadata": {
         "CreationDate": 1458235139.724,
         "KeyState": "Enabled",
-        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "Arn": "arn:aws:kms:us-west-1:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
         "AWSAccountId": "xxxxxxxxxxxxx",
         "Enabled": true,
         "KeyUsage": "ENCRYPT_DECRYPT",
@@ -167,7 +162,9 @@ You will use the `KeyMetadata.Arn` string to identify your KMS key in the init s
 
 And then you need to add several inline policies in your user permission.
 
-kms inline policy:
+Go to the IAM user page, click the `Add inline policy` button, and then select `Custom Policy`.
+
+Paste in the following inline policies:
 
 ```
 {
@@ -183,16 +180,8 @@ kms inline policy:
             "Resource": [
                 "arn:aws:kms:*:xxxxxxxxx:key/*"
             ]
-        }
-    ]
-}
-```
-cloudformation inline policy:
-
-```
-"Version": "2012-10-17",
-    "Statement": [
-        {
+        },
+		{
             "Sid": "Stmt1482205746000",
             "Effect": "Allow",
             "Action": [
@@ -201,10 +190,11 @@ cloudformation inline policy:
                 "cloudformation:DeleteStack",
                 "cloudformation:DescribeStacks",
                 "cloudformation:DescribeStackResource",
-                "cloudformation:GetTemplate"
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
             ],
             "Resource": [
-                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+                "arn:aws:cloudformation:us-west-1:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
             ]
         }
     ]
@@ -212,15 +202,23 @@ cloudformation inline policy:
 ```
 
 
-####External DNS name
+#### External DNS name
 
 When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
 
-####S3 bucket
+#### S3 bucket
 
 You need to create an S3 bucket before startup the Kubernetes cluster.
 
-####Initialize an asset directory
+Command (the bucket name must be globally unique):
+
+```
+paddle aws s3api --region=us-west-1 create-bucket --bucket bucket-name
+```
+
+If you get an error message, try a different bucket name. The bucket name needs to be globally unique.
+
+#### Initialize an asset directory
 
 Create a directory on your local machine to hold the generated assets:
 
@@ -238,12 +236,16 @@ $ kube-aws init \
 --region=us-west-1 \
 --availability-zone=us-west-1c \
 --key-name=key-pair-name \
---kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+--kms-key-arn="arn:aws:kms:us-west-1:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
 ```
 
+Here `us-west-1c` is used for the `--availability-zone` parameter, but the supported availability zones vary among AWS accounts.
+
+Please check whether `us-west-1c` is supported by running `aws ec2 --region us-west-1 describe-availability-zones`; if it is not, switch to another supported availability zone (e.g., `us-west-1a` or `us-west-1b`).
+
 There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
 
-####Render contents of the asset directory
+#### Render contents of the asset directory
 
 In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
 
@@ -286,21 +288,21 @@ $ tree
 These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
 
 
-###Kubernetes Cluster Start Up
+### Kubernetes Cluster Start Up
 
-####Create the instances defined in the CloudFormation template
+#### Create the instances defined in the CloudFormation template
 
-Now for the exciting part, creating your cluster:
+Now for the exciting part, creating your cluster (choose any `<prefix>`):
 
 ```
 $ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
 ```
 
-####Configure DNS
+#### Configure DNS
 
-You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. Then use the `dig` command on the load balancer hostname to get its IP address, and use this IP to set up an A record for your external DNS name.
 
-####Access the cluster
+#### Access the cluster
 
 Once the API server is running, you should see:
 
@@ -313,7 +315,7 @@ ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
 ```
 
 
-###Setup PaddlePaddle Environment on AWS
+### Setup PaddlePaddle Environment on AWS
 
 Now, we've created a cluster with following network capability:
 

From 2a20fdc14bce87ce3d092ab9bff8349be5194b05 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 13 Jan 2017 14:35:56 +0800
Subject: [PATCH 23/88] Change BufferArgPtr to BufferArg*

---
 paddle/function/Function.cpp |  9 ++++++---
 paddle/function/Function.h   | 33 +++++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index dbe3a4e9f6..3fdc37b968 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -79,15 +79,18 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
 void BufferArgs::addArg(const Matrix& arg,
                         const TensorShape& shape,
                         ArgType argType) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
+  _args_.push_back(new BufferArg(arg, shape, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 249f8f9cfa..afbd4911b0 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -50,10 +50,25 @@ protected:
  * Argument type for Function::calc().
  * A BufferArgs contains a set of BufferArg,
  * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with a Matrix object is used to adapt a Layer Argument.
+ * It creates a BufferArg object in addArg(),
+ * and frees it in the destructor of BufferArgs.
+ *
+ * addArg() with a BufferArg object just saves the BufferArg object's address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * during the BufferArgs lifetime.
  */
 class BufferArgs {
 public:
   BufferArgs() {}
+
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
+
   size_t size() const { return args_.size(); }
 
   // add argument into BufferArgs
@@ -62,7 +77,8 @@ public:
   // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
   template <typename Tensor>
   void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
-    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
   }
 
   // Add arg into BufferArgs and reshape the arg.
@@ -83,14 +99,27 @@ public:
     return *args_[num];
   }
 
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
+
 private:
-  std::vector<BufferArgPtr> args_;
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
 };
 
 /**
  * \brief Base class for Function.
  * The basic Function implementation requires override init and calc interfaces.
  *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
  * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
  * and ADD_TO.
  * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation

From 039c0bf22384607446fcd38a4ba6c349ce073213 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 13 Jan 2017 15:05:22 +0800
Subject: [PATCH 24/88] Add some constructors for generating object that only
 contains shape (do not contains data).

---
 paddle/function/BufferArg.h      | 33 ++++++++++++++++++++++++++++++--
 paddle/function/FunctionTest.cpp | 18 +++++++++++++++++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 12352ba29e..28542a8657 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -39,7 +39,6 @@ enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
 /**
  * \brief BufferArg used as the argument type of Function.
@@ -50,6 +49,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  * 3. SequenceArg for a Buffer of sequence data.
  * 4. SparseMatrixArg for a Buffer of sparse matrix.
  *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
  * There is an ArgType property for the BufferArg used as Function Output.
  * Whether the result of the Function calculation is assigned to the
  * output Buffer or added to the output Buffer is determined by the
@@ -71,6 +75,14 @@ public:
   ArgType getArgType() const { return argType_; }
 
 public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr),
+        valueType_(valueType),
+        shape_(shape),
+        argType_(argType) {}
+
   BufferArg(void* buf,
             ValueType valueType,
             const TensorShape& shape,
@@ -170,6 +182,12 @@ protected:
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    CHECK_EQ(shape_.ndims(), (size_t)1);
+    numSeqs_ = shape_[0] - 1;
+  }
+
   SequenceIdArg(void* buf,
                 const TensorShape& shape,
                 ArgType argType = UNSPECIFIED)
@@ -190,9 +208,18 @@ private:
   size_t numSeqs_;
 };
 
-// sequence data
+// Sequence data.
+// In a mini-batch calculation,
+// one batch can contain more than one sequence of data.
+// SequenceArg can be used to represent multiple sequences
+// of unequal lengths.
 class SequenceArg : public BufferArg {
 public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType), startPositions_(TensorShape()) {}
+
   SequenceArg(void* buf,
               ValueType valueType,
               const TensorShape& shape,
@@ -210,6 +237,8 @@ public:
 
   void* getIdBuf() const { return startPositions_.data(); }
   size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
 
 private:
   SequenceIdArg startPositions_;
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index eb05ca9a21..03c609b524 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -84,6 +84,10 @@ void testBufferArgs(const BufferArgs& inputs,
   }
 }
 
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  check(inputs[0]);
+}
+
 TEST(Arguments, Matrix) {
   MatrixPtr matrix = Matrix::create(100, 200);
   CheckBufferArg check = [=](const BufferArg& arg) {
@@ -144,4 +148,18 @@ TEST(Arguments, CpuSparseMatrix) {
   testBufferArgs(argments, checkFunc);
 }
 
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[2], 3);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
+
 }  // namespace paddle

From 678174339fcc1aeb953684f795e1bf8cf1d631a5 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 13 Jan 2017 17:46:05 +0800
Subject: [PATCH 25/88] Implement the FunctionTest

---
 paddle/function/Function.h     |  13 +-
 paddle/function/FunctionTest.h | 225 +++++++++++++++------------------
 2 files changed, 112 insertions(+), 126 deletions(-)

diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index afbd4911b0..b0c6ba0fac 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -75,8 +75,17 @@ public:
   // Tensor can be Matrix, Vector, IVector.
   // For inputs, do not need argType.
   // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  template <typename Tensor>
-  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
     _args_.push_back(new BufferArg(arg, argType));
     addArg(*_args_.back());
   }
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 2847188fd6..412e3a7d1b 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -19,6 +19,8 @@ limitations under the License. */
 
 namespace paddle {
 
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
 /**
  * \brief A class for comparing CPU and GPU implementations of Function.
  *
@@ -45,143 +47,121 @@ namespace paddle {
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
-    gpu->init(config);
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpuFunc_->init(config);
+    gpuFunc_->init(config);
+  }
+
+  ~FunctionCompare() {}
+
+  // The input only needs to carry a shape; no data is required.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // The output only needs to carry a shape; no data is required.
+  void addOutputs(const BufferArg& output) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
   }
 
-  void addInputs(const BufferArg& input) { inputs.push_back(input); }
+  void addInputs(const SequenceArg& input) {
+    size_t batchSize = input.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
 
-  void addOutputs(const BufferArg& output) { outputs.push_back(output); }
+    TensorShape seqsId({numSeqs + 1});
+    // void* cpuBuffer = cpuMemory_.back()->getBuf();
+    // void* gpuBuffer = gpuMemory_.back()->getBuf();
+
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    // TODO: need be implemented.
+  }
 
   void run() {
     // prepare cpu/gpu arguments
-    prepareArgs();
+    initInputs();
 
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs);
-    gpu->calc(gpuInputs, gpuOutputs);
-
-    // check outputs and inouts
-    auto checkArgs = [=](const BufferArgs& cpuArgs, const BufferArgs& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        CpuVector cpuVector(cpu.shape().getElements(), (real*)cpu.getData());
-        GpuVector gpuVector(cpu.shape().getElements(), (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
       }
-    };
-    checkArgs(cpuOutputs, gpuOutputs);
-  }
-#if 0
-  void cmpWithArg(const Arguments& inputs,
-                  const Arguments& outputs,
-                  const Arguments& inouts) {
-    // init cpu and gpu arguments
-    auto initArgs = [=](
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (const auto arg : inArgs) {
-        size_t size = sizeof(real);
-        for (const auto dim : arg.dims_) {
-          size *= dim;
-        }
-        if (arg.getData()) {
-          // todo(tianbing), waste unnecessary mem here
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          // already init outside
-        } else {
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(
-              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-          gpuArgs.emplace_back(
-              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-          // will use an api to refactor this code.
-          CpuVector cpuVector(size / sizeof(real),
-                              (real*)cpuArgs.back().getData());
-          GpuVector gpuVector(size / sizeof(real),
-                              (real*)gpuArgs.back().getData());
-          cpuVector.uniform(0.001, 1);
-          gpuVector.copyFrom(cpuVector);
-        }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
       }
+      function->calc(inArgs, outArgs);
     };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
 
-    // function calculate
-    cpu->calc(cpuInputs, cpuOutputs);
-    gpu->calc(gpuInputs, gpuOutputs);
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
 
     // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
-          size *= dim;
-        }
-        CpuVector cpuVector(size, (real*)cpu.getData());
-        GpuVector gpuVector(size, (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
-      }
-    };
-    checkArgs(cpuOutputs, gpuOutputs);
+    compareOutputs();
   }
-#endif
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
 
 protected:
-  void prepareArgs() {
-    // TODO, if inputs has data
-  }
+  void initInputs() {
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      initArg(*cpuInputs_[i]);
 
-  void createArg(BufferArgs& cpuArgs, BufferArgs& gpuArgs, BufferArg& arg) {
-    size_t size = arg.shape().getElements() * sizeOfValuType(arg.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
+                          (real*)cpuInputs_[i]->data());
+      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
+                          (real*)gpuInputs_[i]->data());
 
-    cpuArgs.emplace_back(
-        BufferArg(cpuMemory_.back()->getBuf()), arg.valueType(), arg.shape());
-    gpuArgs.emplace_back(
-        BufferArg(gpuMemory_.back()->getBuf()), arg.valueType(), arg.shape());
+      gpuVector.copyFrom(cpuVector);
+    }
   }
 
-  void createArg(BufferArgs& cpuArgs, BufferArgs& gpuArgs, SequenceArg& arg) {
-    size_t batchSize = arg.shape()[0];
-    size_t numSeqs = batchSize / 10 + 1;
+  void compareOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      auto cpu = cpuOutputs_[i];
+      auto gpu = gpuOutputs_[i];
+      CpuVector cpuVector(cpu->shape().getElements(), (real*)cpu->data());
+      GpuVector gpuVector(cpu->shape().getElements(), (real*)gpu->data());
 
-    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-
-    TensorShape seqsId({numSeqs + 1});
-    void* cpuBuffer = cpuMemory_.back()->getBuf();
-    void* gpuBuffer = gpuMemory_.back()->getBuf();
-
-    size_t size = arg.shape().getElements() * sizeOfValuType(arg.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-
-    cpuArgs.emplace_back(SequenceArg(cpuMemory_.back()->getBuf(),
-                                     arg.valueType(),
-                                     arg.shape(),
-                                     SequenceIdArg(cpuBuffer, seqsId)));
-    gpuArgs.emplace_back(SequenceArg(gpuMemory_.back()->getBuf(),
-                                     arg.valueType(),
-                                     arg.shape(),
-                                     SequenceIdArg(gpuBuffer, seqsId)));
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
   }
 
   // only init cpu argument, gpu argument copy from cpu argument.
@@ -192,10 +172,10 @@ protected:
 
   void initArg(SequenceIdArg& arg, size_t batchSize) {
     size_t numSeqs = arg.numSeqs();
-    int* buf = arg.data();
+    int* buf = (int*)arg.data();
     int pos = 0;
     size_t maxLen = 2 * batchSize / numSeqs;
-    for (int i = 0; i < numSeqs; ++i) {
+    for (int i = 0; i < (int)numSeqs; ++i) {
       int len = uniformRandom(
                     std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
                 1;
@@ -207,17 +187,14 @@ protected:
   }
 
 protected:
-  std::shared_ptr<FunctionBase> cpu;
-  std::shared_ptr<FunctionBase> gpu;
+  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> gpuFunc_;
   std::vector<CpuMemHandlePtr> cpuMemory_;
   std::vector<GpuMemHandlePtr> gpuMemory_;
-  // inputs and outputs
-  BufferArgs inputs;
-  BufferArgs outputs;
-  BufferArgs cpuInputs_;
-  BufferArgs cpuOutputs_;
-  BufferArgs gpuInputs_;
-  BufferArgs gpuOutputs_;
+  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> gpuOutputs_;
 };
 
 }  // namespace paddle

From b791dcddcc0d83c7397cbd075ccce08b60dfcb27 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 13 Jan 2017 17:56:44 +0800
Subject: [PATCH 26/88] Fix CrossMapNormal Test

---
 paddle/function/CMakeLists.txt           |  2 +-
 paddle/function/CrossMapNormalOpTest.cpp | 51 ++++++++++++++----------
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 75a2acc55e..566fe53b14 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -19,7 +19,7 @@ if(WITH_TESTING)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(CrossMapNormalOpTest)
     add_simple_unittest(TensorShapeTest)
     add_simple_unittest(TensorTypeTest)
     add_simple_unittest(BufferArgTest)
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index d65d9310af..da196a699c 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -27,15 +27,19 @@ TEST(CrossMapNormal, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormal",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
-                               {});
+            // init Test object
+            FunctionCompare test("CrossMapNormal",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            // prepare input arguments
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
@@ -43,6 +47,9 @@ TEST(CrossMapNormal, real) {
   }
 }
 
+#if 0
+// TODO(hedaoyuan): CrossMapNormalGrad does not support ASSIGN_TO mode yet.
+// Maybe every Function needs to support ASSIGN_TO mode.
 TEST(CrossMapNormalGrad, real) {
   for (size_t numSamples : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
@@ -53,23 +60,25 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormalGrad",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims)},
-                               {});
+            FunctionCompare test("CrossMapNormalGrad",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
     }
   }
 }
+#endif
 
 }  // namespace paddle

From d2e2042df33a0a5f55f0282cf7f10a71c98b8dd8 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sat, 14 Jan 2017 16:04:58 +0800
Subject: [PATCH 27/88] Make sign compare as a compile warning not error.

---
 cmake/flags.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0d1ef5cd84..b76852fc6c 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -96,6 +96,7 @@ set(COMMON_FLAGS
     -Wno-unused-parameter
     -Wno-unused-function
     -Wno-error=literal-suffix
+    -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs)
 
 set(GPU_COMMON_FLAGS
@@ -105,6 +106,7 @@ set(GPU_COMMON_FLAGS
     -Wdelete-non-virtual-dtor
     -Wno-unused-parameter
     -Wno-unused-function
+    -Wno-error=sign-compare
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
     -Wno-error=unused-function  # Warnings in Numpy Header.

From 02480316016bd04c10676d8cb859c473f07a819f Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 12 Jan 2017 17:18:39 +0800
Subject: [PATCH 28/88] Add Status

---
 paddle/utils/Status.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 paddle/utils/Status.h

diff --git a/paddle/utils/Status.h b/paddle/utils/Status.h
new file mode 100644
index 0000000000..398ae182ab
--- /dev/null
+++ b/paddle/utils/Status.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <memory>
+#include <string>
+
+namespace paddle {
+
+class Status final : public std::exception {
+public:
+  Status() noexcept {}
+
+  Status(const std::string& msg) : errMsg_(new std::string(msg)) {}
+
+  virtual const char* what() const noexcept override {
+    if (errMsg_) {
+      return errMsg_->data();
+    } else {
+      return nullptr;
+    }
+  }
+
+  inline bool isOK() const noexcept { return errMsg_ == nullptr; }
+
+private:
+  std::unique_ptr<std::string> errMsg_;
+};
+
+}  // namespace paddle

From 6c20e08b042e1351a4ea8c97a74129d211b1d636 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 12 Jan 2017 17:55:36 +0800
Subject: [PATCH 29/88] Try using status to handle Paddle Error

---
 .../activations/ActivationFunction.cpp        | 126 ++++++++++++++----
 .../gserver/activations/ActivationFunction.h  |   5 +-
 paddle/gserver/layers/Layer.cpp               |   7 +-
 paddle/utils/Status.h                         |  36 ++++-
 paddle/utils/tests/CMakeLists.txt             |   1 +
 paddle/utils/tests/test_Status.cpp            |  29 ++++
 6 files changed, 169 insertions(+), 35 deletions(-)
 create mode 100644 paddle/utils/tests/test_Status.cpp

diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f8c4bcac2f..8a938cf7e9 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -69,8 +69,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
 class IdentityActivation : public ActivationFunction {
 public:
   static const std::string name;
-  void forward(Argument& act) { (void)act; }
-  void backward(Argument& act) { (void)act; }
+  Status forward(Argument& act) {
+    (void)act;
+    return Status();
+  }
+  Status backward(Argument& act) {
+    (void)act;
+    return Status();
+  }
   const std::string& getName() const { return name; }
 };
 const std::string IdentityActivation::name = "";
@@ -86,8 +92,14 @@ static InitFunction __reg_activation__identity([] {
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(sigmoid)
-void forward(Argument& act) { act.value->sigmoid(*act.value); }
-void backward(Argument& act) { act.grad->sigmoidDerivative(*act.value); }
+Status forward(Argument& act) {
+  act.value->sigmoid(*act.value);
+  return Status();
+}
+Status backward(Argument& act) {
+  act.grad->sigmoidDerivative(*act.value);
+  return Status();
+}
 END_DEFINE_ACTIVATION(sigmoid)
 
 /**
@@ -103,9 +115,12 @@ MatrixPtr sftMaxDot_;
 MatrixPtr one_;
 
 public:
-void forward(Argument& act) { act.value->softmax(*act.value); }
+Status forward(Argument& act) {
+  act.value->softmax(*act.value);
+  return Status();
+}
 
-void backward(Argument& act) {
+Status backward(Argument& act) {
   MatrixPtr outputV = act.value;
   MatrixPtr outputG = act.grad;
 
@@ -137,6 +152,7 @@ void backward(Argument& act) {
 
     act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
   }
+  return Status();
 }
 END_DEFINE_ACTIVATION(softmax)
 
@@ -151,8 +167,11 @@ ACTIVATION_CLASS_NAME(softmax) softmax_;
 Argument argument_;
 
 public:
-void forward(Argument& act) {
-  CHECK_EQ(act.value->getWidth(), 1UL);
+Status forward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Status(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   if (!argument_.value) {
     argument_.value = Matrix::create(nullptr,
@@ -169,10 +188,14 @@ void forward(Argument& act) {
 
   auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
+  return Status();
 }
 
-void backward(Argument& act) {
-  CHECK_EQ(act.grad->getWidth(), 1UL);
+Status backward(Argument& act) {
+  if (act.grad->getWidth() != 1UL) {  // validate the incoming gradient
+    return Status(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   size_t numSequences = act.getNumSequences();
   const int* starts = act.sequenceStartPositions->getData(false);
@@ -186,6 +209,7 @@ void backward(Argument& act) {
 
     softmax_.backward(argument_);
   }
+  return Status();
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
@@ -200,9 +224,15 @@ END_DEFINE_ACTIVATION(sequence_softmax)
  *    0 otherwise.
  */
 BEGIN_DEFINE_ACTIVATION(relu)
-void forward(Argument& act) { act.value->relu(*act.value); }
+Status forward(Argument& act) {
+  act.value->relu(*act.value);
+  return Status();
+}
 
-void backward(Argument& act) { act.grad->reluDerivative(*act.value); }
+Status backward(Argument& act) {
+  act.grad->reluDerivative(*act.value);
+  return Status();
+}
 END_DEFINE_ACTIVATION(relu)
 
 /**
@@ -219,9 +249,15 @@ END_DEFINE_ACTIVATION(relu)
  * TODO(yuyang18): Remove magic number 24 or make it configuable.
  */
 BEGIN_DEFINE_ACTIVATION(brelu)
-void forward(Argument& act) { act.value->brelu(*act.value); }
+Status forward(Argument& act) {
+  act.value->brelu(*act.value);
+  return Status();
+}
 
-void backward(Argument& act) { act.grad->breluDerivative(*act.value); }
+Status backward(Argument& act) {
+  act.grad->breluDerivative(*act.value);
+  return Status();
+}
 END_DEFINE_ACTIVATION(brelu)
 
 /**
@@ -231,9 +267,15 @@ END_DEFINE_ACTIVATION(brelu)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(tanh)
-void forward(Argument& act) { act.value->tanh(*act.value); }
+Status forward(Argument& act) {
+  act.value->tanh(*act.value);
+  return Status();
+}
 
-void backward(Argument& act) { act.grad->tanhDerivative(*act.value); }
+Status backward(Argument& act) {
+  act.grad->tanhDerivative(*act.value);
+  return Status();
+}
 END_DEFINE_ACTIVATION(tanh)
 
 /**
@@ -248,10 +290,14 @@ real a, b;
 
 public:
 ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-void forward(Argument& act) { act.value->scaledTanh(*act.value, a, b); }
+Status forward(Argument& act) {
+  act.value->scaledTanh(*act.value, a, b);
+  return Status();
+}
 
-void backward(Argument& act) {
+Status backward(Argument& act) {
   act.grad->scaledTanhDerivative(*act.value, a, b);
+  return Status();
 }
 END_DEFINE_ACTIVATION(stanh)
 
@@ -262,9 +308,15 @@ END_DEFINE_ACTIVATION(stanh)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(softrelu)
-void forward(Argument& act) { act.value->softrelu(*act.value); }
+Status forward(Argument& act) {
+  act.value->softrelu(*act.value);
+  return Status();
+}
 
-void backward(Argument& act) { act.grad->softreluDerivative(*act.value); }
+Status backward(Argument& act) {
+  act.grad->softreluDerivative(*act.value);
+  return Status();
+}
 END_DEFINE_ACTIVATION(softrelu)
 
 /**
@@ -280,7 +332,7 @@ END_DEFINE_ACTIVATION(softrelu)
  *     0   if z=0
  */
 BEGIN_DEFINE_ACTIVATION(abs)
-void forward(Argument& act) {
+Status forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -290,9 +342,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->abs2(*act.value);
+  return Status();
 }
 
-void backward(Argument& act) { act.grad->absDerivative(*act.in); }
+Status backward(Argument& act) {
+  act.grad->absDerivative(*act.in);
+  return Status();
+}
 END_DEFINE_ACTIVATION(abs)
 
 /**
@@ -302,7 +358,7 @@ END_DEFINE_ACTIVATION(abs)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(square)
-void forward(Argument& act) {
+Status forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -312,9 +368,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->square2(*act.value);
+  return Status();
 }
 
-void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
+Status backward(Argument& act) {
+  act.grad->squareDerivative(*act.in);
+  return Status();
+}
 END_DEFINE_ACTIVATION(square)
 
 /**
@@ -324,9 +384,15 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-void forward(Argument& act) { act.value->exp2(*act.value); }
+Status forward(Argument& act) {
+  act.value->exp2(*act.value);
+  return Status();
+}
 
-void backward(Argument& act) { act.grad->expDerivative(*act.value); }
+Status backward(Argument& act) {
+  act.grad->expDerivative(*act.value);
+  return Status();
+}
 END_DEFINE_ACTIVATION(exponential)
 
 /**
@@ -336,7 +402,7 @@ END_DEFINE_ACTIVATION(exponential)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(log)
-void forward(Argument& act) {
+Status forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -346,9 +412,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->log2(*act.value);
+  return Status();
 }
 
-void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
+Status backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.in);
+  return Status();
+}
 END_DEFINE_ACTIVATION(log)
 
 ActivationFunction* ActivationFunction::create(const std::string& type) {
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 601e3b6c0c..ad395ac28d 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+#include "paddle/utils/Status.h"
 
 namespace paddle {
 
@@ -48,7 +49,7 @@ public:
    *
    * Usually, act is Layer::output_
    */
-  virtual void forward(Argument& act) = 0;
+  virtual Status forward(Argument& act) = 0;
 
   /**
    * @brief Backward propagaion
@@ -57,7 +58,7 @@ public:
    * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
    * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
    */
-  virtual void backward(Argument& act) = 0;
+  virtual Status backward(Argument& act) = 0;
 
   virtual const std::string& getName() const = 0;
 };
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index c47943f81c..06c936c3ae 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Status.h"
 
 #include "AddtoLayer.h"
 #include "CRFLayer.h"
@@ -334,7 +335,8 @@ void Layer::showOutputStats() {
 
 void Layer::forwardActivation() {
   /* activation */
-  activation_->forward(output_);
+  auto status = activation_->forward(output_);
+  CHECK(status.isOK()) << status.what();
 
   /* dropout */
   if (config_.drop_rate() > 0) {
@@ -372,7 +374,8 @@ void Layer::backwardActivation() {
     oGrad->dotMul(*oGrad, *dropOutMask_);
   }
 
-  activation_->backward(output_);
+  auto status = activation_->backward(output_);
+  CHECK(status.isOK()) << status.what();
 }
 
 void Layer::forwardDropOut() {
diff --git a/paddle/utils/Status.h b/paddle/utils/Status.h
index 398ae182ab..3456d7b686 100644
--- a/paddle/utils/Status.h
+++ b/paddle/utils/Status.h
@@ -11,18 +11,44 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
+#pragma once
+
 #include <memory>
 #include <string>
 
 namespace paddle {
 
+/**
+ * Status is Paddle's error code. It only contains a std::string as the error
+ * message. Although Status inherits from std::exception, do not throw it
+ * unless you know what you are doing.
+ */
 class Status final : public std::exception {
 public:
+  /**
+   * Default Status. OK
+   */
   Status() noexcept {}
 
-  Status(const std::string& msg) : errMsg_(new std::string(msg)) {}
+  /**
+   * @brief Create Status with error message
+   * @param msg
+   */
+  explicit Status(const std::string& msg) : errMsg_(new std::string(msg)) {}
+
+  /**
+   * @brief set an error message for status; may throw std::bad_alloc.
+   * @param msg the error message to store (copied).
+   */
+  inline void set(const std::string& msg) {
+    errMsg_.reset(new std::string(msg));
+  }
 
-  virtual const char* what() const noexcept override {
+  /**
+   * @brief what will return the error message. If status is OK, return nullptr.
+   */
+  const char* what() const noexcept override {
     if (errMsg_) {
       return errMsg_->data();
     } else {
@@ -30,10 +56,14 @@ public:
     }
   }
 
+  /**
+   * @brief isOK
+   * @return true if OK.
+   */
   inline bool isOK() const noexcept { return errMsg_ == nullptr; }
 
 private:
-  std::unique_ptr<std::string> errMsg_;
+  std::shared_ptr<std::string> errMsg_;
 };
 
 }  // namespace paddle
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 26fafbd1ab..a1cc32668d 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_CustomStackTrace)
 add_simple_unittest(test_ThreadBarrier)
 add_simple_unittest(test_SpinLock)
 add_simple_unittest(test_SIMDFlags)
+add_simple_unittest(test_Status)
 
 add_executable(
     test_CustomStackTracePrint
diff --git a/paddle/utils/tests/test_Status.cpp b/paddle/utils/tests/test_Status.cpp
new file mode 100644
index 0000000000..e2c2ae537d
--- /dev/null
+++ b/paddle/utils/tests/test_Status.cpp
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Status.h"
+
+#include <gtest/gtest.h>
+
+TEST(Status, testAll) {
+  paddle::Status status;  // default-constructed: carries no message
+  ASSERT_TRUE(status.isOK());
+  status.set("I'm the error");  // attaching a message makes it not-OK
+  ASSERT_FALSE(status.isOK());
+  ASSERT_STREQ("I'm the error", status.what());
+
+  paddle::Status status2("error2");  // constructed directly from a message
+  ASSERT_FALSE(status2.isOK());
+  ASSERT_STREQ("error2", status2.what());
+}

From df62df7e6726b502d34ed5805ba26f44b7cfdd92 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sat, 14 Jan 2017 16:30:24 +0800
Subject: [PATCH 30/88] Partially follow comments

---
 paddle/math/RowBuffer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index bb55ca5f9f..0edcefd756 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -47,7 +47,7 @@ public:
    */
   inline void resize(int rowCnt) {
     if (preallocatedBuf_) {
-      CHECK(preallocatedBuf_->getSize() < rowCnt * width_ * sizeof(real));
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
     } else {
       rowStore_.resize(rowCnt * width_);
     }
@@ -107,7 +107,7 @@ public:
    */
   inline size_t getRowCount() const {
     if (preallocatedBuf_) {
-      return preallocatedBuf_->getSize() / sizeof(float) / width_;
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
     } else {
       return rowStore_.size() / width_;
     }
@@ -117,7 +117,7 @@ public:
    * @brief get is this buffer can automatically grow or not.
    * @return ture if can automacitally grow.
    */
-  inline bool isAutoGrowth() const { return preallocatedBuf_ == nullptr; }
+  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
 
   /**
    * @brief return the width of matrix. a.k.a length of row.

From cdf6af64a87c46da070da139925c32cc4064e6d3 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Sat, 14 Jan 2017 21:42:04 +0800
Subject: [PATCH 31/88] Add external openblas

---
 .travis.yml                                   |  2 +-
 cmake/external/openblas.cmake                 | 32 +++++++++++++++----
 cmake/system.cmake                            |  4 +--
 .../build_and_install/build_from_source_en.md |  5 +--
 paddle/scripts/travis/before_install.osx.sh   |  4 +--
 paddle/scripts/travis/build_and_test.sh       |  2 +-
 paddle/scripts/travis/docs.sh                 |  2 +-
 7 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 0705baa1ac..162bebba09 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,9 +25,9 @@ addons:
     packages:
       - gcc-4.8
       - g++-4.8
+      - gfortran-4.8
       - git
       - build-essential
-      - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 0e8c29c831..43ebb39cd6 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -15,7 +15,6 @@
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})
-    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
     INCLUDE(ExternalProject)
 
     SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@@ -28,20 +27,39 @@ IF(NOT ${CBLAS_FOUND})
         SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
     ENDIF(WIN32)
 
+    IF(CMAKE_COMPILER_IS_GNUCC)
+        ENABLE_LANGUAGE(Fortran)
+        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+    ENDIF(CMAKE_COMPILER_IS_GNUCC)
+
+    IF(NOT CMAKE_Fortran_COMPILER)
+        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
+                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
+    ENDIF(NOT CMAKE_Fortran_COMPILER)
+
     ExternalProject_Add(
         openblas
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG             v0.2.19
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        CONFIGURE_COMMAND   ""
-        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
-        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+    )
+
+    ExternalProject_Add_Step(
+        openblas lapacke_install
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
+        DEPENDEES install
     )
 
     LIST(APPEND external_project_dependencies openblas)
-ENDIF()
+ENDIF(NOT ${CBLAS_FOUND})
 
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 788db404eb..1e9f794964 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -47,7 +47,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
     LOG_UPDATE      1     # Wrap update in script to log output
     LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
     LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     1     # Wrap install in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
 )
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 6954be3b2b..1abd7b698b 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -64,7 +64,8 @@ As a simple example, consider the following:
 
 1. **BLAS Dependencies(optional)**
   
-    Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
+    CMake will search the system for BLAS libraries. If none is found, OpenBLAS will be downloaded, built and installed automatically.
+    To use a preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
     # specify MKL
@@ -99,7 +100,7 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+    sudo apt-get install -y g++ make cmake build-essential python python-pip libpython-dev git
     sudo pip install wheel numpy
     sudo pip install 'protobuf>=3.0.0'
     ```
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index 7036f971fd..80f031a74e 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
 brew update
 brew tap homebrew/science
-brew install python
-sudo pip install --upgrade protobuf
-brew install swig openblas md5sha1sum protobuf
+brew install openblas swig md5sha1sum
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index fd3aeb02b2..5e6350b574 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -6,7 +6,7 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
   export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
   export PYTHONHOME=/opt/python/2.7.12
   export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+  cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
   NRPOC=`nproc`
   make -j $NPROC
   make coveralls
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index bdafb145bc..6b43cad20b 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -4,7 +4,7 @@
 source ./common.sh
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
 # check websites for broken links

From 589bb84241baca1f53cee290b55ced88c331fc02 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Sat, 14 Jan 2017 21:53:52 +0800
Subject: [PATCH 32/88] Add openblas log configuration

---
 cmake/external/openblas.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 43ebb39cd6..29d17691db 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -39,6 +39,7 @@ IF(NOT ${CBLAS_FOUND})
 
     ExternalProject_Add(
         openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
         GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
         GIT_TAG             v0.2.19
         PREFIX              ${CBLAS_SOURCES_DIR}

From 35d7b17f79342431c7e392e4644140326d37feca Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Sun, 15 Jan 2017 00:02:57 +0800
Subject: [PATCH 33/88] Disable external python

---
 cmake/external/python.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 29247d5c3d..209e679f2c 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -31,6 +31,7 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
         "please use pip to upgrade protobuf.")
     ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
     ##################################### PYTHON ########################################
     SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
     SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)

From f7c8287df9604c176373c819462243644e10c05d Mon Sep 17 00:00:00 2001
From: liujunyi <liujunyi@sjtu.edu.cn>
Date: Sun, 15 Jan 2017 10:32:26 +0800
Subject: [PATCH 34/88] add lack dependency to zlib

---
 cmake/external/protobuf.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index c0cf2719f9..613614c0e3 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -54,6 +54,7 @@ ExternalProject_Add(
   CONFIGURE_COMMAND
     ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
     -Dprotobuf_BUILD_TESTS=OFF
+    -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     -DCMAKE_BUILD_TYPE=Release
     -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}

From b697154ac46f825d69c6ced3e585d41733b6247c Mon Sep 17 00:00:00 2001
From: liujunyi <liujunyi@sjtu.edu.cn>
Date: Sun, 15 Jan 2017 11:10:47 +0800
Subject: [PATCH 35/88] correct mkl env var

---
 cmake/cblas.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 4e1ae7dc81..26306f9849 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -16,7 +16,7 @@
 set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
+set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
 
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
   ${MKL_ROOT}/include)

From 94618d8776229254d52075f9aabd998d7ed6be8d Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 16 Jan 2017 10:20:26 +0800
Subject: [PATCH 36/88] Fix merge error.

---
 paddle/math/SparseRowMatrix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index d7dfb2fe57..c05fc98ff9 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <algorithm>
 #include "Matrix.h"
 #include "RowBuffer.h"
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Util.h"
+
 DECLARE_bool(allow_inefficient_sparse_update);
 
 namespace paddle {

From 66851af107c9eb3e6cea8485db467daf9b6cc078 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Mon, 16 Jan 2017 10:44:51 +0800
Subject: [PATCH 37/88] Detect CentOS 7 in CMake

---
 cmake/system.cmake | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 788db404eb..d40499c39d 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -21,6 +21,7 @@ ELSE(WIN32)
         SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
     ELSE(APPLE)
+
         IF(EXISTS "/etc/issue")
             FILE(READ "/etc/issue" LINUX_ISSUE)
             IF(LINUX_ISSUE MATCHES "CentOS")
@@ -31,6 +32,14 @@ ELSE(WIN32)
                 SET(HOST_SYSTEM "ubuntu")
             ENDIF()
         ENDIF(EXISTS "/etc/issue")
+
+        IF(EXISTS "/etc/redhat-release")
+            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ENDIF()
+        ENDIF(EXISTS "/etc/redhat-release")
+
     ENDIF(APPLE)
 ENDIF(WIN32)
 

From 014a72713009b52a6e6642adf2c360b2e1097ff7 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 16 Jan 2017 12:04:15 +0800
Subject: [PATCH 38/88] Add todos

---
 paddle/math/RowBuffer.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index 0edcefd756..dbb829c4e2 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -126,6 +126,8 @@ public:
   inline size_t getWidth() const { return width_; }
 
 private:
+  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
+  //! of std::vector here.
   CpuMemHandlePtr preallocatedBuf_;
   std::vector<real, AlignedAllocator<real, 32>> rowStore_;
   size_t width_;

From 741637eba41f66b51bd1764900e75cc7d5bd9ce6 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Mon, 16 Jan 2017 16:14:29 +0800
Subject: [PATCH 39/88] Add printf method to Status.

---
 paddle/utils/Status.h              | 23 +++++++++++++++++++++++
 paddle/utils/tests/test_Status.cpp |  5 +++++
 2 files changed, 28 insertions(+)

diff --git a/paddle/utils/Status.h b/paddle/utils/Status.h
index 3456d7b686..db1edfb7c7 100644
--- a/paddle/utils/Status.h
+++ b/paddle/utils/Status.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <stdio.h>
 #include <memory>
 #include <string>
 
@@ -45,6 +46,28 @@ public:
     errMsg_.reset(new std::string(msg));
   }
 
+  /**
+   * @brief set a error message for status. Use C style printf
+   * @param fmt
+   */
+  template <typename... ARGS>
+  inline void setByPrintf(const char* fmt, ARGS... args) noexcept {
+    constexpr size_t bufferSize = 4096;
+    char buffer[bufferSize];
+    snprintf(buffer, bufferSize, fmt, args...);
+    errMsg_.reset(new std::string(buffer));
+  }
+
+  /**
+   * create a error status by C style printf.
+   */
+  template <typename... ARGS>
+  inline static Status printf(const char* fmt, ARGS... args) noexcept {
+    Status s;
+    s.setByPrintf(fmt, args...);
+    return s;
+  }
+
   /**
    * @brief what will return the error message. If status is OK, return nullptr.
    */
diff --git a/paddle/utils/tests/test_Status.cpp b/paddle/utils/tests/test_Status.cpp
index e2c2ae537d..04cef09579 100644
--- a/paddle/utils/tests/test_Status.cpp
+++ b/paddle/utils/tests/test_Status.cpp
@@ -26,4 +26,9 @@ TEST(Status, testAll) {
   paddle::Status status2("error2");
   ASSERT_FALSE(status2.isOK());
   ASSERT_STREQ("error2", status2.what());
+
+  int i = 3;
+  auto status3 = paddle::Status::printf("error%d", i);
+  ASSERT_FALSE(status3.isOK());
+  ASSERT_STREQ("error3", status3.what());
 }

From 1e233171ff59decc3d179e30ab29e65735583747 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 16 Jan 2017 17:18:24 +0800
Subject: [PATCH 40/88] Fix merge error

---
 paddle/function/Function.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 2632c17e3a..f47d55a4ad 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -96,7 +96,8 @@ void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
 void BufferArgs::addArg(const Matrix& matrix,
                         const IVector& vector,
                         ArgType argType) {
-  args_.push_back(std::make_shared<SequenceArg>(matrix, vector, argType));
+  _args_.push_back(new SequenceArg(matrix, vector, argType));
+  addArg(*_args_.back());
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;

From 19cb2a7fb3e750eb2a81337bed063782977d7195 Mon Sep 17 00:00:00 2001
From: zhangruiqing01 <zhangruiqing01@baidu.com>
Date: Mon, 16 Jan 2017 11:26:29 +0800
Subject: [PATCH 41/88] fix default value of cos_sim scale

---
 python/paddle/trainer/config_parser.py                        | 2 +-
 python/paddle/trainer_config_helpers/layers.py                | 2 +-
 .../tests/configs/protostr/test_ntm_layers.protostr           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 674b5ac58b..8e9c40877b 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2628,7 +2628,7 @@ class AverageLayer(LayerBase):
 
 @config_layer('cos')
 class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=5, device=None):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
         super(CosSimLayer, self).__init__(
             name, 'cos', 1, inputs=inputs, device=device)
         config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 9b6e5774bc..23c0e9174f 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1673,7 +1673,7 @@ def trans_layer(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
+def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     """
     Cosine Similarity Layer. The cosine similarity equation is here.
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
index b30bbb2a4e..c1bfdf1b19 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
@@ -79,7 +79,7 @@ layers {
   inputs {
     input_layer_name: "b"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__cos_sim_1__"
@@ -92,7 +92,7 @@ layers {
   inputs {
     input_layer_name: "c"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__sum_to_one_norm_layer_0__"

From f8c9c889c34dd3530b899fc12523579802d4f582 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 16 Jan 2017 21:30:44 +0800
Subject: [PATCH 42/88] Fix CrossMapNormalTest

---
 paddle/function/CrossMapNormalOp.cpp     | 9 +++++++--
 paddle/function/CrossMapNormalOpTest.cpp | 4 ----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 92980c503f..8e7dc72524 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -188,8 +188,13 @@ public:
     CHECK(inputs[0].shape() == inputs[3].shape());
     CHECK(inputs[0].shape() == outputs[0].shape());
 
-    // TODO(hedaoyuan): need support ASSIGN_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
 
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index da196a699c..51f5da81bf 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -47,9 +47,6 @@ TEST(CrossMapNormal, real) {
   }
 }
 
-#if 0
-// TODO(hedaoyuan): Now CrossMapNormalGrad not support ASSIGN_TO mode.
-// Maybe all Function need support ASSIGN_TO mode.
 TEST(CrossMapNormalGrad, real) {
   for (size_t numSamples : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
@@ -79,6 +76,5 @@ TEST(CrossMapNormalGrad, real) {
     }
   }
 }
-#endif
 
 }  // namespace paddle

From 07787f72ba69dd0bbca4ee01f84c59fb34dc02c9 Mon Sep 17 00:00:00 2001
From: Helin Wang <ustc.harry@gmail.com>
Date: Mon, 16 Jan 2017 09:19:05 -0800
Subject: [PATCH 43/88] clarify and fix problems in paddle on aws k8s (create
 cluster part)

---
 doc/howto/usage/k8s/k8s_aws_en.md | 138 ++++++++++++++++++++----------
 1 file changed, 93 insertions(+), 45 deletions(-)

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index c776ba9eb9..bd9eee7296 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -2,18 +2,18 @@
 
 ## Create AWS Account and IAM Account
 
-AWS account allow us to manage AWS from Web Console. Amazon AMI enable us to manage AWS from command line interface.
+AWS account allow us to manage AWS from Web Console. Amazon IAM enable us to manage AWS from command line interface.
 
-We need to create an AMI user with sufficient privilege to create kubernetes cluster on AWS.
+We need to create an IAM user with sufficient privilege to create kubernetes cluster on AWS.
 
 To sign up an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
-To create users and user groups under an AWS account, please
+To create IAM users and user groups under an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
 
-Please be aware that this tutorial needs the following privileges for the user in AMI:
+Please be aware that this tutorial needs the following privileges for the user in IAM:
 
 - AmazonEC2FullAccess
 - AmazonS3FullAccess
@@ -27,14 +27,6 @@ Please be aware that this tutorial needs the following privileges for the user i
 - AWSKeyManagementServicePowerUser
 
 
-By the time we write this tutorial, we noticed that Chinese AWS users
-might suffer from authentication problems when running this tutorial.
-Our solution is that we create a VM instance with the default Amazon
-AMI and in the same zone as our cluster runs, so we can SSH to this VM
-instance as a tunneling server and control our cluster and jobs from
-it.
-
-
 ## PaddlePaddle on AWS
 
 Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
@@ -59,7 +51,7 @@ gpg2 --fingerprint FC8A365E
 ```
 The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
 
-Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+Go to the [releases](https://github.com/coreos/kube-aws/releases) and download release tarball (this tutorial is using v0.9.1) and detached signature (.sig) for your architecture.
 
 Validate the tarball's GPG signature:
 
@@ -88,14 +80,22 @@ mv ${PLATFORM}/kube-aws /usr/local/bin
 
 [kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
 
-Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
-
-Extract the tarball and then concate the kubernetes binaries directory into PATH:
+Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
 
 ```
-export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH # The exact path depend on your platform
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
 ```
 
+Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
 
 ### Configure AWS Credentials
 
@@ -109,17 +109,18 @@ aws configure
 ```
 
 
-Fill in the required fields (You can get your AWS aceess key id and AWS secrete access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+Fill in the required fields:
 
 
 ```
 AWS Access Key ID: YOUR_ACCESS_KEY_ID
 AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
-Default region name: us-west-2
+Default region name: us-west-1
 Default output format: json
-
 ```
 
+`YOUR_ACCESS_KEY_ID` and `YOUR_SECRETE_ACCESS_KEY` are the access key ID and secret access key of the IAM user created in [Create AWS Account and IAM Account](#create-aws-account-and-iam-account).
+
 Verify that your credentials work by describing any instances you may already have running on your account:
 
 ```
@@ -134,7 +135,9 @@ The keypair that will authenticate SSH access to your EC2 instances. The public
 
 Follow [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create a EC2 key pair
 
-After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region.
+After creating a key pair, you will use the key pair name to configure the cluster.
+
+Key pairs are only available to EC2 instances in the same region. We are using us-west-1 in our tutorial, so make sure to create key pairs in that region (N. California).
 
 #### KMS key
 
@@ -143,12 +146,12 @@ Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you alrea
 You can create a KMS key in the AWS console, or with the aws command line tool:
 
 ```
-$ aws kms --region=us-west-1 create-key --description="kube-aws assets"
+aws kms --region=us-west-1 create-key --description="kube-aws assets"
 {
     "KeyMetadata": {
         "CreationDate": 1458235139.724,
         "KeyState": "Enabled",
-        "Arn": "arn:aws:kms:us-west-1:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "Arn": "arn:aws:kms:us-west-1:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
         "AWSAccountId": "xxxxxxxxxxxxx",
         "Enabled": true,
         "KeyUsage": "ENCRYPT_DECRYPT",
@@ -158,11 +161,11 @@ $ aws kms --region=us-west-1 create-key --description="kube-aws assets"
 }
 ```
 
-You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+We will need to use the value of `Arn` later.
 
 And then you need to add several inline policies in your user permission.
 
-Go to AMI user page, click on `Add inline policy` button, and then select `Custom Policy`
+Go to IAM user page, click on `Add inline policy` button, and then select `Custom Policy`
 
 paste into following inline policies:
 
@@ -178,7 +181,7 @@ paste into following inline policies:
                 "kms:Encrypt"
             ],
             "Resource": [
-                "arn:aws:kms:*:xxxxxxxxx:key/*"
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
             ]
         },
 		{
@@ -194,29 +197,37 @@ paste into following inline policies:
                 "cloudformation:DescribeStackEvents"
             ],
             "Resource": [
-                "arn:aws:cloudformation:us-west-1:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+                "arn:aws:cloudformation:us-west-1:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
             ]
         }
     ]
 }
 ```
 
+`AWS_ACCOUNT_ID`: You can get it from following command line:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well.
 
 #### External DNS name
 
-When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
+
+The A record of that DNS name needs to point to the cluster IP address.
+
+We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster ip with that DNS name.
 
 #### S3 bucket
 
 You need to create an S3 bucket before startup the Kubernetes cluster.
 
-command (need to have a global unique name):
+There are some bug in aws cli in creating S3 bucket, so let's use [web console](https://console.aws.amazon.com/s3/home?region=us-west-1).
 
-```
-paddle aws s3api --region=us-west-1 create-bucket --bucket bucket-name
-```
+Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-1 (Northern California).
 
-If you get an error message, try a different bucket name. The bucket name needs to be globally unique.
 
 #### Initialize an asset directory
 
@@ -230,33 +241,44 @@ $ cd my-cluster
 Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
 
 ```
-$ kube-aws init \
---cluster-name=my-cluster-name \
---external-dns-name=my-cluster-endpoint \
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
 --region=us-west-1 \
---availability-zone=us-west-1c \
---key-name=key-pair-name \
+--availability-zone=us-west-1a \
+--key-name=KEY_PAIR_NAME \
 --kms-key-arn="arn:aws:kms:us-west-1:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
 ```
 
-Here `us-west-1c` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
 
-Please check if `us-west-1c` is supported by `aws ec2 --region us-west-1 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-1a`, or `us-west-1b`)
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+Here `us-west-1a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
+
+Please check whether `us-west-1a` is supported by running `aws ec2 --region us-west-1 describe-availability-zones`; if it is not, switch to another supported availability zone (e.g., `us-west-1b`).
+
+Note: please don't use `us-west-1c`. Subnets can currently only be created in the following availability zones: us-west-1b, us-west-1a.
 
 There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
 
+
 #### Render contents of the asset directory
 
 In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
 
 ```
-$ kube-aws render credentials --generate-ca
+kube-aws render credentials --generate-ca
 ```
 
 The next command generates the default set of cluster assets in your asset directory.
 
 ```
-sh $ kube-aws render stack
+kube-aws render stack
 ```
 
 Here's what the directory structure looks like:
@@ -292,15 +314,41 @@ These assets (templates and credentials) are used to create, update and interact
 
 #### Create the instances defined in the CloudFormation template
 
-Now for the exciting part, creating your cluster (choose any `<prefix>`):
+Now let's create your cluster (choose any PREFIX for the command below):
 
 ```
-$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
 ```
 
+`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
+
+
 #### Configure DNS
 
-You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And use command `dig` to check the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com
+```
+
+Use command `dig` to check the load balancer hostname to get the ip address.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+In the above output, both IP addresses, `54.241.164.52` and `54.67.102.112`, will work.
+
+If you own a DNS name, set its A record to any of the above IP addresses. Otherwise you can edit `/etc/hosts` to associate one of the IP addresses with the DNS name.
 
 #### Access the cluster
 

From 50afa35a59de9e9c2a31e873ab650eb181de801d Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Mon, 16 Jan 2017 14:29:52 -0800
Subject: [PATCH 44/88] fixes according to comment

---
 doc/howto/usage/k8s/k8s_aws_en.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index bd9eee7296..00bc41e5c3 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -2,9 +2,7 @@
 
 ## Create AWS Account and IAM Account
 
-AWS account allow us to manage AWS from Web Console. Amazon IAM enable us to manage AWS from command line interface.
-
-We need to create an IAM user with sufficient privilege to create kubernetes cluster on AWS.
+Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
 
 To sign up an AWS account, please
 follow
@@ -51,7 +49,7 @@ gpg2 --fingerprint FC8A365E
 ```
 The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
 
-Go to the [releases](https://github.com/coreos/kube-aws/releases) and download release tarball (this tutorial is using v0.9.1) and detached signature (.sig) for your architecture.
+We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1
 
 Validate the tarball's GPG signature:
 
@@ -224,7 +222,7 @@ We will need to use DNS name later in tutorial. If you don't already own one, yo
 
 You need to create an S3 bucket before startup the Kubernetes cluster.
 
-There are some bug in aws cli in creating S3 bucket, so let's use [web console](https://console.aws.amazon.com/s3/home?region=us-west-1).
+There are some bugs in aws cli in creating S3 bucket, so let's use the [Web console](https://console.aws.amazon.com/s3/home?region=us-west-1).
 
 Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-1 (Northern California).
 

From 8aefc30499e09729b5755fe8edfd32ba72a9baed Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Jan 2017 10:59:27 +0800
Subject: [PATCH 45/88] Fix compile error.

---
 paddle/utils/Status.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/utils/Status.h b/paddle/utils/Status.h
index db1edfb7c7..52f312378e 100644
--- a/paddle/utils/Status.h
+++ b/paddle/utils/Status.h
@@ -52,9 +52,9 @@ public:
    */
   template <typename... ARGS>
   inline void setByPrintf(const char* fmt, ARGS... args) noexcept {
-    constexpr size_t bufferSize = 4096;
-    char buffer[bufferSize];
-    snprintf(buffer, bufferSize, fmt, args...);
+    constexpr size_t kBufferSize = 4096;
+    char buffer[kBufferSize];
+    snprintf(buffer, kBufferSize, fmt, args...);
     errMsg_.reset(new std::string(buffer));
   }
 

From ae0f953eb03f021e5892bdea2009d7088a346e46 Mon Sep 17 00:00:00 2001
From: qijun <qijun1994@hotmail.com>
Date: Tue, 17 Jan 2017 03:10:53 +0000
Subject: [PATCH 46/88] add centos build doc

---
 .../build_and_install/build_from_source_en.md | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 1abd7b698b..924ccf0116 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
+* [4. Build on Centos](#centos)
+
 
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -151,3 +153,64 @@ export PATH=<path to install>/bin:$PATH
 # install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+## <span id="centos">Build on Centos 7</span>
+
+### Install Dependencies
+
+- **CPU Dependencies**
+
+    ```bash
+    # necessary
+    sudo yum update
+    sudo yum install -y epel-release
+    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
+    sudo pip install wheel numpy
+    sudo pip install 'protobuf>=3.0.0'
+    ```
+  
+- **GPU Dependencies (optional)**
+
+    To build GPU version, you will need the following installed:
+
+        1. a CUDA-capable GPU
+        2. A supported version of Linux with a gcc compiler and toolchain
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    After downloading cuDNN library, issue the following commands:
+
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
+
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+
+### Build and Install
+
+As usual, the best option is to create a build folder under the paddle project directory.
+
+```bash
+mkdir build && cd build
+``` 
+
+Finally, you can build and install PaddlePaddle:
+
+```bash
+# you can add build option here, such as:    
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
+# install PaddlePaddle Python modules.
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
+```

From ceb2d39799fa600ff77fdbe019191f846829e916 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Jan 2017 12:49:05 +0800
Subject: [PATCH 47/88] Fix bugs in config_helpers unittest.

* It gets wrong command line arguments before.
---
 .../tests/configs/run_tests.sh                     | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index a37eb6439e..c8a3b190b1 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -2,16 +2,18 @@
 cd `dirname $0`
 
 set -e
+PYTHON_EXEC=$1
+COMPARE_PROTO_UTIL=$2
 
 protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh $1
+./generate_protostr.sh ${PYTHON_EXEC}
 
 . ./file_list.sh
 
-if [ -z $1 ]; then
+if [ -z ${COMPARE_PROTO_UTIL} ]; then
   for file in $files
   do
       base_protostr=$protostr/$file
@@ -22,20 +24,20 @@ if [ -z $1 ]; then
 else
   for file in ${configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
 
   for file in ${whole_configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done

From 4837665d227f0357e3aac51712e6bac7a595cb72 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 17 Jan 2017 17:01:44 +0800
Subject: [PATCH 48/88] Update dockerfiles for develop branch

---
 .../build_and_install/build_from_source_en.md |  3 +-
 paddle/scripts/docker/Dockerfile              | 31 ++++++++++---------
 paddle/scripts/docker/Dockerfile.gpu          | 31 ++++++++++---------
 3 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 1abd7b698b..7d963a5a6d 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -17,8 +17,9 @@ cd paddle
 To compile the source code, your computer must be equipped with the following dependencies.
 
 - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
-- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
+- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
+- **Python**: only support Python 2.7
 
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 1522be023f..13a5758f7b 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -4,28 +4,31 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark jupyter
+
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y g++ clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    apt-get clean -y
+
+RUN pip install --upgrade pip && \ 
+    pip install 'protobuf==3.1.0.post1' && \
+    pip install -U numpy wheel matplotlib pillow && \
+    pip install -U BeautifulSoup docopt PyYAML sphinx && \
+    pip install -U sphinx_rtd_theme recommonmark jupyter
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j4 && make install && \
+    cd .. && rm -rf cmake-3.4.1
 
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 
 ENV WITH_GPU=OFF
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 RUN mkdir /paddle
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 09f07043e2..bab86114fe 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -4,28 +4,31 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark jupyter
+
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y g++ clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    apt-get clean -y
+
+RUN pip install --upgrade pip && \ 
+    pip install 'protobuf==3.1.0.post1' && \
+    pip install -U numpy wheel matplotlib pillow && \
+    pip install -U BeautifulSoup docopt PyYAML sphinx && \
+    pip install -U sphinx_rtd_theme recommonmark jupyter
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j4 && make install && \
+    cd .. && rm -rf cmake-3.4.1
 
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 
 ENV WITH_GPU=ON
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 RUN mkdir /paddle

From a5c1658d455008bbe4e4a5a3075bbd6eced30f28 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Jan 2017 17:20:13 +0800
Subject: [PATCH 49/88] Always create protobuf_equal

* Because currently Paddle only use protobuf 3.
---
 .../tests/CMakeLists.txt                      | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 403aafabe9..93dd7796c2 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -9,17 +9,10 @@ add_test(NAME test_reset_hook
         ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-if (PROTOBUF_3)
-  add_paddle_exe(protobuf_equal
-    ProtobufEqualMain.cpp)
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-    ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-  )
-else()
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-  )
-endif()
+add_paddle_exe(protobuf_equal
+  ProtobufEqualMain.cpp)
+add_test(NAME test_layerHelpers
+  COMMAND
+  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
+)

From 1c5a7c431690ba6a981ac9f72aacf556da1bc1db Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 17 Jan 2017 17:33:15 +0800
Subject: [PATCH 50/88] follow comments

---
 paddle/function/BufferArg.h      | 1 +
 paddle/function/FunctionTest.cpp | 1 +
 paddle/function/FunctionTest.h   | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 9c792c6bb7..84209265ce 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -192,6 +192,7 @@ public:
   SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
       : BufferArg(VALUE_TYPE_INT32, shape, argType) {
     CHECK_EQ(shape_.ndims(), (size_t)1);
+    CHECK_GT(shape_[0], 1);
     numSeqs_ = shape_[0] - 1;
   }
 
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 03c609b524..fdf7e631e5 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -85,6 +85,7 @@ void testBufferArgs(const BufferArgs& inputs,
 }
 
 void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  EXPECT_EQ(inputs.size(), 1);
   check(inputs[0]);
 }
 
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 412e3a7d1b..24e7a36a43 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -172,7 +172,7 @@ protected:
 
   void initArg(SequenceIdArg& arg, size_t batchSize) {
     size_t numSeqs = arg.numSeqs();
-    int* buf = (int*)arg.data();
+    int* buf = reinterpret_cast<int*>(arg.data());
     int pos = 0;
     size_t maxLen = 2 * batchSize / numSeqs;
     for (int i = 0; i < (int)numSeqs; ++i) {

From c21595d6d9ac964763638531687080477e6c75ca Mon Sep 17 00:00:00 2001
From: cxysteven <derrickzy@gmail.com>
Date: Tue, 17 Jan 2017 17:33:24 +0800
Subject: [PATCH 51/88] bug fixed

---
 demo/traffic_prediction/predict.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
index cec35dce11..2dbd5e8805 100755
--- a/demo/traffic_prediction/predict.sh
+++ b/demo/traffic_prediction/predict.sh
@@ -25,6 +25,6 @@ paddle train \
     --config_args=is_predict=1 \
     --predict_output_dir=. 
 
-python gen_result.py > result.txt
+python gen_result.py > result.csv
 
 rm -rf rank-00000

From d79cac158d5436b987c07e398b029e26210f2c96 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 17 Jan 2017 17:45:50 +0800
Subject: [PATCH 52/88] Update python-matplotlib in Dockerfile

---
 paddle/scripts/docker/Dockerfile     | 4 ++--
 paddle/scripts/docker/Dockerfile.gpu | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 13a5758f7b..01261d7a2d 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -9,12 +9,12 @@ RUN apt-get update && \
     apt-get install -y git python-pip python-dev openssh-server && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y g++ clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    apt-get install -y python-matplotlib g++ && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
     pip install 'protobuf==3.1.0.post1' && \
-    pip install -U numpy wheel matplotlib pillow && \
+    pip install -U numpy wheel pillow && \
     pip install -U BeautifulSoup docopt PyYAML sphinx && \
     pip install -U sphinx_rtd_theme recommonmark jupyter
 
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index bab86114fe..108cfd9c9e 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -9,12 +9,12 @@ RUN apt-get update && \
     apt-get install -y git python-pip python-dev openssh-server && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y g++ clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    apt-get install -y python-matplotlib g++ && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
     pip install 'protobuf==3.1.0.post1' && \
-    pip install -U numpy wheel matplotlib pillow && \
+    pip install -U numpy wheel pillow && \
     pip install -U BeautifulSoup docopt PyYAML sphinx && \
     pip install -U sphinx_rtd_theme recommonmark jupyter
 

From 832bb6a745975c8327da396a48303f407649337c Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 17 Jan 2017 17:57:38 +0800
Subject: [PATCH 53/88] Update docs

---
 doc/getstarted/build_and_install/build_from_source_en.md | 4 ++--
 paddle/scripts/docker/Dockerfile                         | 6 +++---
 paddle/scripts/docker/Dockerfile.gpu                     | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 7d963a5a6d..6cd2183f48 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -101,8 +101,8 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential python python-pip libpython-dev git
-    sudo pip install wheel numpy
+    sudo apt-get install -y g++ make cmake build-essential python python-pip python-numpy libpython-dev git
+    sudo pip install wheel
     sudo pip install 'protobuf>=3.0.0'
     ```
   
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 01261d7a2d..28d7696c8c 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -9,13 +9,13 @@ RUN apt-get update && \
     apt-get install -y git python-pip python-dev openssh-server && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y python-matplotlib g++ && \
+    apt-get install -y python-numpy python-matplotlib g++ && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
     pip install 'protobuf==3.1.0.post1' && \
-    pip install -U numpy wheel pillow && \
-    pip install -U BeautifulSoup docopt PyYAML sphinx && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
     pip install -U sphinx_rtd_theme recommonmark jupyter
 
 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 108cfd9c9e..b1344f275b 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -9,13 +9,13 @@ RUN apt-get update && \
     apt-get install -y git python-pip python-dev openssh-server && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y python-matplotlib g++ && \
+    apt-get install -y python-numpy python-matplotlib g++ && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
     pip install 'protobuf==3.1.0.post1' && \
-    pip install -U numpy wheel pillow && \
-    pip install -U BeautifulSoup docopt PyYAML sphinx && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
     pip install -U sphinx_rtd_theme recommonmark jupyter
 
 RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \

From 3d01c60e25c1b5874e6854ac565646d6ad9432d7 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 17 Jan 2017 15:44:09 +0800
Subject: [PATCH 54/88] Stash

---
 paddle/utils/Compiler.h | 0
 paddle/utils/Status.h   | 9 ++++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 paddle/utils/Compiler.h

diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/paddle/utils/Status.h b/paddle/utils/Status.h
index 52f312378e..cb66e4b225 100644
--- a/paddle/utils/Status.h
+++ b/paddle/utils/Status.h
@@ -24,6 +24,13 @@ namespace paddle {
  * Status is Paddle error code. It only contain a std::string as error message.
  * Although Status inherits the std::exception, but do not throw it except you
  * know what you are doing.
+ *
+ *
+ * There are two styles to return status in Paddle.
+ *
+ * 1. Return Status
+ *
+ *
  */
 class Status final : public std::exception {
 public:
@@ -52,7 +59,7 @@ public:
    */
   template <typename... ARGS>
   inline void setByPrintf(const char* fmt, ARGS... args) noexcept {
-    constexpr size_t kBufferSize = 4096;
+    constexpr size_t kBufferSize = 1024;  // 1KB buffer
     char buffer[kBufferSize];
     snprintf(buffer, kBufferSize, fmt, args...);
     errMsg_.reset(new std::string(buffer));

From ee59d109373e1417460e10227692548698bbcac7 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 17 Jan 2017 21:08:02 +0800
Subject: [PATCH 55/88] Add gfortran in docker

---
 .../build_and_install/build_from_source_en.md   | 17 +++++++++++------
 paddle/scripts/docker/Dockerfile                |  2 +-
 paddle/scripts/docker/Dockerfile.gpu            |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 6cd2183f48..fbca407030 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -16,7 +16,7 @@ cd paddle
 
 To compile the source code, your computer must be equipped with the following dependencies.
 
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
 - **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
 - **Python**: only support Python 2.7
@@ -96,16 +96,21 @@ As a simple example, consider the following:
 
 ### Install Dependencies
 
-- **CPU Dependencies**
+- **Paddle Dependencies**
 
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential python python-pip python-numpy libpython-dev git
-    sudo pip install wheel
-    sudo pip install 'protobuf>=3.0.0'
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential
+    sudo apt-get install -y python python-pip python-numpy libpython-dev
+    sudo pip install 'protobuf==3.1.0.post1'
+
+    # install cmake 3.4
+    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
+        cd .. && rm -rf cmake-3.4.1
     ```
-  
+
 - **GPU Dependencies (optional)**
 
     To build GPU version, you will need the following installed:
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 28d7696c8c..127ebf26c2 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get update && \
     apt-get install -y git python-pip python-dev openssh-server && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y python-numpy python-matplotlib g++ && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index b1344f275b..a729e33d46 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -9,7 +9,7 @@ RUN apt-get update && \
     apt-get install -y git python-pip python-dev openssh-server && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    apt-get install -y python-numpy python-matplotlib g++ && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 

From 70dfd7f13732ab10452809ce68d394b0b300382f Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 17 Jan 2017 21:37:30 +0800
Subject: [PATCH 56/88] Add automake in dockerfile

---
 doc/getstarted/build_and_install/build_from_source_en.md | 4 ++--
 paddle/scripts/docker/Dockerfile                         | 1 +
 paddle/scripts/docker/Dockerfile.gpu                     | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index fbca407030..27b478a0fd 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -101,8 +101,8 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential
-    sudo apt-get install -y python python-pip python-numpy libpython-dev
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential autotools-dev
+    sudo apt-get install -y python python-pip python-numpy libpython-dev automake
     sudo pip install 'protobuf==3.1.0.post1'
 
     # install cmake 3.4
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index 127ebf26c2..d46dd48f74 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -10,6 +10,7 @@ RUN apt-get update && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
     apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y autotools-dev automake && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index a729e33d46..58070b2ad9 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -10,6 +10,7 @@ RUN apt-get update && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
     apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y autotools-dev automake && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 

From 878b321a128bd405e0cb66efc00e800ea03d0fad Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Tue, 17 Jan 2017 11:59:26 -0800
Subject: [PATCH 57/88] changes wording for paddle on k8s tutorial

---
 doc/howto/usage/k8s/k8s_aws_en.md             |  92 ++++++++----------
 .../usage/k8s/src/pserver_and_trainer.png     | Bin 0 -> 71688 bytes
 2 files changed, 41 insertions(+), 51 deletions(-)
 create mode 100644 doc/howto/usage/k8s/src/pserver_and_trainer.png

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index 00bc41e5c3..10f5a2ef2f 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -361,20 +361,9 @@ ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
 ```
 
 
-### Setup PaddlePaddle Environment on AWS
+### Setup Elastic File System for Cluster
 
-Now, we've created a cluster with following network capability:
-
-1. All Kubernetes nodes can communicate with each other.
-
-1. All Docker containers on Kubernetes nodes can communicate with each other.
-
-1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
-
-1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
-
-
-For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
 
 
 1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
@@ -391,57 +380,71 @@ For sharing the training data across all the Kubernetes nodes, we use EFS (Elast
 <center>![](src/efs_mount.png)</center>
 
 
-Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+We will place user config and divided training data onto EFS. Training task will cache related files by copying them from EFS into container. It will also write the training results back onto EFS. We will show you how to place the data later in this article.
+
+
+
+### Core Concepts of PaddlePaddle Training on AWS
 
+Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume. In this training demo, we will create three Kubernetes pods and schedule them on three nodes. Each pod contains a PaddlePaddle container. When container gets created, it will start parameter server (pserver) and trainer process, load the training data from EFS volume and start the distributed training task.
 
+#### Distributed Training Job
 
-###Core Concept of PaddlePaddle Training on AWS
+Distributed training job is represented by a [kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
 
-Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume, in this training demo, we will create three Kubernetes pod and scheduling them on 3 node. Each pod contains a PaddlePaddle container. When container gets created, it will start pserver and trainer process, load the training data from EFS volume and start the distributed training task.
+Kubernetes job is described by a job config file. The file contains lots of configuration information. For example, PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc. This information is passed into the container as environment variables for `pserver` and `trainer` to use.
 
-####Use Kubernetes Job
+In one distributed training job, we will:
 
-We use Kubernetes job to represent one time of distributed training. After the job get finished, Kubernetes will destroy job container and release all related resources.
+1. Upload the pre-divided training data and configuration file onto EFS volume.
+1. Create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
 
-We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc., these information are passed into container for processes to use as environment variables.
+#### Parameter Server and Trainer
 
-In one time of distributed training, user will confirm the PaddlePaddle node number first. And then upload the pre-divided training data and configuration file onth EFS volume. And then create the Kubernetes job yaml file; submit to the Kubernetes cluster to start the training job.
+There are two roles in a PaddlePaddle cluster: `parameter server` and `trainer`. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
 
-####Create PaddlePaddle Node
+<center>![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)</center>
 
-After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number), Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be scheduled onto another node with a different ip address. We will improve paddlepaddle's service discovery ability. For now we will use static ip.
 
+Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job.
 
-####Start up Training
+#### Trainer ID
 
-After container gets started, it starts up the distributed training by using scripts. We know `paddle train` process need to know other node's ip address and it's own trainer_id, since PaddlePaddle currently don't have the ability to do the service discovery, so in the start up script, each node will use job pod's name to query all to pod info from Kubernetes apiserver (apiserver's endpoint is an environment variable in container by default).
+Trainer id is the index of trainer within all trainers of a job. Trainer needs this information to do things like reading the correct shard of data.
 
-With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as it's trainer_id. The workflow of starting up the script is as follows:
+#### Training
 
-1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
+After container gets started, it starts up the distributed training by using scripts. Each node will use job pod's name to query Kubernetes apiserver for information of all pods in current job.
+
+From the pod information, the script learns the static ip addresses of the pservers and assigns each trainer its own `trainer_id`. The workflow of the script is as follows:
+
+1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
 1. Copy the training data from EFS sharing volume into container.
-1. Parse the `paddle pserver` and 'paddle trainer' startup parameters from environment variables, and then start up the processes.
-1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. Trainer with `trainer_id` 0 will automatically write results onto EFS volume.
 
 
-###Start PaddlePaddle Training Demo on AWS
+### Start PaddlePaddle Training Demo on AWS
 
 Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
 
 1. Build PaddlePaddle Docker image.
 1. Divide the training data file and upload it onto the EFS sharing volume.
-1. Create the training job yaml file, and start up the job.
+1. Create the training job config file, and start up the job.
 1. Check the result after training.
 
-####Build PaddlePaddle Docker Image
+#### Build PaddlePaddle Docker Image
 
-PaddlePaddle docker image need to provide the runtime environment for `paddle pserver` and `paddle train`, so the container use this image should have two main function:
+The PaddlePaddle docker image needs to provide the runtime environment for `pserver` and `trainer`, so a container using this image should have two main functions:
 
 1. Copy the training data into container.
-1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+1. Generate the startup parameter for `pserver` and `trainer` process, and startup the training.
+
 
+We need to create a new image because the official `paddledev/paddle:cpu-latest` image contains only the PaddlePaddle binary and lacks the above functionality.
 
-Since official `paddledev/paddle:cpu-latest` have already included the PaddlePaddle binary, but lack of the above functionalities, so we will create the startup script based on this image, to achieve the work above. the detailed Dockerfile is as follows:
+Dockerfile for creating the new image is as follows:
 
 ```
 FROM paddledev/paddle:cpu-latest
@@ -530,7 +533,7 @@ And then push the built image onto docker registry.
 docker push  your_repo/paddle:mypaddle
 ```
 
-####Upload Training Data File
+#### Upload Training Data File
 
 Here we will use PaddlePaddle's official recommendation demo as the content for this training, we put the training data file into a directory named by job name, which located in EFS sharing volume, the tree structure for the directory looks like:
 
@@ -550,7 +553,7 @@ efs
 The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
 
 
-####Create Kubernetes Job
+#### Create Kubernetes Job
 
 Kubernetes use yaml file to describe job details, and then use command line tool to create the job in Kubernetes cluster.
 
@@ -632,7 +635,7 @@ After we execute the above command, Kubernetes will create 3 pods and then pull
 
 
-####Check Training Results
+#### Check Training Results
 
 During the training, we can see the logs and models on EFS sharing volume, the output directory contains the training results. (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node)
 
@@ -689,7 +692,7 @@ I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7
 It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2 core 8 GB EC2 machine (m3.large).
 
 
-###Kubernetes Cluster Tear Down
+### Kubernetes Cluster Tear Down
 
 
 If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stucked on following steps), and then use the following command:
@@ -700,16 +703,3 @@ kube-aws destroy
 It's an async call, it might take 5 min to tear down the whole cluster.
 
 If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
-
-
-
-## For Experts with Kubernetes and AWS
-
-Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
-
-### Some Presumptions
-
-* Instances run on CoreOS, the official IAM.
-* Kubernetes node use instance storage, no EBS get mounted. Etcd is running on additional node.
-* For networking, we use Flannel network at this moment, we will use Calico solution later on.
-* When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/k8s/src/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
GIT binary patch
literal 71688
zcmeFZWmFwq)&__sK!RI>I|NN|hv4q+65QQ2xVt-n-~@MvV8MdBTX1)nx{&VvzV4n`
zYu5akwPxN`Z{<?Aj?_8(?EO5?K6Qg-q=ez&u;9SJz~Dti1mwWLpjg1bAYx%&fvzB}
zqXYkd+sO&@ft3y8>;iwlT8pUKfq|jegZ>9!GQ@WVZkRMtP_b8$lwjAlvY^%dVx?z5
z>ug~S+zkfC>C6s%v@o#OC2+Pdx3pt-<|2B&gB|z`x=cqz@O+EC85fa?qznPSm8}5*
z6D=d{M<Q-G0s;a~+b@Rfasonsivxdg5gFUtTeH*AIXOAeIx)~%*&5M(Vq;^Y`$$hm
zPfr8fL1X7)X|L-{V`)eHQprE+5iqdRw>7c0H?guL0M)CjXXRkeMMMN@=%4?+^wZwN
z@Lw%i+Wmbj-~s7C-_U)c{YdxEx`CpcpsVcsRu<N_26lEp{oG8P&v*W>YyWEJ<$f7s
zD|;(o6l_iOMJ??OY=Oe|x}e^1)Bi2~e}2cm*ClOhVgNk#bLmfiOaISnfA8m{13mcv
z7>Jiqe!dC}GdCP3-9M*{8}7$C{UR6`FPNynX9Z{Q{q)z)Xo{EtZT2yv>N_pOxStgi
z_H*=<&oQZ(_u&=LuqhowaGSg-LPDV9^C2iDU9GaOFbKXKc@5(lRyEC<GZVHOZgY$<
z%qGuoEnOL24QFs&;dt{wknsNh$N#etKth2c;>*F@hk`{z^!ffRIiqJBO`XsOMOvpi
zY~==p2M)m-Dk2PQ+Mjs6cWzbCdymg~h+LQ-f@Fo?djKAvpqY9M3Zu8Dr^a~Zhi)()
z6o)i~u$a!ppGgsEX>Fo|YR=!aDw$AXtl(vDNZlc3AJz!l9kQKSE74I1d`8KTaOW2F
zk3<M4Afn~O5DWD5Pu6-qE9w!Vz$in|sMmJKkQU8*d;Z+U;XY5{gOCEsc6?wMvUvsS
zhfI=65R^!kAL>VK81d_OGag2$qfe_8Y<e3jPgPY<n5?c=5VxMASN^R$*GtESUB$;_
z?GA*^+ljeU>_Be>je1i6^C9&HqhS$iC#eVphuz^Ted@KbJ$|FXtMJSI9k>V2&K+M5
zX`q_5j7U2yR6Q-LZUWoS9Hf;-88p)pU<_pRU}PF(_3}e-FZg*qegfs&q<ZnW6EWdn
zqr8zuuL4@>6bhqh0$a5e`acu;Sc5u{W|XMUAqSy>84*XY7YhdK#|Zuql^ew?N}7dO
zahVV&XFL3~9m5{Ge>Fysb@bXpdq!RU{DBWBO%9Y^FiKkIKmnEZ!`q7iN^^pHzJGi3
zLGSe&f-J;JQy(;{*KN({V4<@SM9-yn?!F$u3rn`FzaDhxIC+AAib4b){>EFE9yW^C
z87a$_m~(5aFn3f??J4kLRQvl4v=o7i$jZm(hwt=^Jbvi#f+q(adBsn4>s_cAFz~Yd
z6?j15U;^Gvq2MGk$4W4W`%@(_0z-}%@9jiRVThG1TuJXe-`ww+Uq=D~({XSAm<%OC
zzzGZ%GTzzFhAc2pdTnaT_u>#FSy1~|{tsf5Br8x5mrqR(fBZh#g^?gpgWq}Rw!Ko1
z+!XN!if+IHMNj3s5r9XI`i}#O2<Qm``!8Z}6oUUaU|b*+EdB1$D!%?#XW;|tR1m@$
zsG0vb_%guau?cwu_X++>t5LiJK+RA&U7Y`=RRW(Va-dUv(#RYlfStU97)oUkj;B;S
zKB!y9$j2Idc@Do1z+C9ESKwfP>c0kNqw6&cov1gI7_-k}lav1Ta5}Jz=w=~xkARCn
z|3dtM;neHs6Z*e1Dn7XqyrpkK^mL<=lY?<N90`btb49o&Tun?&Zl)Hij1Vsm<`b<v
zv%>fT5eUi2$(M=PmnJ4A+|SMiM;{Fh3^o%z1a@B|;855;_^Mzu=qizbqgaywJ@1g_
znD$>$tsnvQp4A5!E2$)dh?9C$=i`poV-0pjZ{O2*Q!wB;uYgyZO(Pf##C@KvG#D$C
zCtvV-dYGv(Q*QOpY_$Jy*tkclP_Bh*V`EdGR#W6+G4q4)_F`{aS=V~jATjw4@b>f7
zcO9?svs9}L^Oq8}J$G9ZJwHF4k4sC=c6SRXl&BzY4kij0%H>&)31aH5bq60-UiJ^M
zESnFQWl%ESULJ%-;Bus#7%$q5aAA&rm#z!d2i(=pRB5V-@=^AK*Y#>3WQ3|@@KMW^
z7E3W4I_1j!*+_5J=dV#QF>99Uz2Dt0M%gJLU%j>dzRQfR{hq^-nzs3vkA##o*=3Y#
z)@jLwoJ=xhLw!Qy_26$S12WL`dV>}kux`$HI>i5JA=@ZHG1-aHx%oI28zPotIsQ~=
zo4#*otwo<S>Xtp5_DZUT>9<{Jv`-heT{X+KxTY5x?9Se<j_^F$2lI7lGSk!lR&1~}
zNNhFb;%am}_!Faj-~7kaErncrM)PXXDYL00-{bzIH@`%Mzh1y+HJ)4yv%IyZySx5p
zIQI5rEd{*<u541h6vqLhUD?A4PAu!CeW_;Clmk$<X&$pD@J{2-=1<}G#%`3JVvCm1
zPt!$;gPKb7;UOWm+M#@XHBLub^3uKEEjFFZmzrl6TUEl7Tz(7+2?*5NH(%B)>30Vq
zAD&){exyFA?7dT5eode8h3(uU`v}n^bLY;V4`Nlydk<n7`V9G(Lx&sbS|idSz6TEe
z^XlRiC6`&l)4K?*=5-2&4lo<b5u}}MU5z;)EsG`=ZrrDP&|_MC{9P7Fc5AROEZ@^t
z-uC!sIPrkEOc>u|;-SZVuG&OPn&A@D_QroS?t^k!+Fk_5p*1`n=S+NBn)yKM`EPMD
zI%ei6(gf17o>0tGPm`@>ucw0|`R@_^Ja_w5EpxT;*hTiEjy)*dr0S{?i*1j$gB05K
zB}J?<{-2*7A2jun_2x!x^+;C_udYwlTjrMPZE}n4cYl*Tt%cG#22X~VT&;9`r49C7
zE0WC(=$D5Mq%^8-dCy>&#9zW8u>=_3wiN0hn$sQg8w1KI%KS3JGS#Y|4(s78DUC(n
zZXWNB79Gw>-X&<EN328R>xthAW8Zl}0fEO_6EGyBht)=Fs7%TdvA-d|*PA!kEeM@V
z=>6(;Ll*`57wn)L+XrA(G=<KzaTjg|TnAWxcgvV4Q5~OfJE)oFj9t|3yBKVwgfRCH
zFe0qE9`)q9*@(^PKRjFQ3QToZWOfcIT^Ox38cNwrHW*fH$U69QGRlv**{@6$D;7_g
z>^^I_7VibrlyZK!*yNTsk6El-?x-Ggtrpkkh$*j(F4UR>m$$QHtbcL}nC!J-s+s!k
z?^X}Da(8vW%jPNjuRC`Hs@#<4vc?0BtSggC1JjnruYtB+LjyDIHoDIy*1HO3bUUDG
zD>avE7;2(7K;`50fj_4+?}JJ~kOA~Y#jmv9-d^k{WUJ<rSKVTrz@W)4zoX3Xq;MZB
zkPdHj*t;de#>e=Rxz@N>HW>RI_RliEV7ld;2R&@y)A`O=*Y(`7#P}Jrg}tyJY2Bh@
zs>dHsbaCYMCw^2;k8tz3YS>zyD8RwYYA&69`~3CQ&iMC8QIyS4Bc*n`3zr-z(MhkL
zqU7ltbtSiFNYF6AjPclAQ^nzndl@7Pfu_X=##*y4ugifj5qjh$pukBJG#)Fed<C#d
z0n{+Z5oaGpgnO~J-4K&Au<9hgq0FZE%+;9XiYL-;9hX*n+&NOK`~buEI7jY|>0p`P
zQ%kol>XPs}?M9W}5S=on9!#VQG^gtbrE5(hUuO7Bjl)T|xUILl%B}qU`(t-zrBDf#
z#F(ESR39_iuQaxCZN(rS#8}#S`>@}FwfEOF+sXCTt9*7J+Kjj6%P<PG+qjj=G+<|G
zA(u~}^>$t8G5|~x#rqL3I%oN{#+9M3$}e%V0~Npu&Pvpx5W*6?waifFUGlJ5+GUCj
zx}mICJxQPR2|Pb$!GTGey{=(|K_g@0=?Pe@4KP$R3;rOV&r2LxP}yxMJ&!;Fy=k_1
zKX6?Ig`A1M<v92KzJ!YNE;r<z4K-Z&H@PTk)yi(P_w4o*=M*9eJ{wff73VY~W+49w
zBLH9u7TmHoOPtPv&np0l)2Dq|&DScB6o`(#1O&3e5D6$=2#4-9YBI8<T~Ci!Jnjz{
z(_3mZMC|rE-hd0lHhXJBZ`zqBmCio7ad)>~NOCfi$>~Dxdbtm4V{TfR!fXMzXqt1d
zH-*xYsfAB9_M@88G-ILGVm`~VSltj#&T<E`ijq&kUOiMG^}`PfFC}Il6^t2zHy}{*
z?C`mH?=@^Sk0Jd<Uxv*Ez!%Z4VNpd^c92!-EH&GIPA>b=)S?GkV$1df+%COiHh<@O
z@yDR`dVbT_<-%yI_@dT))_0JuZNdEOE)bw**K*Y9IR)eKd*#drVJ<ERk!3;-rW!2-
z(OJ&6hQ9jc`?Ipq$j#qWP|N9;2mrX;89;|{kC*^R{Yaeff``Cty~xPm0mPjvwGzqm
zXswQT-8Ab5iz-p6W?kUmGXPvO*0O5cC0lUaEA>Rj&QMp7SS5HWkcNlNg?m^lpO<q-
z$DWgrokysA;}SB9SaElCB%Bras93DDS~bFLr(V5b(kJb^SbN~sC5;?=4n*FhTICXz
zXl>;DXLG6mVdjrQcU7r-d>th!z91}<CIJu#@)7P{&tI@WFnOQpej2=d_lG$Sk^8!R
zL(`@QeH4bGF`lojyO_~z>R0pa<IUBwSB3qqT<pG8d8L)hh}p$nwp^>b3lN4~3>!wc
z?$l>47sJukzKO<?Mg6GB|Jm*X`7rgXzyFiEgTm_BqSNvX&3bvc=|v8`=>$ay5cjNF
zGQA$(D{DHSS6g^lT$gqWfGmxI2#DO!U)TS~2xe`*dU`Fgl7u=+gM8HX#9dicwRp8u
zYk@t?b)}(Sn5Be0!hKiLFCPl0UYM-}XKJwe0f;Rvvr2kiuvbG@)O$w@k4)wx8C>r^
z>G>%mUv$`joPH3Yqmb@=-5&boMZB`sCq?f2WBvuFWe*W?aXXd@w<}A|r~9)*V{+4>
zwGXwKVF3DZnH;Cb)UXC0W?g@~^JuXckUOkuAsb+~_3P8sc(uF~TaUSZ#o5hsgb)q|
ze5`N3ttLehviLtybDRc*?C%SD+ljc`aGZV`wd$y&f^72BygAYbyIr%M5cHvjYfI~N
z$2k*P*^l)C!MeY@E7JDTot93p+EcAsO-`#llj#jK)_5l-Vr2@*S~Qy+X>C|dhkK;`
z_kN4hIZR{H-IIA-AFm!Vj-bk#*6-8Lc@8GfNaDxmUJB-<)eDTR{dQ#19yb^aFv-#H
zjUbcc>v3h?<<4X@iRqtq)iYU@y5eNpPP3Cp2E}!T*|a?m!7=^Qo`wSP-dO-tI;GTd
z=(KxtUUdKI@zAEp4ZNrSLrYF*32(<y4y>dyh7rzj#TrvZrxq+G(|Ts}nU%6!a^~;%
zDz50<!<g*S^sWC1NqpV6Y&4G_k+vFF==bWHZ(=uB0BDlw`wwUmOag)?J;8b6eCdrK
zA6`zkd%O7nxMX|DOzy?I;QixxN+e3fqK(H>nks`h62Ma1#7nR=P{^i85ridMJw@wC
zljt_)@D-DcQ!kOx(Klm6NNFp~KT3)+qB~`jy0gs<z_(QR<vY|^EiX~1?YHh0v%Yc&
z$6@=8mp@(XSHa{?>KdYMvQfA&+F1$RH~C!}KheC$CQ^J}(Lm~GsYP>bud7(H1t!2C
zL#+3M&j5U~WQaU02scD+$0qJU0NDJ2ccWgk-2{p#jaesxr37Y&-`p3+U~{xSgQqXC
zvGLSEQ;J`_(do#t*2&l}Vp`j4(OwGIffjJc0-sG58*8+Mvomr5KEm?IQfswr*;zdW
zfI|lCKgJdSR5P9|iichP6aT*3Ur0)p2Mx>j!VgvIBR3CcaUrTy_{{GDS>llmecQ5m
zk%{u40Cz><Vm%94^Q*;12dV@b^;fcR?*oh2f5y=2`Q$#SW2J1PusIGy6A2gLG`MAN
z`0_#MiU97v_Z#3_y=8;s%ij2V_vC<FED|5oJ2+-Pvagyp7~mD%ywhcxU-7B~geK`q
zU9VARD}8A-8?Oi9C?(Nf`NX{{Vr0|GflFkzn24#p&Qq%~9sT;OBK4*MYh}d<V7JF5
zDlyOQiO+jXE{%R1hy1^AKY+MVvUJ+(u~GjM+5hW<H5~Be|3%5$1(@;R%^!TB?Et|T
zt+UdeDAVMqX8*bLvPN+*LG=EO<5C=`8zKZ@AnyA!b}-u?0X#f_kMJ+U1v(6!3Z?zR
z3q`V=VXW1~POQIBy@Z{AJNE_y<g6E#(+XhR|0$Wta6%O<P~b(F`#-n%Z!~y?6*<dx
zoz-XC;%{63`}>eUpz5Bk=u)8faYzt=7l$oUEDm5<a?bTTPg(@HBFLj!LhwA@GaNP@
zaoZovp@(C$2wfd6o@~TQ>-9$9_IOa$3O{$uIt=Ec+Q27>p?RP-94`pk#5Y!L56nRP
z4S)m~=o3Q4>zDrb#(8^QF1sDITmx{o<*><gk}?dR2d^stzH2m_ALLwJH_x8Kj0nLw
z?5HjA`F0Q&&=-nV5bxi=-<d4N-1?O)|F*OO_IciXVnkK|vhXn^JSYf;^L&*5`fP-!
z;!8hDtP()NzJ2?)Qaf+i751J%Z0&f)2vo$H5nRoZ)%BLm83o1)B9=_Dv)SeB*VjE@
z;YR}0J@Cq#js2x+ch~~kJ0PAj-dMF<VBDVXuG7vpyVz;CpY#}y=fg8wE(8IN4$bS~
zB9SAL{N?eorM(ldvw4K~31QzsxLxjlHZ?VkHpc14cw6HP1P7|$9%seh4h%rtTvMm<
zu@WO}2O~}x&jkwK>;4lub_baLdm!4QfMl{md*~`!_sdzM(Su-kZ02sjlP9a+9IFDJ
z{^Ym|F}3*V@eUCX!CeME`n(vNJ;>l1?Yc6e!SW@NeZ^@TpbQ2R{JHH($;eX9sLJX)
zH=y|#9rsl<oLBut#Kic3c#zd5w%e7a#&8`wNZa%_ox|zp{%n=x<!gL*Z^NM!LRwk`
zAgtB(a{zS1=6L0^*Q8|1q2d&C89=c(4JJ(auGs3#riahJ^ZCQE0x|EC@hE}*t9|v)
z=`AMerhFxphC^8QM_y0-?(UqxqTN(qIJ%HzS@cuZ_L$r&P_HXnu9N^-nokTb@W^A+
z{=kzaeNfuYj*foB_ffAl)*aYUtMh!gjX9?4;QptG<Ol9U@t*%Zo9V!qP{bD(ifDX;
zPJV%gl^CuL+V_cU0u8H_nvK<WYIwgiw7ni2GCD#W0{1TGt(+3anSn1`BLG?){_Ycz
zG}o+|`9RQV7b)Jx;^xI8UF&tu-EDDs>wx^Ul+2^rW5aevRqU_geG|t?z-KN7P?%`U
zA-b8Tvpd)xEH+ISN3gmv+^qF4k=|u%=6thBEC)EL@@%IDz%6^K&-SZEF!I%cDczj$
zV*JZn>CPXrk5E0@WJCxSUOfEK_61Gd!rJc(p&Y|a^{tkxB?dDA35hT&E-tQRTT+^@
zdwlg?=Qi#y#zmUWJHy=_;!;Y*&OXPfqv^EhP$rqm+>n{u9q1=cO(Bs%3G0sEisyd(
zIaV2fSBaD`fI?*SCuH-hcyn7Sr(#$TBAipIs~#20{_W;Kyj}35pV{$BN7=^4muY|;
zefgv1d9$f?vHwPqw~r@Iwhx4~q#XN~4C>^BE*Y8<Xf^+!$ZIFwQ*PY?gQ>}6I`Q%1
z+(*8)sD5vD03Nq^%VfT2XJ@&st#F%Gr4N;oMeKGy<AL1A`<V$-Y!t0%YEZ)sxsEqF
zm>q?ocnWR`wU59X{+-r#8*-#t#EQ22(@>Xslup`DxkG0nq}dXS35fFFr5v{D)T;X^
z1QlFP0ESvW&L#cDEX1VA<L)w}-*PixO3}cVl999Lkx9JPzy9)Q`3NAwlWP2T#c(jv
z4VQCfl8s5=Ze&d(%H>)Ia-`u?xGa`w9E(bp#$M-0Pwx+#iP^Gk*X!G?X2DTdQ-J>V
z8GV@c&nZPQ{q*4}Z42g0o=Wvpq!kS%OT<<$!aUiFGxs5W?y~73f%C7E(?PmB526$f
zr!sNA$p#?O9u6lSJLE@PZ&-i7$=JC>oTn7v>}{&D7=%|ODL@Y>4_aqWyk7oz5Q)V!
zz%oyYw<WoByxO(sVp^jcslVFke`vpqdLm{uP-qZHQ)|n=SM2+(-ftkB*lWE48U;od
zh&J`KXs!)1W}QBeuN&)pU#aoGS#%Ck0QjCCPt6Pyu&nfm+hu<A1jN8c^PV8f2xpuU
z=<y|8?oA`=jd=rRjs-+<egNc!6ITPSdaEIu0!cz`D23@0fJ-@V9Qht@FDh&|#Xuff
zuS1tSRoUURa9G$wu(sjdcK+Hnq-#)Tho-=M3Xc~zAT#(Emo*Ng(<YE?aS{@Cuo4ZG
zODh&BeA@JUxKLj2*ShzspC3h?rCfIydh_gwu>iC#Y@{h;G<n(uMkbwM{+I`1R7B=%
z2T!5vq<`Zj3TrD==OrflL$+{TZkosCtmWYA&lbazm;nOJHiiA>j*hj{R(vRmq|z=;
z8EwiAIHsW%4TQSK``<XrQ98P->BQb9-a=Pw<|S&;@8)<{#QxgbD{s>(cr?Z*Eea{z
zGig}0!Va4bhqL)S2%jxM=yOck?%)HYDX4tQGOb!~^2Oth4AbR$=K*FfE8`%pN2q4$
zcNB;x49Esi;()tvpmr95AQa@y30(n#NN<U1)$zDAk8Y<QY?K8F((CMNAbS4FR>DLj
zm+}ixj1Q#wVrZE1iP3lzES^f4)|xYiqF+AW@nFv7_rBlRFb5*wX}YFL)x(*B0TzC<
zm1^~C51wT~N9%wGh=5y+%*+^%)ROzQ8RbA={kUE+KA|U&(>;^u{ybp4C@YORk`K{k
zZyEzMT{h#BxIG$8WM~kUo!8+hw1M;nz;~Yy97G7+hd1eM+5wEe_5dEa!wx|A1q@^^
zw@Z6WbZB2u<<Q{H5Q~P@U4|}gY|~l7*n>Y~xq<yteH5;m07_4!P;M?V7Qh%9*Y1X+
zs@)7QulQg->Ah*vV8gjo)JzVG!vGlQ21DW=-{W9ApzmmeV=>X2#Zv5Pp00m>^zWhw
zOu+8lX9!{;0#;lE6j9`QiwmF^__)K|-yOBR<8oy@pOhz&gQacUA<`eSx!wJqN?o(?
zqvtYS_ipLOxQF)S?=^tKc7>o*xmUq8Cs<eJ{{k?FVTlMyT5*FCKF(UfL~-jP{;U#&
zOV;_oS+h!wY5Z>F?P=D_0Biu$Oh^~onIE(+v(6zljbq~*4*dpWrSlCPwK)T|^ovrF
z10XDzv0&Z0G|3gf?=T{8O!ljK+IB!=CY2;5y>mi#@XIF}lOVuAKeQh~JxSF~^IWnY
zmo%gbHPZUHU8+{Y*W`E*m5~vF?|I$PLfX?>XM`=8!sz1{&m_7Qyu>i*QHGe>*dlnB
zJ8wbp;^tj|*w8ol?Z<~Tc}PGh;Fpt&nf-0TQAKOc@sD1mfO<9LqFat5#haT`U|fw1
z8yf;-0bGzprKEn6XS(bC1`$oC3`)*(H4jZ=Y(TcJ<B%>f3m-wSAu|0f#kPsYXfZdD
zuAPKB@2cMyMHppWO;#x$h=6;~d@?{)o^+1LbC2YDR9(&J#AbsRvYqL*ggd)ArEcZ0
zed6~5HxdElt$`uPh0W>wqtJC)(+PSrv8domm+Kimy}(CmpC3!=G-B|HV4nhoKJU)e
z6pPn>1IX!0NJt3%?qspD6OCr$gqia3>E@tjC2qH-@?e_FR`O5sfJH69UL+VXMnXDn
z7e?9V>#UrDdxGxpuxLo5I04bOtglZ*UMnrMvHP@Z@|j&;nzeg_Ye@nsBxVGkrnQxW
zE{~WBcNn1S0UGSpnH;hE>|RtE8#Hd?a6qiJCQPn08^Q~4CZLA(yWlS|pSErR7GN0w
zd^TNR#3dx`MZcgV?qgrx?K=S2M>&(*LqpGXPf^)juio|Q@IWrmv{mjMDE0*AoN623
zVKA9ne5N-TKuZ*GI*|)8_Nc(K0f->Iv;!r}DFH0I4`PMxjZ5!mXyIoqy)837mR$N$
z4bkBN)*e7^3)m5P&%_}PM!-+bXa*C7f&|<nNHQw71PANt{Pm5TvJ;SIwZ-Blk~W#C
zHbDgtC`@pI9$!TW5d<tO`?;?m{}PaVeTy;#b+I=c$WF?Yk)v+dJqLw#s>nYTV$grG
zFQu~71Ykmy$}hpL)-3`N>9*hcqA7)IanMj;jDYp(o1|Ix<})u`6_EyJr#T-R4ltK(
zP|f_<*EF4g#Nx9qN<c&eQDZtuFD!dYn0*+=_l?3KQV4f8L9clNKfHF94CG#bl+rXb
z`7iojAcY3778w-?dkQrby}#D`Z#)J`4rI)QyE;ZMvg^N{;l#ZM!aGs)UH)ZNFWMOB
z>=OmbAc{!7<^Ynq|5ooy!wQyY3GW6$TGoFYerN#Q<1=NS0U-B3V$Huk_;jLV+2;P}
zR>}Bp+=}2EDE%lm-ZA{2^#WJW5_s)(seW(*g6s*h^W)9-C)e|xHM1xhTyzM)!FJ0_
zb2%u?8RmleydnzlWAgA(uI)yo7w`_tng~#{;dHkMPAC}no6SU(MpJZ|VkMaK0L^1@
zopqrdPT7XlL<CfT*M%f|doEF{H(<dk?qS*unwQnXe6!RZbjYsPsl)|67ljaz>_j7$
zU>1bvK<Vr6_rViVsQBb6@k3GdPz80zCB8vlNeP39n0Uyhq^bQb8c0p^fR8wQM#uh+
zkn;ZVD{VG$+NBElxE-($z#d}O0vPlkq`aBfx*Rn)Dg`K`uV?o6&4GxvMajxnQ?;K+
zm}QrLmUp(LBo+-18gw*NAj<bZoxWr4a#(%WxmEgX;vx{$3os~Xq0ziH$fK``9%{(B
zxYD)bLf`z6wG`f#*cK-xBJ!QW7EWeKB?teG+}A{VT=!k(2_H~*{xEbu0=;C72Bj_2
z93pP&k*ymvgtD`hzySA^1Aw?8fmO>A#0qxpMjBE|N;vk&qK#*t-yV%}+(s+YVDn8Q
z($NFJ9;qMa$w5}siMsMf)!DG*GlU@7OEMqudz3685~njt6M-t}bZ%xL!^F?NBIGOJ
z_13bS_vcp{t@o49BAyK#39uwMqHcE-3PXrng5r5OdV2aWXx>jOBU=#=;OPwMbm=UV
z%n=Ye%%Ttoz;=M_jR#0;O$rygpLl?20s{_+5?&E(ok&Dki2S!^k$9%i?Q{c#jBMbI
zIcML_3`TIbX~3urfyYJ*0ovzVg-h<#z~nQeoIwW7u{9v^cZK_slUVP=9E+b;=xgFQ
zo!+0y;v7@};xRZ12+?#TiWqC#X~G=b=pF%6kR}R<xJJVq!X7xVJCM;K53V>(o$8pr
z-dx(3W`sqR6rj0-BGK_Gn<)o>!@%$bX6DSGEkL68Z;KQ_mvPunaG3ww!g>RG9s@Eq
zg#WUztp7h#(#JS3BDD)(3dbZ-$*^CHAZ2BFCBlrP#zq+Ez0+_A)VK}LS(R)Vf7+JZ
z4&Q<N2yYNlU;ltN6~`N7*|DIo2$}?~%`f>~@v$@(KSfe|s;8jFs&YRi5LgZpy?DFc
zBw!Ov#3$>e2q?aob_PiAacO8Rmxk-c=_};op}GKc?bMEHd&ZwAzdTd}Kz^(<sJ-**
zsJ(l>NWb3*5p%v^3!!1Sx?yeaduZv94i-C>KI~R2^@}sLHV5<s8*`U4%m3n9c_Dg$
zG(foCtAVY5bJ#C`M$iB@5J6+L6#n1bXI2tWwd^n6SsYkDG@prywWDDlNbf<#bH~ZC
z0zAR!I5gfP=@-F_0>!7KdhO}$5PKZaAfOOHJ1`ar%aWX};C;zIq!Lq{dbtgT0iyS#
z-{++0b!!m<G+5*A*iYJ!RYu@%Fu~leATF@M(m5bK4#9sufZd-mSPQYN{7m)5@Awwn
zUcT<LIp|}d?$iC7cRZi>?y?>Jx0Hs)O07cj$&}1W+mknwnK#UbrFz)WBBs$M7MZf<
zr(3L>jXG@~t2S_@@>jpdOKh>m@?ygZw`-<~pXMZ9a5(Dd4#^(5ySJwJWA~CGLv(`|
zCW;v||0Zf<_KD(%&bW4_{#o@{!Q3hlWBqk@mWxID4KDN31Jh->x_7lEHZra@hQXg&
zp&rBUIgf&fM3l7(?Le0*?u)@^s?6CD%+)9jh-LDnsN$^biLkV8-{}o1oU3aSXPVy_
zy=9iTg}Gy){Ves9P@a~Ls8PYx<<+0`>DTdJ%+X$!<TfA;cDdLMBL`CXFy5>^ZJ@SK
zwAl*3!$&MZH}GYfg1GntE{*{yDNunFh7%L`b6N}B^Y&AnJs*U%jOWdrRfNO+CunEg
zs7Xi5!i6e+J+9N0hlEZ+dGUV8@=J$Zl)a?e?*b++lD8xSDAKZml5N61yW_U<7fV=L
zN_%FvZBGz%;!lkk3F-kxFUA$X>*z|k)n@m`@fYq9847x0(AU(2<n{LaO_7o@sJp3O
zpF*IWvcbqzcVAcRp9xo({1I^~ERf5~p2BuZkYwnsW#h>iTFlI;u0D{i97oU_-c#1C
zqtH;u(L;?&j}D2ec}muAI<H%NAeL2%BhAPONpayX?~QP2YlyvAYNR!ONI&!HEfb6#
zyKF@+m^*}UdY=JWkwBdJ3W~)^MiIYZq417g-*ahZpln)rJVCZ68-XhpfnhU)X@_Hk
zMwAP5d~C8N;Y7zy)FM;DVu4^X1HbEray})|ImT`;QfzPW`1Vkdu-18nu<T`&r{PIz
zJ*k*L8a$Or7!s`f3S7Mw%xM|godtn{2C=X<?7CA<M_|zTF!m&EM)c2ABK{Q-b#dHd
zPM<6l3NCg$HZxP}^InzI_U1avD7t%8B*HzuaWie+-@%Dl3jTjs>n)h3o*r9O?3*JO
z#ktVd4pVM2ohg}3(`{l^OgO#$iY}sUbXaPKZovnIxqTq9G4wA76SMY;pMux;;`X^K
zPa>ry#SR#(1k<am7!pX5<-E2#MOAKZh(+cb@Ty*4@J>%}6O(19HC@hS2EOYN5>N;W
zGs>!%0H#YJ!5?N{lT99tjRl)3#-XiEcrzdsOB&=D+S#CX?7G<OhN1%)d<+JudjcYC
zR^GHTanbYyXib;Fa}r@7DTnA0a)`_O#-dYsXL?l*T(-hHH@D@@jK<lX{&1PyY<I&~
z83;I-_mL$&wYs|1((^xgH=32L*xGE)&vxXlbKT2pPMy`t*ow2O`E+D5<v=G}u;Yx?
zcZsEFUa;espzfx=9g3k=4%)>A!e3-jF`Oe5%#7I%nUS(r3moBk{N<>zSUE1`$})E)
z^DC6yjZu}Jgv^rId6G$`#57j#_0X_gan&1%Or4tElXIU`-IPczSH16F4>U>8E}VYg
zuCCvOk1Ly8pdbfUC0}@KxzP4^PvI{TI1`CK8?~2DikkST85I6#UwgloHksAR4S|n8
z4d=*SCG{A(X;rLbihbE2yTS@UH&)95wv`_w$CHUuKd@M8shRKLPxIBgi--rbBc2!)
znG3$w7l|fl!+YhW)Pd0+>TZVp5^l&=7qx7i)V2`UwHaHz)gE}qiTgX<zA~>}hBSyr
zwhf!I(cx!RM*KkA^8gq_?0LFPhqXx@pvp)B4TyflLnJ1mzjn}FysrDTZBZD)6b>|C
zkzh88$JC@{RT)#$j6YUN0`JPXMlSsJT0mEKTKj6ALn}h8xSCr%j`BgKC6r?CLF9{l
zn&?nMDTDCZ%7pZ|1VHpbfD|0S`xM}@-Lx^hK2Q_3{CR^Pri$Cz8U=w|dB;G?q*6bv
zCIrV@u9j3qsobRfndrKU*l(@dj;PBh+iCiF1^AH%^R{+Y7zV!%oCc)rFYLxEyHCH=
zn18)7e0v=66u7m}$@#paa2rKLG{fqZcEix>hzbvdeP{^#$y+6apmK!xft)We8y^`9
z*L(6c5(GYp&J(L6dVI4ju4=ALz5bG`W>oPOO_m4Gq=}`WN|EsQ&MCP7`0t7IgxGi!
zoY1mNRvfCIoa<8Cu+Qkn`wT{a!>SMG6RaPh&rl%#W0CS3&U%vmI&?xGakW<VYKcWx
zAMvTe*wi*p%|pj`95t>EcfX#BZ;yW%JiOw)GY(}(m^A)i*R9<5JFh)Iyt0ODet;$n
zJ!BE@IZf0~2!(JEHKHGVEmq0&$vdt{;Z37I={vUiNjX<?1v8u<&U`gOhxZeu#S8Pg
z``y<AczXfqJ0~eoam5o-=uZSEzlGH&ME@jswILgQldq7*iNc#}#rR{akoQrE!qM$T
zI6{HJ2dvF+M8lRuiZpACpS~yOGpYJ8WH99$Ut|mUuDYTE=`h~4_pg|_xU57SV6a4p
zxZd{t{t_BYQ}t!!JT_8&)tsqRN^{W0*usTE=?&WA;b|Ygqsg<I0&+rRD={O-Y36$j
z3!CgENW+0rm#Z>GH(#1!B~w+WjNBDUY96}MckR|pCZp4%ofc&Sv%pD-oHS(&0%)*?
z9UtdeQmt!K^2OpP51O#?i46Xvfh``=UkznA+Wc*SHH;U0nH@*yYZoby72=JDWHJrP
z>v5gMAd29E3>G4T6{;RmI~AXut7&&NVTp3yDu(D(t~I}qJZZh$ud?W=u=|ewN~x~M
zeh22<ODrvU>b&gx0OE^c#)}7meEJ?wJC9bb=Qojs)5?g)d?&FqF*od7zK|eWy$c(H
zrRrwvo`;J}36r^K$<!#fLtFj<3RMeO&ajBZm|M}eqs0&x{xTMIMM~zQkc5_HKE8NY
zvXRH!e>CB32iwQ;4rq=ym|VvSKAD;3DciVDX^|PFWF{o!20DpqB-N~@00Zl>HtU{T
zU2H;G$dg__AU+rG$O8K3!V?(a+&G{3EF0tX9wei!EB^-iHYgNCEBM(@(zTbH#CN1$
ziY>8Wl~9V+=VdQTu32>Axdxw@Z%nhZ#W1vITkI)8wN{#qp;3uX`CoUzg)I)|WA_9Y
z8lPio_XOmdv8uj|2hdrWhSi0J?Y^#H-bnG%v4X0hw(r}x$wH_%#?DLUydNCV;x-4p
zm%YxkZKul?kbW%Eo#KZJmVWfa*=}nUV!!b$iFevkZHN$`gcHn@c%LOly0>suZj>69
zuqGZj4Y(x)a+$Mp?6*K<O4RAShjlSFW{jR(Zc8KQlHnT~FJqa#n;v5Eq`QxoT&~mP
zYi~8EG);3Y)#iC;>JNYe7_Z4_(r77WiJOTHQqed}x6shrC7EC;ipY9f#i53yOi3@7
z>VAg~OvG|fa+F{nlsdbV{{_zO+y34Nj7Ck3-=i7-qb%MRdBh|6DGTC}w}3(oF_7>E
zPk4IVI|8@#l#Vk~`E!LHoAosAl*ih-4+m+bsOM=BCvW5f^Yr_I1Yo0MixZ@}!F-6C
z#R8edtXz4kL#-Fj$45+#N!P`y*UpzFEBJefpJNN6)&44fmJp)2w0sS(aDQFg>$2$V
zg;n&4RdG{Ie%k50vg)l><srzohjvQ$l`5)j0z*$7oFk~b?sBUblTRiv`LKtfGZnfR
zO>-3-A}+8_mBDE6@Su7-+cop?M!$|hh9);8nvG5yuF67i%3mmvYqcyzCAW}mL~Jp1
zXuQq#`<|%KfYkdD+?fWQ91gBMq|&>EkIv6=2!W8maRV?XpC1Gtj|B&%dj`<ga`|v<
zOc{}D+-I7mv9uz^Nl1z{B2TXkinDxp2wQ0tRCA`VAu}NZ;A_%cEaSNruvye<n11bz
zkA(OO8a$Ocu&HMFVuI7PLA`ZBHJ;MAC4mds_)_d1=l+)KKFFzW8kIv;U1LHxh|-Yz
z4y`~*5{=Nd6j_lVfuiW`&AEOBKgRghyj4x-&!UUhFA`omlCj5C?@#HT$BQ@N#8Tm8
zDCsIo2T@g9CM`&@D#I2g$Zu;CS^{;KMzOP|utRy8v8jZ33_APs4;H*94$LG_$6z-{
z|6E=nDRcK*#bSv$F8^UuIo<uL!d=8OFl7}Q_Q27T8CarCgj`9>*z{;n9#tN%mKGt0
zVnAxK?V6jUdNz(t7BWECQm*C@lqb|Rh}WBfuN_E51<8=XgBV`66KSgV@cD(SSib_L
zp!GA@#K@6kywLFn5+Af!RN$a$0z^c?Qy8Hw6X>2IuU=o`EJbQ$>sbp_4@GiU%#~0g
z7TyWS(yQaR6zLJm#$kO=gA2ofPjhsAJSar}d4~^?E*+D(**)HRyCWinF|Q|U%4h}N
ze8p|u6Tft!_~Gm)QUwm%n3y&d;xE&}3QOG4ET3u?oE;BrYoWTX8#4p*oR8Trn71Cb
zKs#g@k}ON>>Q2ySR7yWDL_36wLcWd(>)aU)06-z<ak%;J=ChQCu0B%LL$Mhd#&gt)
z75OK26))XmZ>27(=0YS#Op7Q&=^^2fr3#ahehd*S9p-b~A}okR0K8RXMCHZ&dN;Y}
z=kzL9H^oTgJc+=--mDx7HU$JrHA8A9z^8r?i`v?mcv7x7cYX`3joQ<m6O*5-70FPK
ze`%i^#OXmG9|UOvZ``@Ynomv%PW<L|wqPiR116mo_4n)#v$M06>LiNx2p?uXv{C>v
zoX>@+E}(y%IH~YA6e;rS=~>P@^>wXr+92I-fVE#1MyIMDCsVRa1rcQzaY9Wa^#oGN
zz!5t>$l34`+el!d%3E7p%vQL>D%O7P(rCkRlyrzIS~WJCu9XkDs@Ya7AU*?hSyqKI
zjrV}i8c$ITtM@MV$us-I%WE;>0;o_A{c>P@)f%H7VzahDE;tR=ucz?`6%`&JxT)@+
z#l<GmMsP9+z!Ji3bkXat+{`qs;Rkb{GMnOF7V2(8pQTiI)lrZo5SH;dtlu-_MD6)%
z^3dnUsd%S(_wCF;SQT?~$T^kpf1}_Rn>Jm&9Mn}-5H5_a$dd?C;V5O*{?0W88=r${
zqe&rEU>*u$=OI4c_x57IfKr@5B?R#a=SIY#a*b>TmJ;Y~g3BVRGn<2LjntKyqRS%7
zlyQzqc|K4Tq2;wwpa~R5J>g<A4e&&=Jp)M@2!N>h2;l~cmUzz@Et*mY{OtaWsE#n;
z7|}vi+D9HNs)w!&6-*<KTp?+2r;D6Naj`z297{u2srU*1gM5<#H=D#uW)%>l0Wfm5
zZ)-kGnAH~7auva}L^kA&Ds_z4Bt+_RfL=xZK0Fo7V$ON{EaX{XAov9%ptH#`lKLsW
zaQf9Iw&OWgrVy#>rHghIZCv3v+$Q$m)aQTXzv6U&>f=`W^`}MKC3(~BYtcjDE{trZ
z584H}N<_%>#K5V#Kc@!RY7!$%D;zfA2YqY8ee>!G#JG5!QDpG3(Ytf_9#oB?VpBBB
zF}={g_hUpr?Q6GzsXyMldd*D3^wnza%*Cd+!-0_V36+%+gnO+K07WE?VH(GT6;)<1
zoUqFq^A)iFL}X^ts-(;ASIr)n^!e*K^=xpiDrc!@W`d*zHE|^vDBRk!7J0;=tHUDk
zD8g6nxxv)uX9_=dx|NNxq?pDp|5twjUZ?^DYd>j%CTR#u2-CciW(pQOXRzx#E7;UV
zMVycjdD)9{+3&QYIsSEs2u^~3`K=$ZPeDCYb?Y8KW6LG<zRE{%lO|>Y;VFNplk#Yl
zQse<9<D<>5#5nJ*B|vk`3o%T{y9w-DOd)oF^ATHfucC;SN$U(;2kqhj4@3^4%2fC!
zLV?iLz50^YkF$nY&{6Fqk6Hvtt^a;}2Pv1kPvAH_|3yaxiUPBA4EyEkzieP?-vu$r
zm!f_UK5uQ@Q(_?n28CnrgejAgc^#RN(4o7#aZJogUNU*sVXzy|T3UEe!8-vV{F`lO
zDWz8$v#P(fs;e>G!i2l#0eKbhe>iUdZJHG$&IJ(&dF@bm&4{d;e8`^1S}V34Tiguo
zrbqFKUrRe|Uq*vaKxf!YS#Ql^vchyfwC8uR|IxDUocif(SI=)e?7vQm3uMeoX+MNt
z%KY2eAM`#Jp-#oK<oe=(|8?fw0hAiz^DRHF7xwgTBS--S7!@3NsGw+8H?oK%{KDqL
z9e5&@^?R>|*9d5(8MO|_1W^fF<&!JF;jrYbOMjp}YZ@`UN-6H1ucLl_bu&<<xNnSd
zpvV)ssHRd4yd>sWm+{tu9;P&w@G)aZd|6u>FlhWLzSf^RY@&E!Nk4kweY}>;a&=hg
z`P^QT+@4hK1AXSh<>uXZ;T<de>d*}=2+d>b)}fs|6U=TKOubfROf-cF(sCC*jYx{?
zELIfNN4pntFcEUT79u#$CtuR%^1uh%5@K!bv>3KT1w8B?LZ!01u`R)T2B!X;e&v+v
zwA^T0BKd9D9k>eR#lfdtj}dlk){0ookn)$Y<kJ}m@v3m2K&4P@K$u%P``}3P2nkZ(
zTBwW&^q-_il&r!AN=H6}1QNfJH?QkTI)Uiib!adiU2E48EF)K|h0WOf4IhYEEZX_%
zMxKE~US8o3dl&>3(?SBhiGs<ErLq;Xg#}(?UxPwu;;k0rF5X1n56bMTpW(~ZmC<hk
zuGU9w(UK2POor+*(km>cSk3SE?7o#G@8QWqH(1q3CFaNtGVJU9Wn0#ulmQOoERJas
z$_!mwCylDPs(?fq*js0@&MZUlland$X;Mg6V~UnDI<Uh7$o(aY4@_$t9SoW!6c{uR
zcqp5_sr&-SwjnG9{HJDfX60Z!4@s#RQ3O}9caP+7C|r+85X~Q93Y6*U!aObqN7@Na
z95XX4^JcQOln1-^_sSs{9^*3)NX(0QWZy)xXuocKqGeG&vWVObJo5RTS@Sbq-S=6V
zTQP=E1#pMSM`3CrxgQQcRj8ehm{DIbQPPfCv(+2qQ7|f{L={X*w_;Ukw?e)Dz)|v}
zxZ<P_nJUDi*l4|trHt<`u<dg^^S1NJFEO4VgG`<fNqOPM@m#TZ=BxQCbFA|j=568X
zhCBJVvaxQ>pWL&Y7K|8Pwmi1<G*sO1S4Flu>nPNBIdqORY1Fzp2DoR$rmGd^-TE`v
zz26TtTI#s+!@b-Cl$@FavB<U8hyTIJ8Gr=cYM9FwMLMs}py5hCxi>>2c-=DWL^<LP
z27)5OJuAZX8pF!`CtjwCV`tGg^inl&WMsY73NYv9ObSj%aD0T3;)P=c^Mbq{_!l+r
z5<HeXc(M<>ol;hz;P;nbDp_$966UhSF-DoSWhu;%c<wO9W4{}ZP7T^QtRTnp=km^4
zTdyMWW+-Tsh!DAmRy9Y!&fUk!ip$&Xp^Rsg6*r9W{-RgOl_(Y6UtZf7Bo6iOKxnZW
zOEQ{fUM?Zr{cY+amQM{2;?G-Ae4<u4i|UrEH*r=a3D(FzJN1{;SqSu(9ZMb$^Ee0K
zrF(=*87xt#!e2+p)Ns=y?EeHfgU(GKxwTH0UM}PA?H@0_4jYwO+R|dh{!YZhj~>+W
zb?_+<N2GCPqn{IUVZM=!^oD#I(d%v2UerJ4Qf1p>%bf|5q9Av8O<1@t_*0L>LAdhV
z0f|}q4h8FyiM@=m$gzI3j{TWL0ojElFb_+=lMP#A1mdV@@U%HaG)y{3cq|n-KY^S0
z!&PDM8b=vfgP&2lyz{lZ3UM`(f+OWe2Ajn(MC85f(`}9E8%DAO1?R1jaRd(_MKF=-
zwT2_x@zQIhrujO^Tp?pAKz+|}Sjx(#px<cLbThjXGcTb~R)geUhG5T(Q(rnWA2MCp
zD=(>iEuKl&+$U#loSeecqqqLP;R!wAC^C}!yrkdmB@7Wp0YGbBY*;wejYoJlWhe}S
zkiGS7%7rVm>4A<Y6gww726~EPG!3P!VQNUmbkhoVY>Rfu2MxSEUrX{?MvJ(9Gv{4>
zW0ygs+?i3oncwKLP4h#nC$JSWu@__>BZefAi6d5ssuu^mc8ZOLcJ{7o<g=Z&A+xP0
z?@^87sBrH*0|7XES`h8#s<SRPt990D($qz~oMSk>7pBKAPNGm4s8rV$sAjaJH|4Lm
zJg6q}mpy>e);LUyIOa2`GNJH_I?f?#OcO;GdG>7Y3)ChVrrE6)W?Uy}D2qrcHV2~<
zPP0s7HkG@iuG}q_AnNM_<LQ6%H&htHe>AF%_gs9kMMuqW(AV>BI=qzV=fZ!;GcWJ2
zrIwu-=+0cw@pbF@$8Q1HmFInvrHV2D5bN_k;%1nFo&YsLXR&H?X;Bdr()u7vwhxLG
z0Y<fKbn4qsXGQ7?`v=4+KP;Om@_JT{Uo=GdzHRA-R|kS8)-9rvQ>THO?fC{XE%rqX
zMQ$BjmOXyy<>k?16~^wH@)@FsXMl1{6_Q}CA1v2877T2!i6v9!(T`&Uh(D${QU%_(
zmng{^09KdBYFNZJ`hz?x;azP^r_&GEoE4cu#X;AL2eZ5FV@c&b*!X{Fikrvsxdr<(
zgwlhG)NSvSEVAa4FIAVOw}xoUq9n0f&L+qM#n5GcQfqxlzq?cC#~WZV2YXCLD9~5<
zD}4|GjTo4!i@|A=)INnf&q}RH@4GK^D#{cnSb8|}24;)m(j}DAY4bjKeLI--l|wID
zU)e%G+$u_UR{3x!)z9@oxed2kJFvEVNHVVKp>k4?RBCRHiu?0v>@}rBbfX2l<F8Eb
z%8PFH=)W>B3;-Wk^-|bGgpo$Ix7<C`2N^6Cy{}{MJ0A;w^X6G{oy8nw0y|Qx7_mYr
zS@s*T6a^o?K%s=6uN5z-xLs#o?u}J)2}bySsS^nJ%kVZ(f`WeO*F>k9qzek&c%j*M
zYy$7V;xCwfXhAK)t-U@|7po0cFxwa&Go~n75Lu+f@E`)JjpRk4i`H_F#|AJii=w?*
z$ZQ5m0eHVWdEr0uN=V0TIc!GvI3EC{><MKbS<@mY=mB(Q<l_%V1++!EoV8@5yp4yw
z`BqBx%E}U3rtUVOn^Tv)>wL6tyeEI(7$flELzY>_W>ekGmr^kIK3<mVWO!@t7+~&#
zW@AD^7u^R&s3!?dgSy^wSpP-S+Q2Af|C;gwNj*PClW|$ufgQwU{wL!80E-sE`>lRs
z%!1nVbA_kJN@K>2!yh#R3p0f9&KyEbuH1s_+FFNVrx3L)V-Bhwznh|B@4d3C;cfJ1
zSla%H97wPDJ}Z3<ep;I7$B(aj)VdP7P2q=3I#-qdQOS(K!E|*x_JRCaktaV&G0RU+
z8)LQY{rT?IT`sMg=6_(fo6F~y0f3!uGa+#ByI3MyLp8R!g4#vu5(=hn=8DNo<6>u5
z%IMg8F=t!G6zb^VD<~pllGQ|wJB}cQ5P+%evZPsXuh2n-FR7pAzX%KT@OtPJI9n>t
z)?6dTZXJzer(+Aa#O!5u(dK5GE1EB3J2oEeO(ieuk59-#Q|h?youoWtyR0uD+VEur
z-wpTyh(vE(Xu;g*`^VHLw*!>Hw;WLBSD__4q-vF-s5ZwRhWQk^A7(<_^)y|ca8DYZ
zn%^|4Z+hM--rJa;mg#FtroQ4ld3U_PprnAwk^UE&3H$Sc<hlb>#u%C>&-E+3UA<&o
zVbMaHKQ&~o!5Gx)-_Zv`s$}As*0mU-rGNR4WNv5&+zu7nh)K)a4)W+nH4@K$PjG0^
zU$RPinhIB8yr&hlzM<)Qe&LJ2=a>Wl&(6bC!h#Sz{8@V@yuA@h>U5ll_pHNBi|A_$
z2W1@I6AHY<WeYCfogUL~tv^(X<fL=n6GX-6Y3^xXY@Zy?0|3@;oO48q8`^le<FsW=
z_^?99)3WS#L!}YT`9YDY)FIV3J!5NX>;3cC^CH74SU7=GGAGu2{&Es{rMIq4+^i<1
zS_f98wI8ux4LKS_6<+TAn@XF1yl6kxnEa{5hKU|=i~Z|xzXT|}>OOQMHvgHy#Hy|3
ztC9~t3Rk7m^5dd1no&hbpYr!G16~Tn^pI4$`I$b%jNt_S52D5eUP1UBH2J(>5Yn8`
zoJ)u@d*MDK;9UV~ghq9i;LMiQltm09-!q{p6)!Cd2K(DBa+2OXw)&Z(qi))bKw$MP
z?fF6JBvX*n_poWG!;U8=SZcaWC){y^bKZW8N~1nlFs;#U!~3Dud~w8f&#R$&$aXBa
zoYMS4$yvpq)GqM?T==cL)vTx=k1b2yJ)jK)DGH^SliLUjARA?rT6!VcY{oXb%I00L
z89UFet2``<19~D6*;QfWZ>Ch@Mg^07@r>8XO0Nc3c74-+F@6iG>{M1;;)Jpb)G+HD
z7<PZu=uQylGnp*|p}sHs%O)Yc@#O)`PF`#3@gP+gICQ0eJ-52t9XNQfIG=n{n0z8P
z2rH)Hc@<qm7V9vRxCVispENBm1uYcuN{VCgsnm6u3<JYzfH^*Y?NN){R;e5UgEs1n
zVx^?YhtNv2U`F7{VRF0I=(KnRT^RzM3ZiMF!{O3gydqoI!cBwH;|@*a)YJKL`JMa=
z3k~_o3rP2@ySu>!(+{3;MJlMtuO0QTVt9bOA%C-yi5s329xJZmJpPi_j@>w*v<5gL
zdiv8lsEZfOiR32DG@0hlw#H2C(%bLfrBiSWTu)Teq8798B^o^m98|dy*P5e$*`E(s
z6n~-*35QWtHqG#aWJgRgzTK<?fBt;}V8aj%IIJiy@MACHyse9T5@0e)2sbx)Jk`cy
zIfrXKp_`11r0Siy`2!dcXtNm=cH2_G3)GN%7vi@&pR#9ly;|fpf2P#r>=Zy4Cn<%S
zqm&p=#KltyCKHN9H|EsK7zpbM#ABQG$bLq{Wio||XqQWJgp5CMLI2a&47>n@5Z@#A
z&3v{}&pF1MBE1eN9(W5(uRn(LkM0Eu8x)q@MpO$wD^8~JFa-%x22`h>=1e?#r2Ok4
z>Q4-DfO4fGN6oL{|7VS}I)nO6NBJ8pCWEJ2L7BWDcB;uBYX9i-mie$VSw;JT_20b6
zDsbbLS(W2>dwybLf%UXdzTMBi*};kcRZp|Y%DYV?nk9frp^Rs2#O6nJ&dP{mUaaI_
zwpErjujO9XTF>C}5<#swVY(9vgJ$w8;VWQrfcHJZrKtHNi$g5kP8M(FHh!u^zqmr6
zt~A$X0mS(9|E5)-zzBnQ{CbJ)UnS(l9{-+KKtC$NYFx>t@SiHj|0_$5Az)Dj0`Hqf
z;rr~H6EKVq&?r*2$z`Wz#6o`g61VUeNUM4o24w8Bp)}ZL`}!qVaw9{;DQwU*Dro^B
zC4S~Hk#%MNBgiBAfdP89UpvW<Hzd{zu=i((*0+W2szx|hWoU(vXWQxz)1PL<Qizzk
z#a_lrOlR9SILTSvL87rrDj2HLX|pZ=i9H~#q-)>^g$>2=B`^`F00Rmo9`gJ{GXMM4
zH2{I~#Q@tf2-#@%p9PH<B>1m0?+?(P(gd`@7fI(|Upy;3ek1^b=Cf6s`2gA-sjAxE
z+T#6xP<7T}QEp$nhhcz0q(P)pN)$yphenW48WfQbM7pG5Ko}Y%M5H^UyGvS9TDqjA
zr0ecc&;8x|Jo69d@yzUZzk9E})_1M_Swb_M`DQcG(XC^U^4TO&nPab1&qv?y$vL@L
z{j%$JK0-hH=TxxqAQJMQ0Mmr$SgWHwR2zrAa?lWA_$#iotm}$v#9G$Le}#VJc<jCo
z0XKe^bM^g`v;|(&EC~dmrDqD5*9vNP5~}*O@xR0nzlK`;FlL7^%EaoF-LhHmYmU*v
zFzHziH|jAqXd`c4l+m2ATS2_;^Twvb+8956X}zsG|7EB-*$b`i-O9(@r)tzONtnMI
zX?8=~z8@1AbmFI2i>J8Y&exSR=X_`e0!$Q!cd(oGwi!?ChwsT=*O(vgVdB$5iQC++
zyR*@%W<I!afxBX;!JPniBJhv|X>>5c^`TZutVn~1kSOqd##xSSPeqRXk!8T+=z%kL
z<EeaGZDCKgqk<&MjgIKT0HENfPC9a>I`n0EzNMKcr`}|#M{e7DN_I4E|Ki@$*kD+1
zQyg3??VP6yH2F2Z8PWLe?&rB(N}wu+<Ea6k_oOPK){#v2$djr<xo);Redzhv6*p<!
zE#=n6ii3#6O1h_+3h4^*_Cvd+oioRa{}S%@sNg=K=BIq3B)ov<Y#JwD*CbSc1xx-(
zc|HwaKK85|`6oUy=ktd89;||gcF+e7#Txv&VvHkj$*BZ}AU~>=)P3J@P8p;|>zVMe
z`21|0{XJerM(@&<whbvxPQGBfmEZw{AFg-R^6U>fJxxyRN(*%SfYQaoaql7-!Nt4g
z==&KWIsG58&GkU)Z$#Z-5vmwx_tsh%`OobUgN`T|jim&mBJU^?BAVc}6Yeh3w@8qr
zFWHzvOg!AtN;sJacZtT+eYMT?wb<}?l;#ZE<;{0-wRzenrp<5GE=H``^ok0)tFE{7
z`Dn^s){x#8eioRI`!J9!WVADtrd$5X1a?&Tnf;g=DZd_mkA0Jz5u~Dw14~ivLy2eW
zxzopUUMWua&nAyYPF$DY5^s^kvfT*7m=H4JtPJGN1mz|(E+YS2Wh}5I5*`jvXGXzt
z7(wx`dEDObGI(qhy2dkVAt(^uZtt@c%X4x!Qe#fL2lG?RDDU^{!I5Fj7zF5elmRhq
z=-~2;4Euf<8^f}DwTM~ceaneRme;M}@;WmL-}j_b#_^hQ3cSB(A}-GnD&m#-@SlAc
zAG>@VsdlgphI<{JR_1`D)<%NceN~lwsq%X<*=Kq|cp9I6*I)~rQ>Vz{8BA#!<c&~W
z_-7WEC>(J8@x;6dP{IgbQ0UtGeKW^Sx18xSYH3*+rn{H)e3z$~AH!W+wT<)7w`q9R
z_5!plwVYTq!(48W#lMpozRr!2wKvYp3Lsvf%D;G>lCbkNuY@c0QtD;Rrim$a>vYRF
zWxH(qt?YEq<8#%*p}>I0Y^*(@4cZ4wn`zUxgBtE$wKVcxr%FaeO*?K!(b#+Of?=GL
z<~V&PV|$U#&QT%IXFT9_tf^Q63bUfVBufAVkfBlG1@(^#BH^E5o!k`R92WobzWtzz
z{SP|D5rPo3&wuJ9OtNM!3i}MLaH`H?1^G%N<8M7%IxaC(y2U#7CB=j?{#Hg$hwia!
zy}-*?tla2}bWJL?NzMGN8XPh%J(O}>B16V`JWFtK^rhlbI)>vanku~ss#^}#r-rH*
zS&d_jFdC0N**wEJ9WCy>Q{(b@)Yz?0<0ItfeJiDI%&(77C)h4avFRjFRK&=hA@Ft)
z!jE+GX*QDFvP)*Q#rH6_boaGrSNw0?*K4{bbVS8kr=9P}Q29h=vcpao-%6id??w4+
z@-Mza>3iJj>#4UGw`nhkUFq~3HWi1H28>W{eWPh`bmEFuP06q0cP4rY*yS~=YHeDD
zMp{VS_gnfG$Qmn`Zx2R<8e)aH8lT-OYmg;-P^MoynAlJ9_k1-)LANLldY=jfdh7D$
z_`T=AblQe+XzbHAoHbUb)EyrY4iImAZPm1J&PsMm%AA&0DXXlGzH7SjF$lv!PhO@b
zJlpVVnN@!DQy^<x7&kwf4Z^#Zt#F%+W>|(?Z?(~D>K&T0AlWA?(fae(2Y=3_1Z8*u
z<xxSLQ8eND;yGjCa}6$Ly!K<6L$$)aAGC`)DIZRXivfjWeYE!Zn`kOQ!`xJSQKNPO
zq=Nbr5la0Z1SKY4oZ0zAHf1}axnno(GKmW=RK9B<rP5W7VdvP3*#zRxb)5a2I9Ysh
z5QpVJQ~_Vp>Fa-i+W+p83r2(5L!Cypk=S+(tpG+T&&=Th_JXOd0c#V)zF}ahoM&2?
z)aB#T8#TGL?d_UY<>q*63wP_@eLMOTGp6}TiOR#ip{T;^iaLgcRc}RE{A*KdyVVNy
zWRK*Ezmo4C12!*1zurdMLGE`}`f02RBQM!K)Iv#6kCR16>e2xCQpWg23bf>nS3Fz!
z6WlS|YJ2Z&RV$fvdqP1Sn*_nj$10Nuj7{=a=@x#1<u4LGV-apAi_2wV^A|m+Kq-br
z;0H@}SMTOb7r)w#K{;R4lAs<P`Gg0#P7g6n#9O-+#{aU4^G+iL(+9O>)i)zu&m7Fk
zqs>bTzG@j0IKkVI03)&qDoj8w0Tb7I^vgZdG$&7W(i9?<eUihH1r-nsJ#)lWpwHE3
zQoa8N64yKg9buIjD>z4q$6Z3vK6p|7@lvVLg)_u4dWn{mp!}*Z5vc~!19=y0K`12@
zM9bgCq9_E4Z36`uW|pj27`YWBhkiEQ&k4u#igeXxf0p-bw$b(D@pR8(T>oEL5!@c6
zpEE}7>sNB~k&1{%{b*NLdSUC$UiRLJ!uRCTH!K`R*iz<<ygwe0?oH~LJ!Q?nkd8|W
z!XumcoaQNP{*8A1?6B=&&V}po*<EiT(ms{Q$o-$o%V<Zw*3%1?_goZp*h@AVlK*<P
zz<JZnp7YMz=4$Q7Bq=t!+EEp!rnJ*w%2_M<RaG^#*uk&58A9dpneC!AA}Y}jbLXg|
zGHx!ek|#3oY+GL5+>F{+)51mQqvutQE!&3uqhpiYpo;0$qd>(mOg#o-vbUCROAg#$
zG8q|qL^&({O-uenIadcbKX<90z-bV9)_9`o4sW|u#L?$Gjr^QE)-m4-2g*td61yB*
zf+*@n0Fn-~HkgEez^16Be0x9(d3Kp|hoI_hyMxQY&*lP)8A^fVn!lc*A>t_!Z(Ef=
zEr&TKsN!KOSv(nh@ATrL`?54Kqs#)k+tkf4;wEox5C`8#k?aALe$4q{?|8DDr;=>@
zo#pOlO?2QIOfb{%ia@#1izN2}XlTMe?t;xN^l#f0WgWdkk?<2@)t9%Mf-!hSpQ$x;
z>Zp-ozMICW=jl>-SCb#WKAU;ND|10x<31@e*&Jt4rKH3<StG01YjdQ+y!SIv_0JG5
zjMcx1F=_?)*9NQ8V=8cxxClFH%1A8TJxkwiNY-;Oez6<s{*2UuGE75;#xt*7K0{h*
zFXo9YT$dD-hjpn>Rh)!-kG{K?K>Z*&?(f4d$bo|T-FUUIu5PG$$?hV2KKH7*>H36q
zsPcuCI*C^h?7HG2JI}{>3F|I^8(f?4sORgx%#q)d45T@8j5|5#8hEu<Y1w@8^2R79
zF6zpYw;D`Zw)54V-0`ax;VMH=!0U;8k-FYJSir<_MM~s(cO_~q(viV&iZUu*J3Vmj
z8ReBm0hfBY`>TvBHbwPkk*k4U2j?y|m8KeS;<Lk#;yeFSUG4zY<)r_f+)ru9&=9=t
zXsl~AHHQOJM23!0&K*6+52=8}AA9@sTDD7ntM)MehxoUTu^6*WHx|Ri+p3Tj8YS)g
zbIJodha-i}IIP+=DgogU?E8udtG>l&1U2dW;KJlf+$Vp6FgcO{Qe+C*=f~0<M2>8E
z3g{|NS6V^IF})tAh&l&f)W8f=1Wbn<l(Z?}&Fn|UZCKV+S@x@Z5@zOq|MsJbr$klZ
zk6-T5ZzqOG<AHuO!S^;H*2lQ%gdpzWt~&^1$VNyH{s#7~A>QsJX}}qNPlk|_jvI7i
zFZK|&SvvIARX5`ya=ge*)j6g<()8MP^lsQ*l1IyNY@g>x3;k$Ua&nBy`)HMFzulU_
z<sk|`rdg5r4K4`mG-6g-f(se<PK3oSsdu}inrn^cOByu=rdamp>-9NhD}zV<MXHzb
zz91gH#GYeO{8Q5H3**^2<anJgioNc}x0H;YEp?BB=W~G2)yQBz#d-Tex%|w*S%6{M
z5K+|A&7hTZ+RiJ-gS}OZd#Z&J*wv+vY~lI5+{ZhTsGMbP`})NG!1%<26_SiVuMj7K
zkQ-->1;0+v?C0vSKpM2I{#C;hfhlfB-N5!P^PEwCS?65)qit}+`=(48OZ+)%5s@lj
zVMdB0snKzqNmS=3ce*iVSm`iGJh3CA3clXYTreoACa!)Swn(TEnX}T?MWlH$K3SHp
z`RyZS;X2N=ee(26QW~!(B|Y~(y{xVq%Vjb;eZp8+Z!Y8cbftG|Qj==*3x)?>lqKJy
za~QbmXKV5X{4NNR<|q@-!aFx?iwrlLj7$`}ZrD$tfe=3t6s2#MSye=ZtyRpPdbHUs
z5=M3sJX(19j_Y)|oIFY!-Lb!cM5)N7pG~N;kJglv^Qe%sxhPV_n)nNE+v9;JE#JA$
z46~(Czy2%JEACmQ^qcgjFVht7JgZsw8XAk*jfB6TZCMHb)fF|4N;b-*ms0<K3X8C?
z-i3P><DEM)jAg&1!)Zl}#T5J>y7msO7h8!>mSRraY3jKH%j?lV^LnKfy!4BL;^TGb
zOqzSn3*j9P{GjfG&!cdur-f!OgE)j}56e=Y>Z>E{$}>R)QA}JQi{ZUejkHo`9P#s<
z=QZ4U>bZ-KX1W0VL6keB9~M<^F-ffWkyD4YioA?+O{J9GE(bYjAqW(g{{KgD5#cz<
zz?U6%vU>2!0Hj-Gb8n13%`@yvw<}8Mby)IZ5;ehAXIOAc&X+p31t}<S5=urBWd=_p
zrG&?W4?9^v345=$Eon_E{at0Zd=d^By4GsjrIh4T6O(%i5svp1#t#U|3UL{P9&P~+
zD!ha^Wkt4R(b$NY&=AVgmdAk}#xwo)!PlVBm~)<JrsUSZK+UyD_flVF&9``>VW~N&
zTtJcpxK;bWBuG1jl)iTLIT<xTlN~;D|GhK!GHoiz1n*CN6rTiKTznWb4DQPtf;kx6
zOlKcj6i}CHpnX$4Zg6&+f`s)3@+Vv9PtRbv&e#uwWr-I*$ylCNO|1P_amR-WZ$z)D
zMZq{Dpub~lxSB4A$DiAW>dHX#Gx5*B^?Q>rlf@B@qV_!4PW^#$vuiJ?Xn}D-u-gLn
zvy|fZMFEhC-rsC$`bH==t#fnfnNaOn=&n!i5HJ|1PxC7vSM0w0gJSVvR!x<}ya0ro
zwb_O`Xc`fAo<so6&*GO&o~pt4K*0w^M1uwX;UfD1PmL|6tBG85?3l!d1~ASKB)(01
z9M4p*XT?8o+M2k?dyx1?nnKm6q9K;5*hS#kpZ-AfQWfCGS}F+wM4=-O-x8+1G;p}v
z<2W9;q=EYS3F5&4|BAQcp9iotJh7fxut8PY{W*APy)O2OO}--jsUpoKpz@@nB&FBw
zuhBCf&07E@063sN;8MqWBG#v>xYE+!uxZhA-l9hkE5khwNHq)V_Z4o^ic}Ym{_kdN
zGYTan26ujp@()@Fb9BnGp4yCzbv;0rPW<p%o;gm6vc?}cEQDiF00QtYd{Kub+JQnH
z0EpYEZD=8>gQ-r$zvq-pKnXiy5?d%#@fvZCS~mXG#evEQz);O8cPO?|&LE3S0Qn%G
z!R`2`-bV`Xi+7Tn0SJI<gkD)_of(MnnoLSh{}n|62KFcLeR3fYc;FcY;=HYthVF4Z
ze*D<w0W&i*{Ht&y04%V*sO3p=-C$2&9*ii~tF-7=OTs~YZ9LrEoa5I0=K-HSee%Y5
zGg*2E;8DoeOrd|1_TZfWdK0+ECPF<4Cf(Z?hsSFAwp!K`gb?ZPkN`VyBR&=IECzSO
zzGxP-cG7u;fWH>n$zSpm9Z%bbKYsT>2tN_p>f>!Ctaj%$ARyUZUmi={tK28i2*^`2
z3~iMD8=He*j7<zIE>b$hN)#th5r_@*Ul$@K0vZKhy_*3@q9;ir6o8}4Z#4YUiq3V7
z;aDI&EzRbjkKd@p2lE7=T86TesOm3|rp$hnvx7nS)!47re%lB4_*0AtRWmyPd1)v`
z0^`7QptrXzN=58VYs(oZmu}D2bq1^goQ>#-+)_~ak$+}I`0O3CFd2I1Ur*2`8aC*0
z{S@-;^<#Pq7^tXlUcHA3^c}~s#q-N~=G)1=yI@}mU#NK~yYt`DE}<?p#VCD=Vc3C|
z;K(<4$Wx~t=xz!4W2cLDKjTa#ndp~SgO>nC1xX8lFYSQ3iX>u^{j>xatsO?;qW)lH
zHPT_pi7e>Zs8It}${qmMz1kcr$WhOYd}{uCb9}%;LJfbzO-dYBUA^RK;d4Qw5~3&3
zchuP1uC_iMQ#svFcpvR}@S)_LFmWC)eA3ktT==OY)T^{pZRk^0LDkuj(|^eETEO@v
zL~;J^)l#%d#x~|e(<`F1rEd%+vPSBg4iupHy7@&IE9#g)ofH9JeIeE!ghN4e*!pQP
zz6cPY5hIea=c->qeX+g2xIt`fZ*@~#_mEsaYA#HiHIBWH-f;~n-$ARt;83m1f9Hu)
zYd&5`;|bn=%9H-X*cluWmfLUfHSFBMe>vIfq)QFM!oGRi8<4H$-o`89v^Ez;!nKKE
zvbnDaro$4_6ndX!D!jTl*%MfJPqMW|eDFEO2~APli!?qvbbrHs<A@iPgBiQL68n7Y
ziphIDFze#|_^HLylh(*sW(hki2>?xp`RF`F-;0xR3DQAN#=-D0gtYvA@(Dxsfq`VQ
z&O>i{7CTwHep|#l<$<Hpy56n+UgfnovE%dpehcJAUQ^bWI%}<&4I7E1!<H)@p(~GX
zKT2melASY6qB)i<O80k)wu=_DCXXDy?QfNO{<5xI?yRy*zO}tQFUFK_oaMi_jGo~o
z4Tply5o5|Lb~lD02oWJV60KHi%KPf|aFz`AvhOj*x#Q-olLX=MeCr)=BMhT{%jTKa
zJ0aX|KXuCPzrdKf;2W|a{Fq%$<23PVto+Szu_9{OQ}LQd_kDW!^?t+eXXls4a|QOR
z{r$d!de#jlIyyS9vNQ2y;7~&q`r}=aju9JVa&J8U02Ur)^spY&^@m9}U#St<aN2Ew
zdj~}-e%IOcBmA8uL-WyZw~MF5E7Iw1TNiM7{peBuVl7-{1-A~_wA|jp@w;WZz|SQ-
zH6mJ>eNbaN<Pd){s1s(yROVzaJRh_$=D7=9z3mvbzF<-I`iJ)UX3g7#oiio~BR%Sa
zNfcq9PA{a(3h`vx-E=M+m-{s17Ot}z)hBk%DFm_q`RVd7dtrS#|M8L3Rn#@_RK>S^
z38}J$0jA4g$Q*&yTPN%UwsCzrfwAMcxXZJ*4xLJS4FXoze&qTO#2(Dpe8sB-7+GP!
z>Pn>d`-&vs7c=Mv^G+;BubU0x;iFlDflv|^k)apS&yqKPDwp074|mwFziyW*XvaL(
zi@7DIcPiAEm_#WzbC4J_n_am*8)q^+S6|j#YWbX}Z>*ovI_5LAIoU>~LuKM#KU&#L
z9cP*5BjJnwfqbX)gM=Ls*-d?w3C~rmgqgdZ=Oda*Efo?h|2;b)cy=@a-8+y*rU%4Q
zs<d7sA+~&DMd6Jn=`;FNGZ=63^-Pbx9wT;QfAicFn-jL|_?@H5Q0%_3+%SG_hU1G5
zK%8GI4O$>n0yZg@G8d63B2k>I69Al4eza9I2J>snq^h1R`zwUTwvDLfS}AuYxU+G6
zxoWF1G=J7Mc01<y*~%B~uWA@!>v#uO-pLUBEyBv(sn|qb$MDOlgBp(|bG?UeLJk^+
zDEKLdd#v$<k4uPGID~`8`%dp&n_qJMdlVN~7r@6NuZYl-Ndzu$mTjIdYh*mVtg_}m
zC*6C*{ish_gdNR6r?6yXTNsKA@pe-#!Y@n|<~rdM)9XQd4~XT(rw6OT!%sB550L1%
zo1<S$-%Z;h?*XWzskHiYq5v+0wh-@P{BT=FqvNRLDtwec1m_{#caFQ19NTjBsISs-
zRJ{NA!pmuU;&rf>`#0_$^<rH)Xu_<X7FW+?{+R^ps{6pDA3~c#JG%1CLm$;|e>b3X
z3L2!PNF8%4MFY&%n<w+1!d{cVO0c)m_(H3LXgZ~A5F1omGM28EBNXkdwB&s|*^}Rg
zXmPWwmeZ~m*e%dhdKV}r;oSl|2k`c8lh;2Kxf+ZUEEfQAxs&-3-ZsEHmjNDXU=%C>
zP6X@9C8E~v-apsQ>_8f5l9tafSbPgQ?f+~@ap!?_rsiP5N2QZNCyVDs6;p{u6q(-y
zr?!4nwm+`3D`WCG`e8$2H_R1$Ldy+|1?s~V01x!hG*5&Dc*CTh82rXYyqJckUF>-C
zz2<6oX7kMJI#fqT6SH!X3#gm(G#6ETOiW(^LU?@*7>0LQsqXvTtlR6^$(6YGtDCP5
zlCN)@zWosB4SO%=tsHuxRtvCVb5=DgA0s3rzbXofR+`Ujc1z-xx-YW!^WB8$)=TH6
zOmSWC4PK3JJ3Q_`>*>R<V#02v(D!~QzYdIF;ysQL@f7cS6$&Uz2R^+c_EJs2(jg{@
zPOx7@vN9Ta24D7lu7Zirgnr^anytf_dW7}{Y1BG5o~@SzxHx>C7tXBO3$~p!rcAOS
z#emuMgTgf7RXo4-5jz=~9<-H-Fh3hBk@DuxZ*SrNZ<<bDQfzL6uGh14zle+GhEnY`
zX>>=HNtxy02wlEmx-ae)dZgDXP5k)e++<lsv>e)Zk141Arob!mKl4e55P>dFclk<P
zg9#-Kgew<?KKiQSkN-i@jj<mqD`@4x$0;IPaVDmVOOHkq@WQtN+`3r&>f~#`&?YC<
zqt_t^-dLQ}Q}*QpdpReJ5lHU@N*5dH%K-o(w>(-yTJ$Ie3_-uxnvs8rqfun?N(*lE
z7}bfc8<)MZ+`m>qc2(Dfzgx60j^;2exPQ)N=($!KksS4Ja9xV<{?K}n-^pMhl|xHQ
zd&N$>@-1}AqI|HC-=cV|!a~c+epfeJT#1wcHg1wL#{&<GB)mfv0Vv?j8>P#JmIA)J
z_DWCx7G#UAUu<RT*yC0PD+WB`Gztu}<jZm2x78o-vFj%C-|aXO7dZZ|8T<Jo)`!+W
zYYboS_t(vUvJY=6!Y;H1E$z56hK*Zd2Yw8^b1GI+e68Ev>#;ScC#PrfgzhfMznN@>
zvHg^N^}bvmDorl}N!V~aG%ve919_B5T27uNP;vh2Avm3yHRY(^IIX^Tef~>tP+ylE
z3jx^IdfPuL1Yc}PT3p?Gf8(K;i_P=^#^9HTjsi5FUUFy<U`+Km>vIB(Ql4gkc~CCV
ziw678<EPhd9k0V`JDn`&eBxAk^n!8e%4=ze-V43A%^Tm|)Y482Bi9QoC6lxu)2@HB
zZgGBo`qpwgBsmdNLyyih<n(El!Zzf1SmEE!vcc)9S%@6W26H_GbKmtexSmEx`(~b8
zxxEg*`Dm)LFJD39iSX3amU>9$PxazbZQ=>PHnbnVkoSFDB~+w*Q&I=Z1}WuPAnWqp
zYP^XAqdNY%IV&S<x@6XRb-kX~)&zRO_6bp{p<nBCf^mN#mr|7xIM}$%lom*@Mwjla
z&^6OKCyR<_F-H|YSMj-zU<>NS|LuBMtak#{LgaTRV6RefICTv9evy~G?(5a9z;BpQ
z<oYb?o65=~#A8+4PFTaW+jP|t+>FI6ayu<*$u7WmneIK|blVlKDEd8rW5x)4=-Np2
z4MPV-C(J3G0$twszvI5pr+O2GTI!sJKe}mhurC<4lzM2QqJ8LlgU+?uWCfykdaR|H
zs=o<A#`K=ru(|pUu9}BQ8C}y7=ycn~q;=;V5&c;LyMl;CF|MW{a6%B`Lg+&IjBbV&
zVnT^8u~O-tcXkw1#o@ITlM3<~N{CPM3~F@H`pzRty`=0J)ZZU;Pf$O<Pl8at<82`B
zZ)L!TO9_d2KOy{PsPXo%tVtC(<BcWN=*Qtn)(#efk8OW@h$B}f=2LHZh_}|<9}zu@
zjF8hm+C&P5g?<#JE8)`;yW1%%udB=O=d4n_jV{Q4gDK+><$RD#)C_N>6Z9#>;{Wb|
zRURH~zYagBp>(>VpXX%iuBa0UOKIIQ$-x_;F`SQjoAQP*Np{Nsyw+UFW=Hd03IEBi
z!Q7Ul$4IFy7ObK#oVpO)Cbv;O&XaEn8u?bpqjSRK(N+cX<HE$s-MaC7=WcGzFEcd~
z$CDLC@k(A_n#u~-b>3+=EZxVeO}rK4jk4nb@M>U#V6oAaU?;(p=S`trdX3XtCnQqG
zZ3e|d?ZFU)?7+$c=Qugo1JxcPRqVC9Ym#gHYfSRPT@at%GN_Lr?=J}}DhX<ziuN)=
zmDG^F>x3(YGS|+RuQNH+@q}0JUSE8#EYJyAe>HG(lEmmVL&fr2kPGaIZVP3CgX6@T
zw^IK$0ZaxEeI!2&*#V2a3wD9NLEWQ)n+=!CO&FVemfXUHMr+&CyjwH2Z@7-dJvx$`
zwW$}G0~!QRIAUU4gm~~5F_v!2TQ{Rid~|>~m_Nt2mDf-*BWs&uL#Cx5Y_mk}BUN;W
zaEvV{z1Ou(1#-c*kVwB#ZrHxPtOa=(L_r^|d8{4gjcc{}TE#(gYr3|jv9{D|H`MN)
z$xax{3+wI*>6gHlMI9~lT!=-N>rpT8N*M{!5oC9JJ<o`C6!uEX`w*$`5<~=l3l;1U
zBg8+%m8HHO#3j6YND?s1?o{(Ziki^)F#};<wk*c-Yc%Oje@=hsSYIM(Kql#{&wAsW
zB{5zX1Ke-<t2{oJ7-`$+@w3iP!x}Xe|NC7#NbS?mgt#L@bT1-jhFz@tW~q}T={tC@
z&nsZHLZ%D5t9FeWiVB3Ef}fXe_R<i(jM+b@B78X`hSqSy5^O+1Ylz?BNWk%)N4)3g
z$IglFqQc<ni><vO>fts`3svv0RJpXC*uw8oF6*BMbpnecfJQNs8FIxmSe#06M^Ov%
zO|_Vc!RdSl<KgbwFyX2t^$lCTAl`^4P(vlSa;P?mlB(;<+Z|K&T0Kq8kCrpFyfeYM
z`{0-lfGZFU7Dq?t-8a!heS4&&cain_ycuk$y9C{kD-4s+mwya3NL<J%GI{sTO)ob}
zPh#9!1*X?fKp}h$TH=(LCb@L*m5`@D`8&LIFg$rzGBym<n~30so_yZ74R|Q%DoNeg
z+8lP_PP}Lr2a}efk5)@(Af`fFUw=rM_k-063iwf5DZcon0F*oqxVa`${x?fN?N!IN
zvx4@YZ%(m+*jbVG%j==Gh&OEcPBF=KBgEE;tUrHWxQ*fNaLRaUC3xgk2EyK4TYe`C
zSa&WFzJz5I4`em*U6TD4VDC+Vo<>%H6jy~3wD>FxR{7%o?zl8y?LU|;3MHZb%pgY)
zs8TBj4x*pks6|`kfze6HNpTawWK-y~JK$R>Nt7))Z{o6H<(5-ppZTqmqOyVTB%~Y$
zbcRX5$+noO?ZPPi$(#k+{-Nv*5lMtSlcDhsiN6yBLzHMc5x21u%SvX$pC93lP;g*@
zm#d6-iS<1Q4!HaFJ!OEWzj_Y(3T+qG{jzhlyGboDxb`~&^}@ViZOOp+@0sQ^{h5bk
z*hOG#62`-;jswC(nq+ts!_kI;w|v|+a5NX1gpOqb>i{M_E-uccXG2I(Kx74=Ir)y3
zQOH``lc0a^%oDYw==3r<QN#6QX{Jw)mBLGXrAia9C-7QIOKOE)=Zqvmjp=f8V*E~q
z(Gmbg-t1|1w6ozO!=g2Lg5U8|8+7^^+^$gl`=-pm_pwomZ{PcCZXX~_v^_}F32JdU
z@I*2DkY^I-@Y3H*^x3j!qqz#CKFuq!!{Jmv?1Tkx+mwTgtd-7ti-W>up?_bEI%;q?
z;+=>QX(XTcEI(ULRTiw_uwxR*<3Tn*^hu~n<q)*3LutKl8GDU&WOE+kbme|HbIc)@
zmiLyB5q8{AU(*K?YMI?i&!q59y-~&6FWq-ZKk=a)Xao$rcOr%dmiHktcZva-yY6sI
zZ#ymcekt)CQqPnekqWOKqKP1Car}oYaw=x{ZM4+X9j&x9&+jXWLhQ^l^b^@TA2Qn<
zj%tyAq6Cl{4vkXK>gNaGb5}#EM*q4Jh&2!x5z+8(-vKLOqQ*X3bgzxXX&!<713+1=
z5+v_4F)=MVl>5Rma1e=Xop%W_!jf>5d`=u5;2BzkRYvrlqYOG0;}jl)r@4X_JR(yZ
z0rW%^v^G>Z#6S~}NzWIMbXx&T0``^vUMMYU&agEy*&(Mecc6-m0a;*ks;bB~NZ`j~
zR@<>BVA-R=+|g)oDv*uu1-|r1uNeh=GoEEqjZj;%GS*VY+Zm0sBb9?}1$NI;S-FQ^
zAW?0CNK|>Q?fwOPgu-AuwuNOrBStj9dXvTH`o+#IvT-*?a@Fj|z)j@mGtcbJ)`DTn
zgeOFE%Kks#OsQ%qbSW=WhEwK?q)Q|}J^&NQe6s>S+8laYo9Eg_tz)SgNRyWP%iT@@
zeL4k-ET0u<_Z0ovkvGAK{VXdldPtqIIi9@Vav=Wm({x!uoq<cfY|z$0GgBc(og;Fe
z0}{x*OOEe^-*!SZAVpo<s!xsD5NtTzpm0<<v~7QE=vf3D_>Wt?j|6+}8&G=+0XR4n
zw{gIX^HOMkadcU~&^Mq$FIc6i{5b*<w~4{#^?wi(4U1^5+iBbs;KjnHHUn%<f*UI#
zkNRP?&vC&?uuo_ZG4w6ffAB%TFm645pHHEGk3{JSG1xPW?lG<)hYEPiiNua;gDnEg
z6~MgpUjue&g?@wUBxpyI;JpZtsS0J0Uj+UbO?NP}%uV>wzB~-OfUheorF3;z<Tcsq
zhnx6p<yXPNttbg;712ZW<Ch-5V<1&Fpx~>)$s;&KDo7>hn25E-X&xbrOYVUEW#PiX
zn@qSiQIb9b5Uoe2tI2v`D;z|-f~FTv{er6kYAyNznd>sPPWSh5(}1l?U%MIOGk^oA
zz8xs$XRl9d6;HW=GTwbtQped7Gx-~Dx}@n1At4=ZTqA@Vczf5_DVUSVTayfTDbWRa
zgE~~)g1tonm|nAzQzf&$!&wgmZZ`Ths0JZnB+NUv&Q$(d2{xz#sfp?KT?nIra@DYD
zvd38Kx}ie09Kh`#7#eYfAN2RvJ-K#4qHSU?<(MQf7L(PSa<$D;kxTMRZnf;ut;wvt
zGGk%JO-4&@6M(Mq<*<|!Q^N54cn)yrnZNUKCBHQ~r%CFP+R!E<G`LhR=o@CXyQ@>0
z@yPuU(EKZ3ef8G?NbqgYBO^qFVf(LZSWFHU+L2@$CL<vQy3k$Ei#@r`@0r2(^y{me
z69t)9%7+EkrJ5&#mjFt-^aTvAuqvNJOZV&OEQAX^R@`ku)%}}hy{lt(m6CgS{zy7=
zG0Azu=YqnMW#3bcL9)RqLb_frU*l99sJ;2k=AR+9jM=hgR<?Rk_Ae$X3Y#|}Q+F%X
zO52ERig06&3N$k|A#RTDph=q3R!OHWXzN9T-RU^~BDUIRP5#eH$pZUaemF>@gjjJt
zLJGfR&hxx)J!TKUnEgahI_eITT(8P1(O%6JG@GNCn?>Qx`7D1dL+Ksqiltc6$ZVO4
zEvNXhzdh)begu2|`gNB5unrAA{xL)AWr!Y2v`slw)<|x{EWlQ<e_$;8WO*>AB*KCK
zyyl}~`7{2i7Yn8W<9<Mh7jD2B5IvsGSe@abFqnzv(BODIj@hwv0(kISpNzt6{VO%|
z)Vl$TV1Yb=1H@F|81n<im~Bg*^rd`?Y$#=aA4|69lKWB1)>awDSCs@>^ffZoG-({U
zt*oASmd^=PEnRnBQ*3$<pNIi4ztdhj`4s4~m!KwRm3W7RRY-vdb`jvAg24Bl7(eIC
zdBj6Y3H5+%`(K#bnLC&NoZ-tV-faE7KD-^uKU)l>5AoI7g(1dFjmqb%SViMjG&PR5
zEG5<jP|eZu0bu<I0Q{#o)N>2~Ej*Y{aGMo%kTtT>ulnzJK_G;R5P)5-T$(WZo)HMT
zC33068qVI_%W7|+u%n^eO4{gm8889Hu~e2(<qKG5C=J66$o*TOWuDz^#?X38t?g#N
z$2UrsjUP-1=v8aUiEbTH+X9FCoiidHvRVRH>>uc-?aL#(W=dtEwo@Nl3vuT>juT0W
z5aWSDjvE3EFcIDS@^7~`13?|ub3s;Zv;0C_$IY<{kmLI<;JD?onrMpZQB)F}{qI1q
z113ef|A|`wTouxEb@m0-vk28MVreisS1nuUt2+2&+rn!xkX5KG!)6VGhNk{uno+v@
z>3+ZKEDz}3&|O0mZMArd&w6&+)@nCkr}4b-o_29Fs$pVwqqeo31#{tjLx+qSdE`R@
z0d>FOZf>)mumC9y+V}eYd{XP+HDtj1N?)+A)i%HV`nIUyXPcQ_@GwYmnNB2}H;FAN
zHWx4**Y>ICcN*Eg07;wNM~)WR%8B`J#|y*Cmb#e2L#z?ewSUe62CzlH7eCX;Xd}_C
zb;wNwDp9}h4(Q)taQ>^Vh^b@2AY}-&@E8_g_T!hk$)O-83%WkoQB&1R{Al#3OP~rY
zs%;4CN@a5ounATEg3DDusPDXhx!Om$se+FmtE}L0U&_kbUx3GNxQp9$E^2Uw&{WF>
zOBW+ExI6a)Bd^YcBjVv_AebA2RvJBOX_^JP6iD7KVyR|Ft7XrJEt>*Gt0mR<fgIl<
zcKKUP7^Eojfdb#5FN96qJmmN{hDV^}1)}@xmaGs4*dTigh1#uqy2~NHuY2*vDS6Bc
z!1&9C(|x6$Ior4W>a3?&)T8^nIWa#-+V^QOQgN0wd&P)mCFw_O%lQePC1_|c2Ab>$
z@<(ckmI&;-GTU*r#i>sOr*k=oI|xggb}kJXvS}zOhmwA&@hQXwr27&otLm1Y1IfwR
z>eI;a&)+i@S9J=#7Uk&QiSE@Zbz&7Q)h8W%ZaI|UJsJaC^72PtfZLF0EPk=;TcU;h
zx28nEntI>)ST7JR4?!iCmzt|V!fpHFKu|(Tf9YfzWU`hymz4<XO1_^Y@;~0P>qV|l
z6O0H7uPL_Tp!#IJ?)~)Pk#xY)lny`tj6)Rg_WRe{Tbiv)qgjBhzvlW)>cHT@<Z#yk
z<2U1P7!QU5S(F-+hB=44nC}bn6N9NTQx&6u(2EXds%Xv<sJ4pLi=kYJJ=cDd67G&E
z=goqut(VtlvBSYeTN5Q6)ztrHz7@>;QvoJR1_%R=$kBvRzwfWy?4%6`D*T#Pg=D{!
z2<P^zrNFe9T4i~s)fn7a-+YCnd7bcqzN!ZFqFR<mrBt>---<FotY4j#o>PlzRyIB#
zwQy`V6<^~Buoas1pZL1O-}U9=Ty6=3q5*eNHC{=-uY5ly7Jjt}p|dJ#IMSV}i4q^E
z?|6%Cs<5D*hO+o>uZs((H~E|Df=rYrh7Ul})FDAHWL!p$JexK4ujfzt1n7T0r)CAP
z_sqMXahZVi{_&KxxEpZrm|X3bfC)mxS$j-luSQ@G2fZg2SPWp=|6SSbwHx=yl#+%X
z)%NRAyu#JtnYal~h-`muIs+EHi5BOET0o1Omv_A(FqN4shS3Dasi(k$EoPAN7a$mH
zDa3m&yNcz8*%K+{-H)fW*MpwT1k(BqFF+d@Cm<gStMz?LCFA;`par7v7pNq_ecoI<
z=4iBR7ZdNn-rgtYpm$3FNZ3845ufFkdupN}JPG_&eKyQ~*RSy^P88sf?c_abP&H()
zXbJDyW+zNPoxw@OboB3#x+#B#Q%<Ge4>sP-4IF`=pe$hvc<0<$nOT_j7S)qxlckx%
z2LqQclh+^5qjYu<h@sBOZBU5t?zOt@l4GW6pauuZw_&41OyCi7>WY<D!7^^5&w6uL
zFMsBJyLCAJl3$sILy-#-qPx=}gQLNH+~V87pX7PAt{VTw#hl~>cT$|(97`0t0wLzZ
zA{8?Hlvp8pD0hhJGlnRZ37p#VJ1p(1jFc)GgN7w&VKLU|aZcSS`)UPTO_F&PZ|3ot
zuo@)(w>zk;9sIKr9LIDz=9m60#d&ai!}?_uW$4+^#Rj>Xsj{6sKwyv%@Kb_+q&1qQ
zne8Ou^#{?nK5=ykVFjR>+w9rV%6qC(9+n5>CPp{Ium+`6EzO`jRJB+^t0!oJyl4V5
zp=DOQ=F;LC&Meuk&I#X5zNAVab&K>HEP6jZRNkS*q*ICgtRV<ueIO0=18G3R^^MYe
zlH0Em;7?D2<}YunrtK7UV@G%VjGo|Cy_5C46%rv1dfaq&kAiQy_;ToVMywr{w%1lf
zU|W5|%}NBP4{7ClgfKL*Gj9trzL7CpZEM6TzXzJ=tUlT6;jB|0xtLsiqQna}e3SNd
zwjcsD@SFl&(~LmTNf@eO!yA2FvOg|i8WZ6~g|Wk%H(W&WJm^l}ep0FVhnU{p8_HCS
zwHnNLOyVBe02=PL0J=DxsnxkZ9+fO9oad<KY0w@^^D8+}9)#oi33PPQ1sr{!!h_b<
z$ud(aL!Vtle3@7N`!T8b)P`*RW=Wk9Xpg~w{ve*Jkr~r((%%B*u%Hf~_K_1VV%OW>
z2cud8S$@~5Iy*7W`6z32=)gv=icKjJ8Mv>B=B<5?8L0TdtwW7G>E-3`jxAnI<T2XN
z3wd}yp7@ySS+wA_(w3KM`4YpCe463u`OlK+s?sle9^}l5){vXk&+ljv>8w(v>=SzF
z>0i)1=(=<8*aLG9q1}%M4=5|~s79;BF~_h(>icwgVUdQR^oz{as>jQYsk_tCO_(&M
zWDHjyC#2Yp4=r%7O!b%#xDY*TuIvH$8bXD*+%L#)q@_CoN9BzS{c$^Y!Hule6CX5d
z!&7*3u0o#VR<<6^qKr7nqM!J!Cfix#QUbPN4~owO-QSbZyk1hf!sRBd2e0YH)Ombe
z;cBV;3`MuC4NmFXs?Oi0k_|pHBP+)@oE^2tAY8O~5#cGm#ulKQ^$w`q(rlkF?ahl<
zJDDO}&7n-1yQR`#B>gf&W8dS3unl!+{Z6iN23rV=-n@j1!?~}C*+SE<sq*kVAfdiK
zaiBGGjv0tX;XjDy;~&StGKA2Nl^k!~kbD>|K(}uu<7j1b9ZWBzGK+jI$$d?Df+0gg
z(DzMax%;Gl1zU29N(^t`r-r}e;<9el$IZ%_!OOGzE0T@Pk;j46;l2Yc50)$88EqOE
z;O|a5L0)PRnX9F%qZxaKkK=xfriBsEuol;&A17#bZf1jzWsx20goz)2@#4QU5o2z)
z>!Bq3j|Fh%g}xXZ+fV3b^TcOA9&{xST=N3*K{<=+2zFDse|oD+653o&nU)M@Km$jf
z6t!$~e5)PdoD<n0|18ZLYTUi7Nd8!EB6{?zj+6PGS;Aupedob>B~H5e2^kmBa!SJ;
zyrac=jPIHO1n#A6!~$r@I*c>&HvYXPNq5*fWc0nZT)32-Ihy26PNH$Y+s2QKi7oVg
z`HaB{;4S?+A0JNGaeh1vLEG`L@;-TN+L^3sHSEa1kLLlhTJmlX8IN5>H#kR1HqI_q
z_=`vDUGsRa$a09=KEd4Y_9Q))ISsA)dZi^b%rI-5rzlR}We1~1T3`E%`RWhoe6HuJ
z4_whR7kYkj%bG?S8=Hp4!P&z1HyjFkT>4YsS1=*+OYsZC7anMbKH6HP6NCZBjtqP?
zHxqAUQ~yE}g|2w1;x)vMoy5WMNR#@Ho&-qEXTb~CYv=#uR=l}jq$@20<lgG?DkSqe
zu?>$V+<eI$g#?~=Vl7#cw%3%<9jz)s=!6QtU9|zn!w$B4+*!5Vm$I2EsLI{f5NN_h
zJl-M!{xM(iIF_5E4V8`|&N7{6DKRqnMg?Lm<Pw|mW&@P$)@{Q0t{V)51Dlcz$ou;n
z9r6n}Ywy;M7_+mW7t$#VHx!Y#cLL8OhUq^S;-cf9fuMAr1o$^68EM1SpBxN!4f|=S
z-2+n6eZ%nXFHzN-c`k(%fwsQb;Yks0QQk$|L|szeVWIpCSF*0~ZU@@kiLJPWj$0E!
zI}UcmomjCY+1{MC(0yTE&H2G_DFP`6!$Atg?AwXiyr~+5<DwV^{>DQj0iBKvXVdSS
zR?8-StFApadwo*(Mv=${W&@-4lCs$0@_Q{&dmqgQxL?iU&@`9b0|J^D9x~V<qQ!RW
zS0860<wgIBdPc{koMe4V{VhtoZs`<yNY$%5UU%O~P&-9DNw@b+WAf#%E>!cC%By~U
z%M0Qs$h*Jl9X9zbl`5r!5J@VEIYd+ni<D8O9~<}UvL`&HWi^T=Pa#rm>UuX$%Q#Oc
zK<V|;Z(_4tn1|rIBRV0fyx@u+ghk>Tf-{QUR61G?4!iIAuHSsP>X^K9VrX{TfuETK
z(F)sr<s?#0Af0{Nlw01vra{)3>Dx}B)Ypa*CE274hhF(a%G1f>?OL>2A|b*b_zln!
z{?427w<yq$KQ|Ag>60@G+b!!o^0JUJm%x_z#sCCl#d)4Bc~*+VbXLh=h8N^$|F@zd
zR~qDHc4Z$@9=3S8vt5uH?u46t9~8S<c5*$kUKI_$usVU0=Nb{zb*0h6vt_N#aN3T1
z39{b7S5nNjZ-Ok#vQX}9+p$!dQt#?H*<o!&0AGW((~G&V0cmQLPI<5X*!e1HB-YaT
zR{Uihm;-v$9H2h)*pSY&h^<y}8pC`%FhYg-dt1%Or$R!;>nilLAe%n&<5kP;E|1Ng
zozCSs`Ri;RL{FEfZ|{BYllaI$cs}ew5<qj3qz08JsIULgc2Uueb39zi*c!9#VctD@
zly2_*qGw$4Ahp+wcu<U8f}RL`FTJi2dZMXA^ZqLn>b6;$jYJF>yHz;-b^h>yT0M17
zBR`H!koHw$BXU{f;57}q*1z!*K7zQzE-?(<Az^AUf$*oW=YEMUOx*%FrN0>?Zf47R
z`rws#m*rX@PK1qO4}w!7BRCIXJTf1vBtohf<TN%$U*;`4vvKzv`AAufYGx=DuqXfb
z&6tP}6G-w)z4QsKx&XK*gnIY_w{244_HEUVXVCijkuH@FMr+toTkLH|!BUikenG^m
z7<J!>pUituhaZlMwRDayT*%D}lEyVvR59ph|2M<nWRso0AB`k;b2|Qrnz3>TNouj?
zs*>8k5VFy>>?Ij6>KoK@1N6~W57auWsxk5)-StY|C2G@1CF@o78bxeK{jH&rmN{w~
zn!hd>N(1I$9K<wv718)OV%90^X}fC%8v!F-ec;yihYL2U1rmA8Y}u>t(H)SZ=}&lj
z@2b!#A)^Q&l{Pc5k<jBmezVoE#V+?o!CUGX2pKJ*$h@w9nHo6syEj}h`ju!obzU;o
z%2T3xIdA;38baTZu_OJX=yZDMv!UD%O}7&kqaF%)B2Uo1B3Bm%JtKOBq4y;;wZ(lK
zqVBA;u|g;MJqY%W-({=x<Qx8%A^br+fqf?)#}YYlPn!j@$2drVbAYwq;7kKw?|2V?
zkd1|O9h8Xa>6ldae1Ae^+-!{BBn*wb6N~1hbq&8uuOyr_UAwzEB$yC7{I%4wy1$4}
z-RfV?Pu>yY{^eA#4Ht;>`nOla@4$8ukxf;@YK&D^J;oHIv0weCoyorKiqa%dp1v)i
zxmA2u2)VFVC4jX7>yDH%K*Rs$TFoMe9Dgd)%%Do+B{)Sq<wx63eEByi3|w(}b_fAe
zRei3zP^Mi$Zx%N~w@bvd{1bsmmYu|RwjEDf^W~z_G3{NJatuRZ0oo17n;XyI@)0GX
z6R^G1QfZ&C+^+#}Hi%=1A#F6KqwoKg2fL*A(x8jfHAaTZtXjtR@67RAciQG?6W^9D
z6YYLcV-nx<*WSHZPO-QXN$|tn{)-b4aAvp|Jh*qLSC`mah+wgBHH)s<w|c2dUuPaq
z{f)Z7CS(q8P8Ill%M88TdcQQ4=^Vs1^XHh0QzgFcJ@j$yPKlJkA8^F9vo?35<SEMT
zQQs^vC8{a*SUzz(d;obbaC^X`L+q3h|IOXUL~J(@7{v$X0S!T6oC=aZK+N<nt=9-O
zXFT;IoO@Oox|0f&j`z$q^kX&x%>n`a_&OILr$I!!6f_Z1B0D|7ba(F(Kj#K>Fb$$8
zf;5rJ(l9my$nc+tQq+9=@q`4|w*IAxQ1&AP&c584cyAsOzM?d_M~B{cXO(Jy^HS^g
z`;Xenp=&m=9aiI#W%O6drzwx1c^KV6vMG0+Scgr6KZYFLLR1sa7n9<vq^3uG)%f2$
z(=zUg?6*?&V2ebvnzT1Tds5fzQyo<;uMGqAx>rU5@O1k*92tHKw;kD2@lgLD00)XU
zLMmH=ivEZ>I?y{7$3g0+z<WDdb@IsnJIBu6Xmeo)9T#fyU(rNV8G3xAyb;jTI(ID*
zLs|MyYD;_>%BGR|W>5jYlBx4hDUMnH+h65xi6LeuqE|(bS3r^`gFO#<Vi%;1MOFCm
z28AZ=e;L|D90$pT+j9m&>J;GTVqv@J9|(lqZK7pVKuTb3Ql{2acndD~kk0vJ>hdO7
z!#7x(HwlbBJ<xtT>BB#&OSDUh*o>e`+9YA<SV7fB{k6@^2)GorJ!V_sHOeqMhr;g}
z2Q&@1M7+*Idk+Iz3k=b2b1){qi5JjsBz6*ltcADJPtHPVI~eV14R$ni86iomHmr7C
zb34Zo3HC`v_f7sL&**1hyAmIk+9o~D${p?|^h9Yw3uWIp4-0S>D3Hyyf_3n+@k7Qe
zA(c3u?JbV0ut|>wY<1#DOc+l~Ctm<Qy`UH5$#4Ec*iD&_E@5S>ANzMkzKE~<O`+2J
z;e<%fx)A+}Z8cFHyOi*@HSk2-uHL5iA{E?t){{%bTU)(T%@wvwUyfxsbxzRcEVuE&
z_(-2G>gXgo5Y9p3j32jp+Q$$R^y*^|Z*G0?pUKF$iMZ-w=W4Xhd=eJ(w24H35N)Et
zi#mg<qrnT$&U~w{?TexSOR~gsK|kiHR~>N%u}2&cbOjhUl<;>>N_IKye-O<d28Q8X
zWv#FMZ{fCmXr_>lNBm-u_{_i%mJ;L8i*r*psQ%URl$DM*C|AloS7Zy#e~HoOb5b`8
z6NQc<E)1!C_V2bau!mkgGmCk&jbkH!1DUm3|5KIhZ<ZH~7ETS@3g;P@`j(1iEC2Fw
z5L^Ujkb<AU(0k}dL^615zjH5dlCDSwJd8Mb(1v|!n)*|<mMVM5I1ZWNCZQoVN5mFP
z>;<fGUy{!M4v^c#^E_^kQnm*lK769}I1tW=<IvIKTy|o`NU)J=1Y(Z|R`@jHr6a61
zLO~C1Em|kEBuzG4s<P395_p9Ox4sgr7weV=@7)IWLposfz;RtuAV%(=VThFRw#E|o
zbw)IB=)?`$%FxR~PntXDxapvmK(9H;%)(f)K!&T(<1;K<CvLxEWlP+;PwH<{$Cr2`
zvoruM1{vKGH<e$k>D7t+-~N?hesJ^rfW^+4Xn5!s<2)v$A_50rDGodH{0WJjc}k>*
zMZwb}etN876Dl#<jT@O~4!I&o3ZwS~9(TZ6&v=BWWt$dZ!`(pTt^bWPjoF6iXh~_0
zAmZnXaN;zl18T(E2&SvT^=d^qpxV*A<vru(Z7Wofif6d`wN6@nTVa&%vVF@g!Cxf~
zc7lMbk=o&ulL^fdU}{qQw{gdy0-A2P<!E_Cn{6>kFlNv?i~5{Az`C43!BEW5!%UFZ
zP>HAE`xJS*3!yH?P<P8GvT>>05#PXJp+`wwv7i><LV6yJ)QPuEUs3KXcb?<Rzr9Nu
zgO*59zqb<KC>8Yi(-Ap30r9*PaU#_cy^!#;$>kCGl^=@kvv=nsswCV=p0Hy3+Rfr<
zAnL7NGWa@JmG8OA#)uOIV4w@}u1%*z`S#=gZ+S656<%G26I#1+u^H<X+(M|cJb=cn
z?uckG4Smb~(9_hoyxPE1YQxKeyXex4%tEHrMvxcaa<{vSt+w~Ft1w2;L#RKtF0eZ&
z+g1O|&Vv~S1*-M7%XrLnT;HVW6LD;AjJObDgpBz=133cun9mBK?yu?N9gJ0cCXTF;
z&NOEDF73Ak94O1t^VVaYu+uR{Fhg!~e#Y$ct=|Zd9{X<{L_CMM`}W08JygwmH>z^~
zF^#AO>;W5t*FpL-`mZeslvRv#4&Q-Yy#;j^3-NPSwmIn(CS2mWGj*Hly$fJeQ4H<L
zG&4bOlYUZbQ})YqVOHt!|2M(T{&3c1V{9Ipyqn$9&0NqEv`%bRu?d$Ukkc?M$at3&
zCb=i(f3n+Vzhs|75`A;&Fu+#g7ThH213HZwR79cKgMpXm^vl|i2#4N3r!v9@f@j-C
zMp(C6<!#J3&kCtf)xJ7fm57Vb;%rhgV&~3EqSVi&@mfwN3py2&?&MFTUP>)gWpq2j
z`zChe2kaNBX2gc|YNh5>lzH`~KHtAQWHS;vk$L$p;6;SW@vBn}V<LLQTl!gg{1x%X
ze1g1@&(Vc0^r4i7-D-rUgNrV$C*-m4JE_jPB+QL?X=<6;A_N)bGQhCz*0vw)tFg_l
z8s9K_HuiT3U%=;SNF&MHr!G{rW1d1K3FqV2UeY(F{^Tj?egM8(;zei3d$^;;1zI`Q
zqNA^x|FQq#=+gSk)>3&VDsw~6gIFyNT3wz3kD=+;|6%O!(Gvjo!Kt31aCr!xX%1&F
zS7Nn8fb9c664-;w*jU$<B5%%P-&<1dDY`g^$UJySDZYHboA3|~9|bw+_fKd=I`y$j
zf>Lt-bbL6}U1t7f^v9M(776OljG*T10Tl#RFIayc{Q+2BXvQ>*g-`{i!!M~igah%R
zgmqoxenJ14+Q0t;TLKy~!^Z5!fJ_qdKeM=tiMV{KfTj8zpLbU#P(z*w7LG>aaX_C$
zSCQ)cDS>h?C66cSF+B!$6IxnX0)gffL!+n_zKoG*G0CtC+vmiJaM;NwxK8w?Tfi!T
z+iK5$eo_IBjcx-4p|zNBmg8d$0iuAg<~I|e{Ql`_Q=-fIsAX&Lb~dC3B}MOEoT4uZ
zw!uet;7G{@$^q<-eQlgKy8AY{Z-j{^q`Ze)mESw#>sOhUH2yo*;jTpUQxyaB`n!e7
zMQOP_jD%_sclsrlR_uzj&k2+=cXBS(J~Th=`rS#06yJY-MlE!UqxW{56oHA~)9<cC
zY@g74ua*6ruS!^>|0YR53=oFaX|SKQH(7o+?glfY<c=y;x2;)^2pdUI$MQ&k|L3E}
z{E!0G|3lbYhei2y@8W=z($Wkm(gIS$(50l5NVhZ$UD72C(vs33-6h>pN=pwzNq2Yt
z9zO5){hjx`*E!eu=iy@Fnf>foYwfk}``%g8bm%*4XB(P@Afi{6_Ux|~k&d!UQC|YX
z5t@NyvN#x2O`GBT@6MG4p^$!m*-}M`l^~z`zKHe(z8K2%3#$wsmuJ#|TX-X>fVd78
ze?oN1V#(8eVol-lw3{}xyw#sm&%LQx{BfW5_QfcX7iU17WFILD!`?r@AK+DLx{jk^
zUD_y<^1ok&SwJtAfW1JCFzzzQLWxlK+wUS(GP*KSWFEI9KP155e(l_OZ@-Bd$|czL
z`O&CU(K{f;>fktnbLdTyO5Z>K2Ly^A&`p@d6T~0wt?MScj8{P)CN59-KY?nXYmUX=
z)d^ZOU}r0xg#vCT*1-nMIPJ8H9p#fp-||txUkOK$e{(gfM(W>`2^JauE@cRIFn<(j
zuHJrvF~K2v04xS4Ly9s#7O^_<OHpd^Ev+C*UKJt^e}pRx&!~&G7wa*u=#D~y<vDzM
z1}k#-U$*sqScmg3c6$<xf)C{h+60Fr0q;yLsJoGJuz6Vv&}jklCyCN%VzatXi+Z1o
z&K0w%KKe3rn*DtB6SXz&KTQPU)VJhDF4QkEUH~8e#~;2A@&{o%VpoHTg?y~z-!u{h
ztP(<Sw3nWc(zMLkohlsNnTOhU83aFeTJ}?yzT%3kpkrGns{4&bSBvts^rO3*-8IT$
z?&pHoj{!1KdPk3J;G>Zxdm{7iZpdI9ecA{>N61yo05s|gd=cdOf)JsBCw;HRi#iP`
zy=w+jyKQ64=}=IUF+bBf2qlhxk+A5&!Y}YyJF{Zw21L)YZ{3JUyit!RSK1zeqW)>j
zTlytIVx||#VQ&@W0Tc)3<yY~l7eq~B=OPuDH#VAn1V+<~()fPP&e?l0uNqMoe*VQ^
z;Kvg5L`IdYtrCWv2NX*RSIi_NwPsy5qI|40a4T}R#cgvCt^rnqkv)nF`np@)xN+x6
z9_0_r<;n^b4GOkm32)35k4CibYM-UFEv-$SP>vnh7oH1H4Ww*xAEnKb&Z4sWZ;Auq
zXX(>IpC~@n=-j?=#~<KE;J|tIR9>s+d-t;=OsJ1^R-npS3!uZaTRqQ8CDG7xk{IGT
zcHuSJG-@$0jo*W?D1F)hVcS<DIY2F92B;IfVq+TsV27mu3lsnl5$iTO<>eR^Jsx|1
z4u`ejKW3$U0s7HkRhe$k%{{<avwZ}3ee8MGrv6w-b~XTDSW|8X3$_6Ih&!VKqCDBV
zj*%hyuwj)sUZAzW{2QpN+MaDsY5*tGZ?siaI*)=HH8|b}`|k&+<>fz5Qf{ML)$RQK
zcuaz;9wPunh5)DhZn-Y{0L6admh36U2&57Hj4t>F_lQ-pJLamDNj;!KcT?3zjlf|F
zcBY;zR5iJXp_6j6Uuf~*r~J^2fkW{Fgr_zvNMu|w0)UGccHQ%yd=W1u@UQWya1PlZ
zpw3TQ1A1uwZr*({SiKYgZXC4j)gjz8%k|)jaUw+a{{2$br?fEzKDaxu?8xsgI&4UN
zxtm9Ta?QMu`+fu~&fZIuPRvbehzP|BpkQ~J9fGhB-+e~4)s}snwPy7W2^R+>;FSA!
z*|dJq*NY;Fmm$2@`P<_Iq>n5V{+zRgqI+2Zq|uq{)2)@3ry*)xTWGF8{Sxu{DPgH%
zDtM~h&kxvEXG71yZF|>1ZSKM#n*mrQz4cuna09*UF-j*+Rm~D`^O`6sAsqlJ6xk~L
zta^L01Ob#~?XgH+Rr*|*Rd`wy_IF*T;o<m0;@>4Up*q!%G@17{Qp50Q_UDA!fO?$U
zT7qWpm8JXw4D}Q44mAXaO?0d`8BsHh-j^)hWKXfI_Vw9|!#^@KaDDfAGYl%1nAX!V
z1vJ}Y*WYv<|9+z1MM1YRW+JANt=WvgnevE%BGo#mw%6$&nLPmXQnfW+Aav7=_i<RP
z@!sKT^>>_<Tk}Y5QCpb*0p`0@F#9gIvn2y8^|J}3r*;`d7dqr9?>=aRZ&W-~Dau$y
zUpCPbbw(6uo5Yz$%O^(=*T>K4p4&y|w<jYAMgIBtEc%_-qI6kaOT1;9vecH4&3MH4
zK+opiue;$xX1*_81IWkb&F|r9pO9$04+9d|`evZq(F98ZgU;~1FY1RKPSIwfI5k~x
z<#HGVXCden&EoxWc9;m?=7huEc1cAYTY1CJtNO~SUlAh!l*q54d0Pz#NmBiHC|_aA
z*4xcPfwH<ry<NBH)4C#T9e2!$Cn-<=z&kl;N2J(kwXN|-MI4@Q7n%N!JqIwSI>1S_
z(3whOw}aLTMmkbK)0F4H`xKhchPNhN`?zv{6&oHY4JU9n?Bc0f8%?QqNRK>5KFiS#
z3)J`d4Z<)Fj^_De0PdtTS%o{J7KA6f`*YZ{k>(m^@8>MUS2@Uq>I`;+`v6$X$VIH0
zFN6Xm8IK_pe?Xw8U4v4Wu3cl#g4cn_z@RCtICRL?<f`sLAL!`eQO;2_{1xiR&$vOR
z3yeT_zMRCX2%z!1t)jb+<iS4q!jvKVUdd&Kmwu^kqzOX_VE5<BLgzfbnlHW+!F($@
z%!J^UynTe+@K_A<qDr{*vfkuw7HPf9=n_uyP}=cU^2S+rvlb5NI*}ac;Dt$5NCxWw
z;t6TrC;$j2n}cre#-n#yWf0#8+3Eh(8ZrO@5x~@Nsf8h%;Kq8+C9g`b8+M%-y5O^G
zB0($)ygNKW+*pq!q#d=34!Oci0EaybfalEFb$(PXw0a3e-MuuGg1zHJ>73m-WzAMr
z=t|vkh!jJyQ~op1^I!&Wk~+%j7pwsydZRwT?sgmEn_kr}@DrBt-*`P@cF{5=kp7MJ
z+4@8SIm`~_j&vCt%eWq}fc4<>K_h^na}VRYugLn2OL0d4aHSyL+*&OZgKi%Uea;fj
zcHulD=c>1ADxiS{kcBvgc!gA-q@hFvk~bsK?>lmEQcp6{&RDBP$h>HufO4CB2C`TC
z<uX=@VSAgt=T=@X-3T`UGfF_<?8b|iCHirSv?<{Qh=vmxx)bsMz-B|Lcp<AV+gtCi
zcfc;2Dni$rIg0b>C*wz}4~_Oy!pWVy10Q?~(67de{$Nr(vkkT+sIS5{dSrtmR-(oB
zTO5b?mX^8@@hocCYW?W%rwxIOC(|zu3%=j>H+%J&E7+zT3uIal+gGrBJygEnj6fZU
z0|~XP41BNY-z`$K%ja+*ZRNYP*ZV_BhQ3xVE|p2I(_^>^LSw}td(QE4Hua{@jp(Vq
z9Kpn$#hyHDRoG*Wswpw`s>)EAootMZu<$QZnkYFjEl~uC1%uQ@nUH<tNRTh|+{EG%
zqFqGom<yy$h*Fdy=L|6Fspm#Q8}tB#h0;>jY^}`(Ji4ogQ86XnA{pRUiV-f2{=RJm
zD+wRH0g3sx@LBy-`oP8Wcqb_aVH&pbV6xmWE_ZA3G;=6;0;B)4>2HRmsqCBNxw*^J
zO~A_5KHMGnDP?)~y5Fpkx&;E&hYzT8&hWbkR)W1%c9gGz{4EJyYm9$UA&Zc`5wQoC
zKp{Y)!M%5jm0U5yK78x8P&Menl$+pCBtSNef-A_in?MFqlauDu&*CxmLFJDy<TQHZ
z=_(#$zj1t~?{$%S@dp;TsRr#6ck#1q-E8blRZC=Eq~d45-f$Wf7qW!3;mz=QI9&)~
z=y$hfL(STP8QMV&RNr=uv+aZNwOT!E$Eil=Bcwh=iQRt!5Mk-;dJUrhW<fw`HAZS~
z@|p;_{kd$I8a@9m;ME2BeF3onWhHD}G~yve*~L300GLI@X)s4V{D}M1`SLu1c#9n2
zpN1VkA)zTzk{vMW9a_WsR+y9h{OcF-S$xRxFM3-*KX!P~n<?ET_HmKg_U8uzd#^Xj
z0Aykleq}cTz;YL>tE-**hd5dR$M{3Qd0i_NYzu%FM~1ZvIA1JDp~$&gubK~Me7L*0
zaBn&ogVSwD+X-IpyQ7R3DD(Fg11Lu2h^|IqC?cl*uZr`gw<WsElkTD%Lw_A{pl2D7
z+l;&b2on^*>$?4X$^_w@^?oWDOoB}Ci;i|f8uytGOGL-7uk=!OKaE-YsJ%*g-Y~36
z0|cinxyNiNBT+3^yjZ^U&!>kW2Y|xGYe5KbM$@;$l^y>VOK(vSb+*pE+Z!}(YdtOI
z8&;&Ei-EI}>Digt3UuhyRA^K{VyD9$74qu(QM&;h(|nE+C}RO`_?{sqUi4cKI*yZY
z*?rIcU2HQchIDRzRbOuexwkRVEC8BeV#>0748VX$C7XXD<r{&~4~Mq&B!*z5bP7%J
zyDXj2_n}HS!=GY=WSQ?3OVMmh;N#nsUAR1MT}cd?;`d<imB6tkE}1ER+Cm7mtw~>2
zT_JroB71*Sdo*b@C?-D!3AI9}@*G~`p_u2*wE%*pIu_3Wa6AYP>}{%F?Z}^4g<W52
z+e|_%2^mDV`1>@FecM&O;@S-4-PX$!X5>$8;R(!UsR|m7+zlq<O_kj8OuyzDPfAZk
zYfIn2pBeAvM1H5Q^tjSnX?(dM?Q$DdnPibpVV6A7+w@ZSHn}45%t2`!Vk3n0O8421
zs`e}-{VWaq^wJFpCUL?nj4V9b^UD~IQ2*Hth&sAqv3faFce=cG>tsE7vh>TB$sCvU
z5}znFh8_+$k!7ZdrdA#iLBMB0{>%K&P7zU&^^$nbp@l@V5I0mUu}@t#R0!S1?a0!*
zpq2C=OxXM8B~$Edw~!v5()**<QmYF@sp-0CuzdylcgELsi+_jPT$qn+TNZxFL~_qj
zi5yI*fIxQYjE>Ftw$SI!xl%CN7)je*Q=)7P03E>t08S4i$Fn;}cShZ06W^d%XA+`5
zaJ}OIXmB;NuCHGcRk_nYB`Q4^kVyD&+Lfzvqj&m~H8>RW&DL1NCfRT=bb{R5sV|Dk
zi<0syQ;x{8oPVTX0-Gf>UcFBjt1jQ!4N4T{H|nPT`#!B^ufExPk&s_isz_y8<;xyD
z&S4uQ?E}k_0Lyq`lyZ|RRO}Ofe(!5<<!yW-%0;f)Dvqakej%17B55Xz7VJH<TEn?c
zR1-)|G{wlRm{oh*SZ0#dFY#4w(!@7?4%H6Ea8#{;I2G%@&FDf}w?0J#mr2%6?ItZ%
zTHaf^>Rf=~TfP({;r84T$Q=D>Ia+mH0%f-<0A`AaA{D;PupMCZ{v0l~x!-J7JW=<g
z!m9_jd5^wCr9Q7YRg_TNf8+TGONOps#YbFWhBOMhBJBrj6MYKmJM2Hbp1k$_B$&CD
zpn=e^e~cU5!j1?EyFa?PI-NEOuwhZ6g;H;WKYtgXrvK4){rt&~&5Z#w?9zD*o|mmr
z3smijb2XBCC)6Y?$Lwzr8${WuP4IW2&OrgYmrqtf>%i7vlWs7)ofN$7B3LzzPMzqC
z7&|)HxaZ&SI(nX0CRgt}<@@VRi3g-*wfZyR640^zdjvhsiW;UUENc@+R+GIXjl(OM
z;>fnpTfB1mHwv^ph#4H_3;aR?7vdij6qr>ojO!g;5m^!oVEk9GAzH@~wT>gL<}$TB
zH-}G78v(tjfKQZeI7-2bOlap62_nJ<d(N-JG7J-!_i8q-@~zPc*XJByxzM+oukXps
zsOe*xjV(qZp@=r$yt0hr)*DVyI6aMzKHko=f?ianI%#Q^p@?O1xt}x{3c>|**(?C3
ztLXXr;iA8ZiD%{Dsl2~`M(}M*Wt>Z(W`weF;6(dZAyPwh=v3**^GHJ>27q;}h7hRS
zP%>2pGYn{f@P5|2k4glPQ9c9*t%MEWe=(oU5NW@fg|79B+nZgW8J2r3iGa3KNuTD*
z-R!>o!IEC^8Nx3c8Y$HL&R!-b8{>8Kz4@n;A!C+J<CWR!r!?P>&mlS7JG}K$H$J8q
z`VYkkWg3g~?=t0HbU1(6AR1v$*HqP%d|l8RDO<+1n*5C-I6=u4YW!KaQdAbS6QIX}
zg@!L%15PA8sn4z(EgBse%^M}MByu5p8U6&hcEr)2LMhhF!Gi|9K$>1}SjLE~-`h>6
z-^|6b<-x!xV0J^#=>3XKO6^DyvIv?}GX4g-pZ96Gs(07mgBkTpai)-+#^LguycHCw
z_ww;&mh2o9P?^PxICWc(2n?Qskyl50PO#j=+p$LCKL-F^4ZRua^+LrG!tfph!ezjN
z66QvpiI>L*qhRJbf^Nve<_UnPl588-(UTXIrE-N%I|38fWE1?djMJ{0fa1uQqE&s|
zjJ&;X4xocuX9gi1pMU4I%WTD2SZP!Ub?!1-5`<iu#Cyaef!v?fd08ZuB_G92?oEsf
zV&BaJT+#&Mt`8O52l&5y##htzJVcqSR)FHajxE&p1@*0%CTOQ~jz!@L^D-;)4(;te
zN^QuYHNx@JXeSr)_b&SEi!Ts^*_7Fhmml_4+qsDHlE_VLQF%n80qNuOJNJNEph2p~
zyWBKCMl#F})*={ozpn}M&M9+B9oa7EWvbPkSWOR8xvlzFluhPEYZpCxR=z*E80@9g
zm}xLnfN`8qr-W1Oy0dt9iezs_Cl=ss3%#GbUQ7i-q5iv5bB^#dHeAImWFQdqhn$fc
zSRp=aE~>D|k}x1&)UWtTCr7x2f9|MgM-PPA&3?EvIQaM>adzXUfQJLtMVgbC$hWs<
zES;#&*XLv%8eJO5@Qt!Y!*K8-B;@5KQDfz4%hD<27)pYGBug#`8J(el+8SvdLmg)D
zd0iY^QimGI`_w-z?STHOzu<YJSFq83*y1EV<8ktAQR$HY`6&ZnPh?cQcq|&y$^%TQ
zH&2zCfnCI922uv@o)WOH(WOOYa>$R)J29cn-B0tUJz);mKKvc|rY|SlD<uKPS-SlX
z&8Hu)`xAjI$4dZ*8u~f0ju@!TqX9R|m1Jj$2Hbs-xX}~tEBpzvrsp3`dyz*`HVF!;
zft2I{L7^UecXnq`n%-c)oeK?WtQ0lUM;q~}znkEK+I~$IRag>0L`Hx3Tn*4_R7`<e
z5|L*=|A%pxg<5C4RRaV(!j^&)q$xO#GxAtK0t=v?6~)5UHB)^~i?k~ShPZixHoith
z+PC4hDLry+--iGX%%Y~=UkJ1p69vfgK6KW3kAHiw))c~=N%Bu6QWJAmQhFg|R<b}+
zOadFy@1Kh;?g{x<dsRAjom@n(4+#geoQS{oHT*S!EVCj+|Cas&a;iX?28gLxr!$T@
zQ0^H44qVWD-emj4Vmn^Kqj*6Zs;Ja)dR9(WbQVvxBYr`w2r@7{7nq|(j^z2UyttKO
zn?cqFC0tWiQOW*ysq3x6Y%)Oa89(wqp&w>}Ap*)IK;=XBEi(vs{RI+AySRS#)ToV$
z%qEs4iVI1um!f!#M+6eA;Pz(BKW-((W{6b-+w?H}!Q)j<f7CxC<kI=&e@2KApx?yG
z8TJ9OAbvpLQwPib9u>mPr<Q(}M7+d5)p8R8f%*tL%36Q&Swr`3+1Ja27l)gJ<bM@u
zcFNu}^8+n_1LHla4DeAqb{;Wl-wb;vEK#j4p8~`#Ab;5*gi1ooHZ7GVeJ+Z5;xvl|
zIB0a{X$aCS6CpAu{t}dR{ts|Sj+E<T`tyAPFfK(p&iw$YS}Z_CDYBPYFr7kooF+h?
z`4JYyqF}_}<jeGisgE6>p8xnaK>!Yv_5<<b^u~;r#K`%0TlARPfB#kdT8Q6mw}T!(
zBj7T*xX#Nx3;PvDdqMJE|E!%*y)D=`zj#l&wGx}CeIN{A%SOKVt8h4MZ9<4GeB4zO
z`N+TyWeLEM-J=U_-7ELkka$9SewdZhMEx=x12DTsSrp^ElcIe@1<whQX;3{VK9Nf5
z*7Ae6A+9B<e^Ry4t$V^I*kwY~9{6tMZ;p?3z-tL0Dd^w3p47xam;MeXsBTU74f(k_
z?%Kqmc`H#GTWuBE$x(++Z54GPs#yT)CHjg$A({&ZFV!{{;HEy4<w%hz04~);j)%jl
z$R`R*gJIhQWoyy7t1qGCd$@QQKIZzwQUer6Oz!Km58%go5iJTzbLF0pitZ-SCe!BP
zsOhH2ro&@y_rF=hKn?hrleKV|33I-5gg6`dBXH-CIbh^#h0^HPQ5h9}OUhGX7nH}^
z!9Po`@{wx-X0-(d1fL#Qi`|&Hv5#P!0A4zAn6zdg-zlpL_9+@KvLM#j9qYf7*v}ZV
z!r$773c07)!ehPUE*I-<u~6@@{h~V(wYAM{rgGgn#$^ZeKJU37%vxz9z;6?N4)5uv
z`)7XLZ(*~6*&4QU=RL`g)>((xau!IZb>zRRwl)zlRDZm@-OA`>VP0YY=50IvibF(T
zzGRZbw|+?y(#I$ykT3B|u}>6(VqOAJdCIlv{K-K|g~cfx{WZgnK6mof=Vr;463?@t
z6n=gojHB{dN;1Ksx8Z{ZzE6|%^ujj=xe!lfdPHxGh_>h&crV^ium>IpxtGHc_Q^gM
zWpHbBoRPX&*TE>Ug;LVMFG+u$vm#J8wZ4gXhL)v@bb@br$dy8DHOi$4OJ<hDOZ>(X
z>Xd8fB?D($K4uJEIeaV26#5P3NU?5Kl>yWgYJAQQj3SlWJ6XZl2JIkH#ri8lVUqzS
zUnpXleMGJudKRLwQdvHAI_R6I+N3w^<iiq!<Tn@*vxrxW#<k39VlDi{YV#>=Zf#NI
znfF;Sa3Q|p3c^y^To@Hmq1n{I<hOC-@7^-a3<gV9e+NQdJtEJiaLqku$=5ryR&P2M
zha(O9Ot`C?U$emb#R0V{nZ);bTmt0gOxZxZ8EvZhQ#Amhsh5Q4jp})|cQ*uv{;K6P
zU_O-Wyekx{Y*Uz?^Ga5{9@_QP%b72-5=e6EkvHF&PG%3OS*_I^y1yRZ{ZcR_pE%fD
zf%l-xccpuClFYVbq(#E)Q?;zoVoHu^aY+Q?k;VG_dHecai#f}l<H<i^i(>BQL$fX5
z>9#!En~;P$>>JfG4zHivZo6M$yW3V`v!Chaq4h$1?AwLF4yMjzoC1Mj@w-KtrUp<f
zMi}3T{#a%>ea?c{(((YXxszZI_iCV>D?uP)xMDr_LiY#?No4_H>!bbQ62)$M;)(L{
zAZj-i;vE;PEWYP_SsMEz4+t9qrj6B7{Qc5F54S<?$e&m`7Dy&bR??66%wg0d4$PIL
zfX3dps=t2dN+E;qPM0C}%5Q66eTofj`wNj*l%vkaV*Dw!)>IDux5we|)QZ$Q(cinB
zH$y34Kmv-j@<5{{w{1%dxeq;ha^+(Lhe$`M_mzL#>Kf^gbW4bjAAc%0e77U$oN^qt
z2xd>UC4^<Hoz^k8Dzn?bPgF~0kI5kX@`&DGHQU)#_Ze%5n0c#LMFO)fpuy37li_cx
z1?lniLKU%Bky+c>wP)w>9A)Jbm<a0>cvP3Oy|+-FP1zEI$0uED1$(lWy<d?^qjI-f
z{1`_qvgio?U<Lm};ks{|kV0+FFrXgjY*IFKWl5F>iU#zP#1Ksa?Qqc>sx9jGnO=ZU
zKou<5ul;fU&6hmRKb9ab%*v!*l}*RcE)BboL$`jrKuGH=D(Z%+xZt-y2Wx$cZTvdB
zVLCQlrIsw-wvSn)7ms;zk6Iq>0{u`PvGCmfTkQcp*|XTM##5!_9GRc!Myu3l|7<wS
z&;8o5uEX(TT$-_*!Kb3n<QKxONlfU%N5f^<n)tnI0O6Bg5$fZNB461LrY8488#9iM
z$lg@<SXBl3xxP>ZSw!#}m6HAT;h_C$dK$d4ha6>+V;WpMWjCsYxL8t4%ckuKv(O5@
zF54STU99&(WURkVx$6w7GvV=U2hQ?vKXBe=t~vL0yG+m2p43P$eUp4Ve9wm~Fh8qz
z@|~DdeG201coc2l+$<o%T^*yub@~f(8Ny$tlS)Y0htAKOtPM}e<o~s4nO&BCR&r7#
zCIHCi`1g%2QT^zvPC91SHQ7|fTo6gN$1DVTQvKiEN!T%C+?V_ZCG%5|@i^_AFdT}x
z@;RN1>N18{7U<7_Pc4=2_i9M3B|K!`qTIXKz1HZhUqs#Bal#fdPK&thrNi~PPDv2o
zAi*WSy3FYynY(DVf3nEgz`PXytDVy;a4t0IDR4M?fec*4X29szfLZI{_4f;DxZc18
zlz@z+>w$mGDSy@AAnqU{p~_p*-(#G|zUDn~$)36su9TH;r;o;~oZP*chF+2fBqriP
zIB6BPC603If}=d-Vf9)K9XITblLA^h3n-E~cs3-eR+sW^xs}v5EcvpjQi!+(lEHdL
z-R$P^P0Kh*0ld@F>Q}X{q;_-?9oy245`)K!EcvT%9UP^;7=oyRXx9YWPqPsvtr&mH
z;$ATW?pdPhlU4St1?ObQajZ4!t-vcY4#QU#sR*I2JFyFk^d>TmG3*1h{AGP!!)G02
z*b}uY$`0*&cko&eBVUjc0@-%3@V3sg7G9ccP7~ED)%o+*#^>sxH`tq;u+j%lVE(RY
z+M>}1B+v?(^Ut0$A`A>@>RrmrF10IAi0HRXNTss-;ESm94Yt)gtY^R7=R;HrPutTz
zgxp_eDn+to`Nn{l<CP-+#&}vG1==hA$3tOfc5KYV3!*gPEDud5k4}#UjgrO;GGv{@
z8UDhRYezE5D9m1=VP1|mb{b{#o@W?wh6k4V2KhUibOv^4l%E7RLde+YDN>^_+|eoT
zPLy48KaYPFk<weKB*V2hYUDCvW`EMkG&As>cAykKk&I)eb}pJ)la5Dj#=45l!eaSg
zv@<nw)0G(YfFz~dOV<VjI2NJJRInZgNO-iO5)O@~uhv^l5Qrq-)GikcPCj+rg%FUz
zT()K`pUyu^$D_hf{Jgnp6ped+d01cMbMAdt0$czKK+U+MywO`{8^HQ^kU5wf7CSoU
zGgX0iOJl7iXJmWWuC0yf=W5YW)ye)%I{fa5+6dS>)IWcd>oruCHj8=sW`=Agu6HRr
zI9ceO>GZj{1@f@9B%3PZWo2yO7$O+*%1?BDAXzRL@u%^9)gOuBiJCP)Vu!U%aCSn|
z-VG&;Bwi_{9Kek!vVqrooDd4NRa`Ql64c4w-mu)UBiaC#UxdwO=djn(GaNzEl039?
zrf?g46ZgyU2~n@Mg-PHN7B7KEcv#>Z)XHVKGxdEMHI0~<4gDVPeEQ5`x`IXv;KeUF
zPqOj|vXRBnf2+C(7Mf>!5;>OP%e^$i9EoubvIp;Yr{sI##K3p#&CBwKtEoVmLTZ&x
ztWzrM=QGTjHD7AvfH@_Ui2x3jO(*W*37jVIGTfadaBwk;=h)cgabBH|G6}+syW{lz
zWliIgk?-}kP_N_9A>jt;cQIoEAUF3Pqk>9cl!O3WK7LS?0hS2fJdOyq2-X#6NAj%2
zhvs_4y_)(RnZ^@v!1BVdL7LMBc=ug~T3rbDa16JzhLLR$6eimWWx^;3xWnp_mxOmo
zaUtZrKG$zoS%lO0VtR}1MTiUI4yVK}POVXVkM&EBz8@+5qQwNSGru0$pKcYcsVl9I
ze0c3kX1-fSJktkAyHL#3^1v1N8Wer@G#&%u^Amq10Z{2T-rg&Ng05hV`js9DDv(P<
zZgC`tG({$c`%h6Ps7qkf1inMA7PLTrU(rATS2_I}?LRx?E#7ibF{^bj`NfHdABF8J
zx0mA~dX$Jjm_!$?pxEYSA^KI{<G_{w=fLH#c%nH#F!xwa^D&86mh(taY7#5mdw&<8
z)-BqL)%e(;wnZC_^wQR$cfAn3eqFh234vI;guJ?CI&3xW>ae(PPh*|J(Fz;7gRbq=
z{&a{5qC({vO?~kpA{?GW!z~Dbn8=sq47=4;XgwgYspUnT_$(AeWArrm+{x>n`efY~
z)3{XE4B1WmSvT{Jg5{v9{X*wadT6pba<L$;S7x|*)CvGpJk9|oE>WVb0N<mc7#g7p
zNcS}jUa2~e3zAnMd9LW+%DV~duJm#LY&nwh(s&XTW8}3zt~9FuTxrr(AHPMNg^1U_
zwstw#arm;r$AN&%eU_3npdrw581^tv6A0EQj)zZ915f={IRaz9>(`K;6h9mh)Cz{=
zSMg>LG`rwH2TZ`0WS959`!+Fu+dw&P&UHVxH+hT%-tJtK9`)3JT$;S~t_-%-;T>Yc
z78tK9(@#-z!{txrI;KwLUvD3-MrYEu0)D9%LN#>Xb3<H1-0o){XSf^3bNhK<6FXUO
zS>qMLx<?_w)zi{JAUau_ZD~U5vz3`U)S$0NpbmR{K*@9WXWB!}aoLFW-%db{;5giM
zZ<d%o&uzvZ`HAFO46@RM8a)veJ6<L*Ee^FAtbmm*hf>ky8GvLzn*q}i-w+Qy-pAHW
zjvPtK-mN#h)CdLYF9W`JXVa2($1~QoBh)1^tu;C49`=j?g!=DuOzH^QxmGog<{(6h
zel*>;-9|2C(tIAlR6eUf&GWS(Nb2uIad8Eq{2Xmg!Aoa&(M%#~jq=~z%OxLR>Ca`*
zrg6KC=Xdn-cWVC?<ZbvI6(z$p7`JuuiCq_M*=;b@rjF!D?A#_Q>pSeDWX;Ing5NZh
zoO&JBp}MSAGPZ1qIB1dRg^-mLcv~!<!xvd6Y^@mj)p>`_tDSZIozLphA9J)G%#(uG
zdb)gTy1K&Win^D319g_WAdRBG62o8+iBfI4E<a@P;^0VS*nxzjFesaOe6e8&JowPn
zwu2aB<rD38JD4*D{n0h)3sKj|mH?H^R_Cx^T~Xa^q$WXkii6!87WvbDv|Mih6yZ8G
zC4rtgyZLR?BMj_l>m}TpXQn_=v4qz1)>=Y=HjQ%l(`tv(C4eI$cr-x-<)+Y5FqnM8
z^&K9;OmIuY;FR2D3r-Vi6Bz}L2qrNk(HW2z09#Km-(2rh>@5Z1^{%sgw9XVURDHa>
z?az{VdO6+L`K+-JrmYJRpqA$BAt$qb?QEWvvN>rEjXW9VznO-6X;Mw&ky%car^62|
zP`VVI`W7vtl92gt`yxoA=iKhm0;s8W34U%FZhgq+%Gzq&NY;lXt-H8*-zXY2i#=eL
zUK}k-r+a}wmsa%G@arqT*$}^b+);6=*LBiCapiT_OsmeK##o-Mcn!<sB~$BfQqLBA
zh8^O7-NZ&r>FAQUjpN@ZO0xo3M|po)rn6-FND5?rMMc_B7Cf8g$+EhOE3aPPSXhAw
zKB}*7U8$<Pi=NwJ+_}-ib`}H#9Ps(wMWm(ohJW`SuMNx}Tz=J64mBxToj^j>#(r8D
z8A&&I`7#d-7S&6z?kBjUae7CEd=_*F?oZYr;Ws@;I^~P+n;|?-&@^!LeDf$te-40E
zbO9Cb6X?p8A7J^KJiiG5!2*!a8x2A|jHj+MGa7Kh&*)BHQpf@1+2KZjC{i$EB_S6(
zAi%LJY*f~L7|o&IG!uyZ@`8kFotmq6{+k%*=+1XJwfOkAIO8*bA@GC}VJM+nvqmq_
zppnC>Uf=g_^ErH<=rUo*WOF9>@~y;>&As}S@TS|){Vy8&;;#KH+2&g(IlO_=KNGWJ
z4SjU@jwxIV0gn7&_KW~y22iQD$mtTZ8s>&x--sk<bmV50M1f3Wiei~*qkQAk{#`;%
zA@Cb@XKb$sm5BI5V9g9C9j7mbV9kE7MElN#=86+%KGOJJW&_6^;D}a-UBVM<urqWk
z0dDuthbdfRz4Z`5O5fAo$Rr6$*zL<7<gpBZ@{gn~^!2&5Ns#K7EAHAVkFO+Cz3cOW
z2Y!HrQ#FfP{$SsG(U%nfwHs;(-|ufx_Low#3qI87mzQjph!~KtkWsgE0Wez>r3S4O
zhEF&@0D7U@5XkwW*g+x5{1c(j^V>fnhtwnl-axI06v*BJ;;M_(DEIq~Rj?=2M~8ZU
z8szpF{`zt1PA!$=`FZv3Emc7_-n<?d<Q2+NtmJDm&?x;Bl2DO)kx!>b%OG!|v&Kkv
zOqkDQhY_tV+C7SRZpNmyW)6cT>n!u@24&UlR;l`%1SNkQOz#BLL7!QI0CMCdKCldF
z%DLXD+d{F8S&Kd}y(UK6d^(Rdk9iho5WFs4A<^TJ?iwjVe|5Y+V-n4Wdk38=O>PG4
z!lS#)cyydVuoI1cXd$i|-JuD7LraZ1qfiW>;Yg$_SkLED&;TNNbeKxYzY<ABN=C@f
zMlq~WkIcl0Es;J5U8pGE9?Vpm4U_^J^ueuK4*b*qDD;GiM?5BuOtR#aP-Eqcgqpkn
ztlK<9NS>??gm@iO_-};kBTQLc%ax{^BQ;ur?jREo|AJBpx&1)H+9drSzRb&5@n5|H
zUz10xfLvdgk`d{0zMl6{&!Zp}ES1d5juNF7%N+v{L-|(|MHxF9STI%0TYN))0Ouw2
zZFN=wY6>w3%iw?T<>I@K?efZ(R|NzMv+Nh!hT~lbDAPpvEx*nF|M<ukDwxQ<#m%Tr
z6`6GW7K3cPOGXa%{{rfZe|%J_AR878!|YDzc*KnXp2}&M3&QqJLZhX@dE3Gm6lHjY
zu?+e6zmWZwDD#!Yj2{l)g1sI9Jq^LmR&wZve<kPe{ZLZa8SW!oQ3*oX09L@`$qE}}
zE;<t(6F<*h_c){rx=L<`(nl@v28UM>n%Fgr6ZP_C%Ip0Pb>e+m2Os<jD4JDLQjX^<
z)&S~F^9#N=dqkW~)&GcVFE9ZN+lw7p0iu8yR5FVki6Hdz-%OcWz=z8K)dr1f)88=&
zn}7AqBgf?fs<lalBfggqwLWl+3isnV2}(~v(DnE^nT`R}=A}TaUEu43ty+u!f~kLe
z)D<CnqVDt}MiynasFSM|z(4{L!fZO{4smbHCHnpc$%`z8Z)D$x$g&=_Y*Df7V41#(
zH3mM}o6W?gQTprajaUIJFOk#G?g<92OI>{gw%FnOf4Cg9e}R0nEaX@Zqx(mCLw|CD
zIG@VfS}LJIw|(A???v?&Z}YZx+im*5h*p~d;sIu>9tEOSneOt^J)opSGR78gD&O-m
zTK)}?B1J{6t)lDrWKjh~lM}&C&#92xJ`@_XHst`L=vVQ^ZMhA4vOZMjd*|7Q44^NQ
zTI-J1P#!I-+rP}~YfNM`A{-P3<8Zr8+o`Hnc+E)ndd!o9XSw{Q4$v_*R<Pq{`|iMi
z4ZQSXKl*88dx+syF8^A>^sFY{zqJ5BCb1dMf&3}6KM=#9EPXObnANhM2y&uBdCHV^
zF7Wpky-3#VhU^zBFHS(+y$h>0OZb0q`~T*S{p;U1j{+Qpf2?Hw$<qESSNp%N|F7@T
zill=u=RRcF&KP~GFX>PWET3!OYP$P2ldov5TaT^?E)5P&U@3jmQlmPhQTeLF4XUHN
zIbq%DHZof+T736SV=57+&iN+M=yynMX(e}XLMZrozz13tEiYi7xK*q5ufOWK*rDRy
zk2OKOX|nv{bAf-P@K)*ZgIENQ^S%}OJzUQ3{*f89p5$(Rkf<^2OO5pI+Z_?Zj*Cj=
zvwZaT+6QeRS6o9RgZy4w3^j6uLx4)hpwig)V0d2ZYh8gr&yebhHr^X(1pIW%d>}E!
zERo-4VXo9xx8|`<#X)7QTtm(?Xy7-5eUd8Fw~?rrlHw{fUV5XQec?RqeNqdD<qYM2
z^s9d-ynn>>jmoW|Il)3~dhoI*zumDpp-5v9w`t$(F>$GT*i%#IwhlqG8RwZzS({JJ
zrdbY+w2y>kM}gUs&74IktV;^+EOj+G9L$-EW)8(|#?CZKd`IQz1{z&%Z|W`!+@W2!
zVzxlMrSTp5wq!IbQr+s$0!6xS5OC-Nq0{0S)_t`{rl{S{`mK3t(y&t<n_+$4Byb|a
zq|knK%uOR(;@x#N`|Z8Du9(GprR4KYM;q#eq2jvjRl9s;0$CB1$IQ5ewTX~NTU5LR
zK`7Q^xww;J;LW6{dl-XK9s^KOlGWenMQ^#v<Q5pEUmoBb+~1D2QmGm2PsAbey%**Q
z8+XreDu$m;e=9Sfa<Go??b-*Qe!sl2uX*UWI{Ibe@R>55be5#0lSZ(6cP0A#6{jX{
zUzej_P|)Y=b5P9<HLuN#!w7i4URjLG3!euHMUZGME1d~~@J73+xj`3LP0x&Gr;K{3
zsR0e_o25Gu8xLsxyBx9CAP4s>J+HdMx%$$foh9CHq0|IBwp$GsyCW7H?)ef!%;{UJ
zuH&)E4wi#1`ESZ5_bF3fH<SQ1f)}s%tc$Aauv(d?hBA!WMZD{3TOH%o#SXiL%?4-d
ze&@G(*05jyJxg07H|rPb)8(-(RUKgTHN`NvsZOY9H5>--C-s;tR{!QatI`)j5wV8M
zv)QivWC8Eh>Ov<C4NWMtZu0}lygt1H{TU(gy>S}Wp+>Ne|Fk!(dQ$G)Yh5qIZ_(QJ
zsDrBlDgkTQLgmw|`%9yN8Lx3udJdOiY6+uw8ZhB%*N~@IJ+nKE7!stiaqCexkE9@H
zB*KGWh+dEG)@lv{DjynzBEr=EHOu}_2E&M@@S??r?3#&jqt(qv#Omvv>FZ5`MA$jn
zx=WSxYg~5*+wPyWN>hVV(PYrl8R-{!*q*??*qjzFiqIY!o{*S7*dK^4<rWV=MEe;h
z<-Jn$<U|>JqgvS$*t=j7kwT`LL#F$ROZu*e%lhM$jLB{lL$ASWk_bK{{z89RK%7cw
zmt*&S#F^9G-Td1gkC3~$*v45(GU6cpBH3fc19FkJXux!~bCBuY>#}v!ms`<>UAJ&|
z_NDKxTiRq=ezj?5{;*s|9^7jB#jot(L8jW+ko~L4j-)|dqXwCnu<zGdTtRh_k(RfO
zRGu~k&Fp5^3j9{TNja?-yzs357FtYHnpxnqY@_iVn3cWX{=_cKDD2sG%wIZ@R=*Et
zZzM>b*>qVAE>GyKwVudtKIX5O4C6Ym>{Z~mIr+|FFt&Pl)8@_&abswk`@^0ix>ckg
z(#WWPp`K~DkYnzJ>qHYNI172e7^3urX((P+et>It9h^P~!6Q=|eGb$SoduuRrR&O5
z4$99A*zdN9;guEeGf>G;R-A$eE*m~(D-sv@RQ*i^LvP5vR6iP{jQ?dC<uodF!sR<}
zrKP?vQtINgcQ<ptMv`!a<`_8cGk<Jk_@dD9YhKSEFxc!F^~93niM+s}WHvLqsxO*3
z^^m$dOUz|{by~0ow`1}cVZblNZtiA~usb3Iffq}YU3!)Cm@<x|o1OAbkHhQobA_)y
zi?v$dCH-<3zu7xjY>SOsiDkQy#S>$ta&h>5{#DI+L$;!Qo#sZueTXS@nF$O{jiS;I
zk3`I0jhx=xU-t2Q^$Kxos-zv6*Ki6vZ{7Cb+n?!Epf%-(eixN`cww`*lq5)Tdc$8U
zb09Ux&Z@m&%L<;~>n>@XOx3dq`yMJ-jgU+9KC7-T?^CvW{}4`n{2WwATi}T0D!Q#H
zxMV)<**od8(3F?*5Kt4rT=Q^eVPEdp*Cu$l{?MSSg?naaSfpFgmZV4S-KA?Su(TN}
zBG1UrnXztH;V~w3p0)E(76r@PVE{wJrOTSFIw~?%3TO;hC;#l$&u_#AiVI@ln@AaY
zb<d1y5|D<9JE4Y-{B~4aQvEXWw0l(r^8?WRaiDX?vYrrz*Du0rtv4s&hEBhtbHTPR
zrmx3aJ+CU<!o%*K5X5U59LHhU&R*YHqYls;-kirNi3}MYzklOovae?~r{#Cm*#1iH
z-J5;giVT;lgs%$n`G!6{^9D8J_Fq1hy<1LL)LUb+Dkc8qav>Y^ksVyB{>JpTTYUkq
z_0IG6<umL|I65w&dV7led)4(9*7b#`_0cndXfzF|Uw4S=B*LQ%+9?*Kxs->_=N1{L
zhfflR>-XFfh7|H`_I_}-HkLEp=R>#%O6#Ya-~!ucHDYf#SThU`j~4a6n-xqy&+{00
z5|$*RwCj4lXAo;X9!R67fIl4z`+s&%Jblc4eV&#=1g#vwjX9_h6_%isemgz0k>wXP
zvo_`Vv!4zYKX1GllXHtJW=oWyUNXmdI_29=cE2#DDu$cy=C_BKw#TV9bvzZ^iR+TL
zNs373hG#}y%ylxLkg&%}`>|{H71D7`J<pM|5R)VDN|ty4;mq&7Jd&>zbxUVpF8wjP
zK~~Z8cK6z-ZEs0!zKoOe*Y(3pn_V)+?jHl)#9?ee;~u&B`G+0_dBzmiMP)aAgvKzN
z_GUh~QCGio!o5%D;2M>sj2AkuuB$mXSxj?Zl0UDfK=%w~i&-34mWh8@c(}k+?Gt*=
zy_oo!Uq-q&BCZ5aq@M#%{u{>z2Y>v95kCdi{gJ#?SEtTchy1kNf<lq9ep7sD^&hQQ
zjRd<kImKFH<YvU6Ee3ESm;E%}*0iB(eL;hahglhjoN+f#X?|YEw%fpIzVwo&=LK<(
z-S&{Z_C8Y<MJ;0TnsHxaAN<u#Bz$_tk~wj1rfICU+_6Vmy-G$qFZ-WcR(br!wkK`o
z>uDCrD}B=7Iwi-77Vv|Zwl}1GJGAO=1${YwfGN=Qxa^b}ljoU)JHP+nGt1E4bM`yA
z^>0p2RY*C6hn;;@)%Ze*4kiPBo~7z}<FH}%jq~miJG|Ni)~=5+-7bBy-h4pVoZEsb
zzk5#p+TO>WX#dyAf}DmDxzte;`wLjWeDZ;vm{Z;_=}E^tJ94<4{S!pK!Kp>ouSwNE
z!zlKyT~qV%<GE&)Rz($4Tg+(4sBTYzrT3r;G_)<O0T6tr7xz|*vO$TK_g121g!RiJ
z$AYRRb*5dVKGs)hVTA=_nM_NM$dn(9O&XH^Sg%yog33<+QvLA#A~BIO+TbJkC!yh2
z?uc$}3%QL9($)+C<*cX@dJwVMz!<sR<|N##80uKY7Z{5uhw;NLr%NJ#ew1pGj{5!_
z3IgcVt&nf024#yebLXu;e!E)Eu6<L`gB7%Y(HX%;!HlODk6_1OJ}dcBRaEh$94R0k
z=ufqu2FY>HGud)_>S|Rx?qUme8S!7T0&S}9|9o^m3r(W0)_qs%64jt?Ht2p|{b?j|
zM(efT1s^!Rj2P6JSITXjEA~Bn9#;)3oPn%#CZ^mh!IkHJqpUnmU3m&4B;l8?`qaCt
zL0ysqE5!C&4v+U^VltJHIA%5sn_l!<)HhAH&Gogb14#pl9e^&UFbX-Rcm$3QF(F{i
z-s;h!{3f6kW$NP#kYNyI!jF?OOGEk{`AMB3BJg9*SMOWZ7WQ1eax>}IsJn6nmpMr@
ziC%kag8WYD!8%vdVs~GaZyPr^NHe3`_q*e6_kh6Stgq^&XQt79Nkwj|ee8bAJk2D7
zmO;3I5Vd2*g`F*Ltl44<!A1H;V<LGcGYb{&O)$Ub7u%U114w+Xz)h2dmVND43GErz
zt?qB|`t+y&j%Ir4a)3nibTx;f;RBiae1GZh4mZ(KZco?jBvj{YwL-<q9w%|z_Lr*c
ziWeOC0jrObcqOGH2bjd6zq7fO{8y!0SVe9ztqzU+hV=bn7pMqYC*MtHudXG@>(<MI
zm;23<-d3~^`i?en!z0~}n-}Fl{MRSv&^wRYJ1(os9fpAy1->qYKOSh?4<gy-1Q_pC
zs;913zkwC6cYN-vea_k2$H!w^d8~_ve3^>h8fq$+zw=mUWxrqUDZHq6?0gg1c=K{^
zu1y=fEMLK6AE{&C<)jT?k`}tU52#VE%*!>=0L6K)9}3kvtg5Nrpz?sBd(=L-C)W#P
z&KE2?{5uGv1W<aNENDG)`Uk1qVItpwYm7Tc;RyX5t-t@92%NhDe=^WD>oN*~Y!uRw
zfI#_EVb&vofNzwnE2wjKMDYzM{0|b>?H~r5)}sxfJQ?b9Seky@JDrx9#U8<v+ZC$!
z0~&u|kX4y61`e*H(>S|A4R_scM;QgLrh94%zS#Wc)i;2mh*K7b!o@2?jHb0-=wN8<
zz>jGt^B~H(%I|i6K3Bl*bBt@V@8^1_joXv?DzB;HtNSkMKc8VENwo{#jt3J`uC{3=
zx@-Gq_U)Khs0n&p*9DfQl?#1O>QupH7Xlh0!<S-19y`=!8IbpxdVlmIh0fL8f_i6P
z`vr!%t(IIFzzQpkn!!z`Z=qWcB{1ImY{O4o{k|6kEAiu{WfaW_buec}@vUM>>(CCr
zeSuGfH`q;coj|+;HMA=Gf%4IR|DPs&JQ6#<e93SWRi0l1O}^Af(rg1wFK9}^r0?<z
z4ru*sv@18sAR&tE`2%Jbha>x|`*l0Dt-X*rhe2m^g*e8htyf!{Cb=+bSE-)$-o@#C
zckkiaH;@yH_u(Y_BxDyR9=lp%RmNSj8RvxVwhyth(ho+_ewBmzvxTnwRJVO!x8270
ztVh4+JOfuu&K6~Q0YAE2d)BdXmLuj@2EI9Hnq&(3zCq4!)xOx44}_GCQj=3>&k1)d
z{5Iy>#?Cu<crQch^{R8^1`Y0^b~hP<t5GA9SDVKDR<k-+APq~vvA1+fQ)%{5!O|*%
z>_A)knsZMo@dIW#06TEU^naT;IjD1ol*AnSnzef4PxxH%rlH~oz}bxzI{+66C|#Nr
zfT&$dnuASpnng?i!l>ZhkMSR)zPas!ExS}*Kc-?q&Kr$yWNdziSk-=2WYvLA#B0~+
zad$DiZEETcW)Nakd(~)M203^55`|0EPaTlQ-4{7G74lHMGgqyxbFi?3CY6AwE5L2O
zX#Q0uDU#l9Rc0f{rGIu|c_4_lw_aSh>}r~m57mKaClXafrIhk-_s6%Kv9%1#u>Ucp
zN=oEIQ80_rovJgOcgfN*w;vBBpr7ckU9|#anr$d2WVO%dSqmP|uLbgl`b%J;d67Ac
zccA&Md5b)K6Y^#jc|{=J`%=Ysms~P&6<XCjBkyCxZDX>`b0jd4Y5C6|no)`th@C<a
zvs=yjn6EEWJn?+qCm|57TZjKLTwZy`mb0q9?ZY)~a=VmL97+Gyp54=8$tu$~Cm9$Y
zjFtUJoA(&br;=D{BB%%oBT<0Ba$4{)#_vXZ5krZ*?ChjR`02y4GYXf$2OX>r`|Sne
z*JVyen6&;#DF1nh`GWG2pSw!KL-{{nef-eG3&1hf7FK(d{P#Pi-vJx{u2g-2@PE4%
z@TFQ=z*e#sLl;>8`<-NFaVL60n}mzc|NHOzi?ctcZBz|sO|1|6?{^rV#mxQc)>#vw
z`>#f;grip5{PCK<E8CxAAtt&U0P)WF_{<~`k;q?NZF$`hAF{K~75xkpwjU_Hj%2Nr
zX#lf0xqi^96tS%k(@W7)oP7)l49)&8(Ri(NmRtQCnTQUB(>IsXuUp!G#cr1j+go9@
za&Z|4xf3M@B96Jz24e;}96Rr~!NvwvzEk`QX9zFmxhggS;&rU`S$x@CV8`_}MNWW(
z3(QhVNfdC1g`X*9DAT?1xB(!O446%y!!!<dko)h>rBx9C`aR*~qYWy)I1=)IUNDn^
zeveVdObYt1o<c%NApAAA+8aeR7IGHr+N2Htcw;xG1#f+cu%B-#zVI`?Ghe+*x7c?(
z{{G^^7c6_8LYL<t6L8F5mhi7nlRzD(5?(C`@Ryf6J&;b>_YQlv2aV3Ckv2B{Uff+!
zMxRvO^)jUn_=x-w%Jo)@DYVMCjJzBx;3e=>aoBoyU4)q>_P+@5Lz!Z<bMmw#mm{b1
zF7D~swU?PtwwrDh`;^9hGbxA(=~|Dy%m`}V9aqjJHH^uBcYa|zrMEpUbG>(vyFb5j
z030oII$JWfktt_09S<GP&e<tEwX~Y4$ea19W!GS!Q!C%}SGvevY^#hvFUc^Lgd@hT
zRZ;Ma_W?2$cc!rK*>)wJBC0rB{nmO_8vE)`_83D_Vj5dy{wumFht+YN>ovC7rhd5g
zi!3F5K5K+N74BVd6Q%Q)?KQ!R_x)j9^t=Z_uv+6`E&D~!NZmrKi@S-9Ij@NZlY<5?
zTLW$>toxhK8c{dVz`p*~LcZ~iowZlT*0x8lVi?sIv)iBzR{J}rGm!^hK7_L$`}L{z
z(8H3dt-h%49ZV!&NhWQBs?W1J<Kn(chsP=&JeZo5j^Jl0kua%UxPY8NRvyAedQefw
z=^o=NzUTjEcSENVCcgfXzcitjI7zO?;>Nz#K!k;V+J3LnGW)c9F|VR6p4#i<g=x^<
z(OCFc7vA-axe$pBt#RM&6axu|gP3XNH7s-Uu>LT!&yQ2z)l5zyQqVOwe@L$cZ~EuQ
zB8Bpn!xmnv@27=A_4>*x=EE_G>~Y{`(;mZvBm?)~UuQnDi`%)1MfEk&97$NssLVF0
zQ(PH}B|9Be8Y$|u=&(g{U1WS83eW#Et;-}N1II}%?5K5{Y4VOTf4CXZ;rlB^<uKmd
z%m9MIJPyRZCs!<NAEcUPyj^HZ)FSgi5D0GK(G*$L2P|AKS?!%x@SSstsj9wy|D`0y
zv(f8QnP^>EO2yTISngR(yYmKRcGJFRV724lm$rtjDnfeIA7`=nE&uvuVrvk$uQ!v>
ziV2^ym8GA*J*qCfZsh2!(jI8{QDf8N&g<LaC#o<(3HU+s82McT{{8<H`QbB5(K^S1
z5@jp9Luv;vSE$D5#A08`RoSNUE>mq~x68%phev1{B>K9l@_yl4^tx5G8a3LQ-N_ZX
za4CBUR-p0QjeQZ_{PJ_E@9bT}U5SviN?o%qVYxeEw(MmPUQ9>^=bnhT^_<r154md~
z8rGmCWRz8#kG}HB@pJKlUClD+zN-4Vb1twCDN41W`q)=}4AZ^s4q<p#ei`X9SoI!;
zI~KM!mSoqCL32n5zMrx>5<Y0fXziAE%UP#z*dhe5169ST31w6sx<6J=NO@P*Qd>#L
zu|z%fyfWN;CcIc@wX=Xcl1g|RtmJ~TEJ50EZ>H%F1Hr;*DgXU$KELI8>V@UQjse?(
zy~RB^((jPe;A76)cDilfu}SNx6QoI-i9rVbDVzOfKiA)puv*`BOWSWZt>-g4OUph%
zU({E`9#d2}sm1f17R8RR1#9>Lru`Y7U;B(Qv_WuoXBRQK-@!9c>8c6jzO_nv{j#?9
z@3ek~1d2;zJ{suA|BHdfjHinc_#NrEa$D~)5uMj{N0YgLVzcKkJOTu$Tyyi)+-`&W
z|6hCG85GslwW$b-C|QCaL2^d%0s@keC^?5F3QB5XlN&@7L2_0ksN_tO8fbz@k{}2|
z6T6Wd8fX#8X*PFiYQFFKzEksas^+yy0gFCo?|t@O>v^8F&RRXBQ<d6QW1l~BimZ##
z^80A?nm*CEwsdq9<8kybRW0c__f_qe`Wa4!^#}3@!al<>RTlWlOt7t5ieFERu~ow1
zPTFRRvD1vqxOo-rZM&B3Wj7x|8ZD{j$B-$X&E53sW5WK$RhV;c<EoWEJ_o1*qYkaq
zQ8W5=&Kuu$>ie3|jMr3T@<jDp#|0Z~7{xbabi&+j&2HLm;dlzvuK6<dRn*$A3iy^p
zZS47P-F5wJ_Sj^&-7I2#Z_4=b^3~Uv!{MXlkm0JK_7o)a?Uc_WVBnv}oozxjZsx*6
zwuMDqMmfhJXtu{2$%9P?r*zUEwG_i?E1buxl6(VWIIX*;y*#%(xhPbffz4hiiTl?8
zpCnX-P=p&mDTdOQFF1+Nr3q$r(ZN&bz0OGPhcnDn4KH_WC^^K~(pY?(vZudHZO+UM
zm%eSSlZW>}uQ|>RP`@3Im=^DLwd*tsO*Vp}q$LqHQ>Ij1ubT_>wRQ^ZviDdxm;&DX
zuFbv^0b%O$cOI>@*}N}&HDUR%rEsQHbHskv9d1=4`#Ym>N|8-`W%aO<n2Ir>Z;I*H
zJ>dr36*DK3S6LF}EM)wqI}F7@x--!Fj@FHg0Yzoe&Og4IS>4P_X%;>RG_Y;2tA6}C
zCA_S$cz(9^cfMV=*>CAbWmlN-(AonYADV6A-0X7^6sCO6UOhsIIsUf>nj!ay?4{gI
zg11T2`&&c7RyWXTou*G3mBO8GS|jt<t1FI#&+_%W#;Dqv^JsAQ|FU#!QR&e`b#kgg
zuaB!#skoS;&=}@>eXTSOJ#*u=L)$*^ElIviS<p!KOTrelP@8Tn3|W<{>zT80vA`d$
zYhp!fg}I};5#k+-5?!Ppvz=o#m8F-^Gz*CPvC-MmwOc@0I_Nme_s07K;^Kk4e9#lK
z)J&EvZL5LpGMsF3ucAen+tihLXQrp>t{=|93>xiMAGuYCg=xjUbH7S9vHAX<8kCT}
zDIa-0;~ka%<Babg()U1T_ou8*QmNdo`?43*ze>rHK2drVlR7+rK~tJrw36F%$=D*}
zGP3VN7)15b@9U=D>&8F&%OpF6b0UHpEQuYvrD@Z+l@s!Pvf-pJkF)Cv+BExyln^#|
zRT!JgG$rOVR9Mjb3i5P*;AGV4(_QSwEmCtBc4HD(FIlyqV%XbtAp7*d*9~Q^-7>n=
zsSn}K#l0riFMy^pzOMT%StTPjmr{mGWFs}(3@%HUQo;68HDovv4r~3*KG^@YvZL_+
z`o+BxnXukf*@V~!dWSP-)AP=&M;^m#qroaU=!um+VGVIC$AnKp=gx%bs%>UGb}5lJ
zb-dCRnQzj{lnH6QjzKlV%0r?;TTdTWkv-Jlcb(d7$*<HQ33eHqftmMhnjdI*x^%?<
z=F1Gvs1Q5K{y0i~R$?d{B!Idy6mC&VX|Rs?k<&ebe|}|T^~*I|tjQvOk^3^4=JC;;
zvi(g#yop$1<j&Mkp(}ZX-8b5kS5GImTlXYy#7LNKxrEVXzAgxhWHVNgSK#39ZY|?9
zp*u1|T57GcEm5jkogZpk?7utIdW0hU4K{AWk_5FaqU$!DP|}0Jklc{gS@Mm5LCEO_
zIMM2`-Wir{YR1%O`xty)l0|nyLVL5(!uZzVT4k`TWCP2{_VfmoMp`O7d9kn0l4ZCS
zP5Jc77&2VC)Pz5R)2e0N%;Uj4>4xOug!A+U$CZXDdG!Xv!sqxtWn)A~_)f{t-tp1&
zL9nG$2y-5jNJ(%G#>ljlC1%pwm+9@mw>raXKISY9pOm#eu*U12^A}_8f6ZTR5O@$C
z+i=g7s+K%N({**1e^(B4!7#vAnIIp34EN;*ZtkSf(HyYnE4djSNHE5Z_V$#wYxkvg
zYq+?K|8UcvSQdhuNOuL9?`F*e>+c>?l-15!rop$v_r?*@jbVlwH-1ZQ(}LKI|JiE@
zQ)g_DUJBB-9btwblNB_#9E(Y(GAFUPl55D@un`b|Dtr6O^ew95?QgZWIMYZuv%ejc
z=D&4R?D02yNHqkePoAujBUNXjW6x5rY#NIct2HSp9XHPj*An*3Jv?oMV&>fJ6g$mu
z*Z85^>M7PMW2@z|al?|=Aqa)TA1CTeVkwMcU&||a&+5&nSz1j>xMN-i@&^7sO;|@$
z@^4X5die6$+a#3al<_YGID1L0)A;f;o{pMXnIJa!^(7l0N2bJVbJvq8%EqE_yR{lM
zzj4O%<#x9T968U?CXDjmq0Lx12M+P7>b#TPy_sSdkCR&H7h72it04KR^|}=Ku;Z~}
znK6$OS?`S{{7DvS(ca5jgAl&^7B|(7DW*yu-)Z|bY)LL<F&XF<vhlS%E?ygRcB*o^
zUw5h~#Kg6@nBw!|n_vY_k`R*<$H>%0gVPyFBuMB?yta)ERF#5UBj*=9ydUP2ELk4w
zId!B)idxLGcJ}5^r<p&^*&<K5KHJFd0QWC|R?l2@Lfcg$C&=S0-_zJBD{-odHJmE8
zoNQm^6BxI|elUrX+b255dF<!TFyfjLs{}#E@<Kh=&%%ynLR(KB!d^;u)6na@W2ncU
zD;<Y>Q-+C_)zBY5D^nw4WnG`5VJPJ<`<)GX?o%o-Y-4S?-_MIZ${G@*JNTiB){ydp
z^4hM$oncF8a!lK1H(rHB-R;EO3}JEDcS0jO&w@`bZ`CrpW7W8NIpqFS^K{(K%ks`I
z>SIproY%b36T?5p_=EdzxJ~%3zz=<}h+cD~L5>?Eo3pM>PNYe9w%A|Jr(9zPCH#=M
z*3X<=0Y*noWw6fZv@`8@w8!rUD>e#1j8G5A->yDoq9kHvx=H0nh4!tt&BP92=RCyK
z_rz}9ccYI+2LDVWdw9P!Ge@w`!SOS@V)LxaJMWu6%%b}fY9}ixEgNrbI}6@AFnAf)
z6HffX2-w%j;;TZ@^o*)MU&{$nv&y_``5ZZT4wvsJ{tX=9&JSjTm1K^zS6f*$wwI6M
z-hXttKYlNS>tsZlN>@P5R*>5=V2I@uwoZRmsQFHP`Pky{>*(gO(9qUI!Kn{*Na3_8
z%G(3KC?P)A3qSoLoSJb{3N5RAv^z{H-Z152@g&VLdAhPil&op-($||F*{LDh`6$lE
z`Fq6H;6_5SEhd?9E%!|f%EqAF7PDqrF5i#InYY42AW}PCc)uObJ)N|9A$`2p=5Hz{
z{;t@cW+tWChFUD#ca4*PL{HXoqOTb^d%4Vqz176i;au!0W2n!Ki8)Nfu%<02M>AGC
z@|_=%&VPGE!k?j9KYr%yb6pMA>^liC=HK%x0NsKh1&`j&arFs-S}EQqLC{gq<Qwy)
zv;;QkSm0=rDm7iw-iJjLJ8$eJ+4ljBoKue7x2McZV=Ej|wzkE-*EhI#G-HBGB+rz>
z2WJUxibXYe<uVI3*pBIq+TyzpCn|L<1ws>7Y!$yjXM|iQ){zeytP4G-c1LXXg40&+
z`FB?jq|{YdE|6Y&H~JoN9;O1Jy+^$bB-eg5or?bwD^KL)upO+no8oe{e4rU4wdd6n
z(Ph{Y?pi|Eh-T3ksn=(zpixO7H)vyOZCsVB?Z@VW_Ev=;mvPf>XawHDz}weizL<P^
za^ws;U|yScD8HIm+S9MRSBdrWZM?!M3T(RHCiu&@D8$~Cs$sBsxntZj(&_u<j%B}E
z&7jx<y-krhuLh+>s9caxS|J)5FCi$-R`M;}q?SVt_9`&ufrI0B35m@I($hJ~q@G?U
zAD(EKLS{SBX6?*3^{OD<AH+tS!S9t3t6&0lAfG=}FqF7Wf30Q|^H3aQruT)|(@qMC
z?Y?J?-}>9V-2Hd%<t_otZt3mA|1P$iYcIp<K+ZBVq@#=aLvTR=+06Fg8<l^4D6|I1
zSqVm?ot<p|5Urlcl>+(3gc2yq|M(DV8KEjP*AV7`_(Ke$;0Lmq(F_0oDg*ufaaDex
z5;+<|ECl|kv=Mgz*(`!at>gI1nfl-ZK|#~Rh#Ze-4q}0`UCpn-I7?TEsO(jh<#=M&
zcmseU^!%wk+-+UhZb)1vaO9B-B7^5q0;TGqg|#SQRSCk)0e`qjyt<{5_hDIMIm>u!
zuMv*&-h=32Bu`t~vT$P>Zq2-3Gj4xP=M;qsz;Ikl@M!OS$~L2>RQ5`z&njlpK<W_M
zmp{N67we*=M{B0!VA7kOi+{Nq4q#8$GW|i0c=nhoY_0gHXN(q~)an>R8**4GwW|bj
z_DunK<V7k9*Gm1oUOR#BnA><GcOjGz_+08|Z}t_z7q?zYN>wSKck+}(@89Zmn*FH2
z1(wY6<qaA&w<e)72)`?a)-7r{&9p0^&tY{vTVn@zGzCA3@$TLjP9GgQ(tO7U|JedU
zGplD9Zc$f;=Bx(WP^KwyX8^^<hAS+EEFGxLjR!{8BH3$y%yW<Wu2_w1{q)K~cfT_|
z#h6bbTyq+41r}9#GWfkK%{q?clJ<WK<e`X3`}rn(?K*C(MkTwwW=2%F*m9Fu7xz~F
zLMR^?f624dX-EX3XJ>JRI5!5wT)Yh*29ZHh+~sf6vNlkfOArwaXK%*mYsW$K5eLYb
zU&x~#?c*{3?Of-IRFZ3Xo0z<9;pD@;J!)=zD|d{a%3*Lu$iUI58NzGib6LJjU|DJm
z^JepQIk!a-0H&oGTxB(Yc=-nQNKdu|d~lBONO5*Vq^0lCTjYP6(>_})!+Vds#Nw7X
z@BOveaxd?r0y4yH!@m~>+-m$Uu~P4zWaFBJD7qTxp>g`}-!zkFlg3SOKLs!w5-=O$
zv}w%$n$1fvo8(cYATs+}cir1GtPM6a_m-r5QC}W8Vl7s;2XkW>ON=01nA0RK^+3Gf
z&dMswJVNETLwO83oHVHVHT|H^HMsR8ZvetEwG=gofkh{f-T!1Y=!N(c?mr(z%?KZC
zwSvSp<oZG8W1B|<TB(1xnv)rx=WcBEmPxLl$)m=)wOlMkAm=fTM7priTzy%$o&K@2
z6>hP~z2Q*kOh%Vr(_#a(<zY6660{UJwCt*s9iGC=-X1oX?J`mYnt((4UH}{pQ%PPc
zm1s?W8uyFDxmqA;M)HUx<L$o1MGi4}R)uF|4@vFnQACTNCMc@AvN)1dYCuZ9Bqn<@
zqsAue_afctZz8%>4zXj-)9Rhoq1;GWX;<|HhJ4?vCb+Boq*LC;%Pdw+&W=4%oa~L6
zekn-n@5d|_m!dx>rrgz!l08a8=6=_0{bjigvW1GbSp2?Yv`-)&w(aJXxS}bwF&>nz
zmm9Qw8qaQgH|7EX4JB)rkrJ={6os9>@?a#TYBP8cJ20It8j9?`Pc}3FMn7$^;VH=A
z+o*Q?Yw+h-?qXj~T8d~|ws154aPZ65lvwG38PA7GXQZ!~m_zWAFb?O=yZ(vJ5+bsv
zVzoi5yP{Fjq39##<--dx*>Tmhw~Tz2+(yCwo+~(RGQWx{S+zA<gAGkd(+f}Jg9)6D
zCOJFb)iC=7ZFw&!F&V3%5i8*nb~W}zIEvKg=b|+j@IXm8lB)}S=YD}A>mvZk?u?#y
zX9&W^ydWY@=9Tko_lu42de>HvZ6MTOGv($jC>nftuPAgzE$>&I#)-ul!$hn^Jrk!T
zOaQVb>^dWaoiuALwVan8aT%0{8YI6VG<eYuxJFZhr0GO)tIAP?C}$FY0Xl6<FiTt<
z2oE&Ty!4-HvYQHS+u-30OnyYv?KWPl(W8)0A@|-EIF8p6c~ALmU0(l?5G1gn008I%
zt$Oq2Ij^F>O66F~fl03yFf`d8m^#&xSNSL=Zq2efvD_;&zR*+dH6(f}7I;ub-i!UP
zTZRc7w_9vkO_{e?E-g2C^&VF3H3x0ozt<`x$>cA;bqE;#NE;H>VQ|y_Bm)7xk&=R?
z)oi@ZW2a$s`<dJ@2tummaCULUq+Vjzkpk&MY9&wRyoR4jxQCN;xXovOa1(QMa~@&?
zIY33)Rz<$qud^Mh8AWynefOu#ujIaB(cOYiRGC;OFrGZ#$xXlp#eFXqwq%w-*mvFn
z8fjj-&rC%>Eo;KjxOW?xG<{uJ4KX|plbdunOKNh=0&8&@z-Q5j=A)tr1nONS;oo@`
z+4_gNtTF}3*-5W-fU9KV%e!v}SMH2+IHR}l{(yU8mleX;_Z6vbYDzPcEoivVWt+V{
z08c+$7tK-zIcr6j@|M4B{ZoT>n~*$Jqsr@Ii0xF0gWcGi=3p^kQH#zx<LYH(3v*1k
z0p)E3V#aF6mX=&}Ou=UnLW&3~Rv8tKk&)|L4f(ke+KpG;&DCMfF5P6S&Au2NX41U*
zIbYSAFp|A63%BcguR<_&nog4j0iR+<Y}>o__yj6piw`spx<|`JyXdeo8|2a|f#iV?
zHcwwlk<@^E2au6zyg}{*|Cp@5&Jz7FC(Eb36<ZGcm4UD?`9=r^=d>9fm&f`{jueFY
zoav`!neK{lu4mi`{SC%dklPY5$ZNBl>&PbOnbSyPuh?7;;m_Qq4#`cfibcAA@E*$?
zesT5G=n9hnCg9yjzNTp)LP!A^ILOZ}>hztiOuqh^C1n}MQkm8!<IM2uG2?sz!{Gs6
zXX@3@<FM=Dnjek3`dv+BRFbkA^l(pNchNP|N8Lh>zuqv^tWZ>QRui-8u!HT(U_+**
z5KYSA_wYifCsD-UA{D3Sr8zG)Q3<1b02B(LUt3O6LtcD|Rhnno22qvP6T{xVB(s|2
z@<ubsp`96P#Wp=Yj(BA7ORfhZ2Dm41oa@Z)r|iPP247T%Vz<a%+*rcXCMH_=VQGG=
zUlPf+I{pGv#vPeuZJ?Tp-4Fl?IC>m%(Pel2Y+Yh>z1Rpv6rb%|*zKqWmEU#JN?rmu
zs#rg?FI8<On0;rBb4Pf-F7e43uQURedR618MMxMfv&sWDV1e7k)CDATVFT0a$}@1B
z%)2f7y0y$Er>=5SM#3X&Jvk$}+gN+7k%(`{NxA5k><7jgPk%E7WUq-j8R0Vg+lgPO
zKchFd8<ht`angDr=K!)ioJ*M_2St^?MvBOR-3#N}!uT!()Fx{Q=dcfHWA5b0+s=2L
zB?SjKzy?hC_i7#d=2?zkrj1u}bpBwR(bf8HL{nw*t24@y2_{+mcz<l1$vFC^ZpQeE
z@9fOTK{s6|^z#EybxHx;mwoO0Ni1;?KTf@{)u6nVv~R)QpTQ9ZfS*CFi0@`Hmg)9r
z1%Aeo3V;9L0K{z$oQ0aBJCq;fkO-I`A8b$7xF>2ntwH~Ir_56ji7G1~$SZU&pba^F
zzJ_*wzbh-ry~+HN$Ul8+jim_&vQ%GoC|hvSvc(es9~Se0+RiyTo%(kCO<-RcN>t{&
zz=QaBowD%VZG6tsOxM$~*#*FNhq_aPqatB7HQtdkLn{)kg$K_4@%M!GzqIBRY!A^H
z-$&TE!FM_m0685YB@a|CLcS@zb=CqxGI&WY*N^4hkgVIbO{9ciF~LHosQ$}^R(h7o
zXjt|I3AysYXn<!vHJLZ*#I?>PA^1T@wh(T@%-7<t(o#VuOd}8t9s8=@Vvn-6(alvV
zZ}YSuOlQ-QZqzu|5%q$c;Kk;yz{xvLGLU}DyP6Cz#{$21N`lpBCTT5Sc)kvW9BTvE
z{0oPhl9G4pl;22PC3wpz7gU~b$RRUEe2=^5&P6k*L7swfS>owl?MO4KX0RoX#ilnI
z)OW~NAOMLndq<*0-_V?lu2#rLB_aDJ=V`xjmRC|rBi>Hl8PkS#2sqG!t<8JBwc-DA
zYs(YVO!Jh9oXwt9giMd3spcnV)B|@SQH_4HN!oPUBD$iXsVA}Dwg%k_e-ET?Y;J@{
zx=z+cFC<B=g(8D=R_&@U5fF2L#e8?6V~yoGy*XSl_L*GQRS-wk&Ffg!F}OsR;BbC?
z^lUJVYP;`W7`^v+)%=*81njvF=fZEwzmd9Ad<dpf6MYQ3h}i$&D7nC)sgmaZ$x$}W
zISM4jqwP<Q0=zW4UYO&b97XD!qZr#;-1&o}D4lbZ5o?D37T^Axa-C;eO&Z>$i~fUh
zC4jlIiM~<(2Yb2@T0;#e*Z&{Wk@H!26r&|<DR6RJ${@|j&9CylQiqQ((P#5FGhiv-
zjm4Qv2DD1GQl5=!_mOSr!2$*teSi(vcZ#5AY?0)he;t9a|4~^S04%}U0jHhYoR^3}
zlFy@aB=P<96R#Td`b`m~9OW3j**aG~@3!18!8-bSLKLRt<_u!L%M0YOPNSfE`u-$a
zK@^>cZVrLb70`s;uv)C_@eC&k0Ro&klLYsEYP)t_@<j;I-MR<2(Z(RqV!_nBu%Qv8
zmF3D72Qtoc9UPcTx)K@N;%oz@K~r3B&=1-9<Zx%Q%0BAX_O_aP?*6vDT7G7EXSlS$
zr>Q2IBT83TYD&tYv`bc^<i(PxjEq~0OH0N6+e^Syz&w8O?#JeBuHE<IBYJ*_G`4l&
zb7nhjh(v5bKG!AFGuAQmMB{GDyg~tUKU<#tV=ZDZl(547>WGPFyU^!g`YCoPsj0w`
z663+64-PoCd--D8MW&&wl~c|?EBT4n`Yi?9Y$}pRt`de~TBO?ccnWhyAT&?NqU3&>
z63Z}lj8~3CH`P1!%(J#kmZZENSWlsUcX-IwmES@pvP1G5FS#WVB<iyDX8%@8Kx8x9
zoL4FF_7<TNM}rMZ7i-~_5EpT8r1Xs&H~hvP_o+fVO0v?;9Zscu)_I@}HuBilCkWS_
zCl$NoRhs&G+}5YNaL19tgmtOMW%pfV_Jeh0i%Q-ZWKFC|?_USN-;_Ep`yIV`qvjhl
zg<ZP=<695Gj^&?Y>{}AYu*b%#L0WME>N|#LRZsSPxyd-Yqq`2Wy$s^e^=98A#pz&!
zq-oU+mC#RFW@oRnFa#)g+;hB6{14VG1KqZMjL^OdxTa}$$GAIMW8IKw2570J*SAM0
ziDitzcH-f3vzqhz<wEVh@FP$OJY=UOrV`~u#$B75vje1(GN_^ZnwpVzp{$iOk+JB^
z+SHmnUBkR2=1VH0yuRa6woQqK`WoT$Ps|5b65$2wD4LooHO#2Tk^xB8(dV?<+NQGO
zho<xjg5=nd!G165>~^b)^SLEJZf{9Quknr=o7(k|9@)6aeiE2+;8b-PFXP?KnIyQz
zSxX@A$?jKMay7m2N<~q#!T>Z>dS7V5_X`!{Ze|reRS>z8JjqRQ&;YUlkf&ke3{8_M
zml2|Vr1;UXSpKMEAxb-El}zp<h;nc~$lFJkA8v)!-)fpncn<zGI$2qNOC0(IPqigM
z`SGoPG=mzHK9lIOvn0!-saL~hE=rdv-8i**sVf^RIrM*4UPH)HJt!sPEutenO{m2u
zaf0~F_Yv(i99N35T(o9haOZ$#blDsIVkFo>jh996$LEve{jU#j(GdOAvmIM_qC*DN
zy_z06;b{haqu>-Z8bjpEFApCV;irA{+#XAirN>5L%F^b`OZ<CRy{MT${^UybHH`pD
zyY?Ob%EsGW;U!wO72))Z{GWr^0HcW1d9-;K%x_1tV<-~*)PHPuenNP{SZsE$AWyXK
z$)?^ryVNE1DC-%~!H<NdYWD?iD%P%ieOHe%8#>}S=bu+#;}vPRYe+ymr+3;%^U}aK
z&1)rG6?Mu<jBSk9Wa8T%RQ&BgA_4c~8c1Y6Pwk4l0aA$dlI+gEA0nu+i`%s(`_nNz
z0*)cwkUs5CEp7-nhL6&36#sAxf1Nu92b!+MKef0T;OCx_Fa0ycN{;pGxt}wA>Cy0~
zpA!UrZZb^bPd|6>+|L==SP1>;=dJ-i_y5IoSf##}r}dS+kP8lO@mU}Lu)glHxUgVg
zt1b4ou*#ziKw%aI4gibLk@Kwf`2kZ()mo4{s|UHH_TUV>7F{ttZE@ZlUG0H}6Zr`!
zD_#si`2>LG@Opu_Gng(CI%Ue`ca8611GSB<;I(A3$L=6+RO7r}Ikb7l2)rI&j2^w~
zMh|3LeOBsuFI`<wJMYvV;y3$pGUV)Jv_P9hSmtb1J7oB92lu`DW4#k0l>m66ZSN7U
zs##`0xqr0$6{Z`g*9T4{Ch;Nbi0&HD_rw+3)R3-X=SxVTssjAzoYU92m5SgU{!CE#
zJzyb0<@czumE-RE#33jX;XMZNEqXGT6|mW`Oy#Q(3sN0#3WApnEOyL14?3u|M~LVw
zess=m&38M4;^CG-=Y2><%FM;#`?IeirEaSzuxb*iLOB}d>fC0^M@x*IfooHf`M&vz
z<&O2UScxV29o(4PUW#K&M>-=Jlcd3XSB%EC@8?DKXL2wg4~z~)VfThLghyLXcZ7kL
zRsqAwlsP;0(#(_`HFpTP?>1epn`Yq|*>bWwwZioF{JhM8M=}52z+=eJ7?Kw!bEpb#
zv6uK2bYsbE0c{FSc9E@ypp8Y1P5b4{z|97;N^5zV!B0@)$Ke;?(8tQsU2r<&v}e?M
z0<Zl@p>Fyeo9BDop!q;+S*`P!L$CQf9PncXlE-?{wMtw36-{%apL615ib0!=kM+uj
zyBj6?+R}zco2Uv}Vaw8)v(r5+%V;O#olt=NUOZw=n%fqi3CbC>4E#!6*BYXIfA)o~
zD+>Noy!9aFGTE(_6O}a$;b!ojKud$4G&S*v7{K%|YP!nopW8Rhr#yd7fX6<W;!3aw
zrQfUUjOz@RQ{$_!Pu5}Mf;?kZA<d^CB%U}P(P_hck&us5$7sDM<}Mm4jxaw7_`UL>
zPYC5A-bX8Fw&L4*l!-h(!nYdR*vvwwD$mV~>I-1m%!bTdOeB7PQ_P`aY%>DOQ$QxQ
ze#Z?xcaJD^@_cKl)w+EREX$5*Jzf;X-UQcL_&QY;h;K&jmp;kx2uB8ib|ToSf~<gE
z9P5|#Oro<W2mXA`clmw7tri<o$i4L3TF38Qfwi(qZcSUtu9(K!{WondU>BY&qrjqt
zJ;at^^j-T@VsEW->Ik$4{%pHaVp{Fs1mFS4E|T7x)?Z?&IIje+C3El?v<WiOO#dt4
zWH9SQ-ldFxF6DL0px@EuzS2v1;9|mSL*S8FivsHmvTGv{Ln#mpz4Mu}DPlkohmJWb
zkav%oPZcz;ai9IXa_w*XJPrs|>?x0dxQuqj{UNCz7q8r+fd7b)IKo)J?C>HaNdoV)
z#4n=7J>p_F5fy>`z9B31XDc5%sNKJIyw1>_?yd+q!OcOY;r&=KAzqz@fP%P5Y^~1E
zI<Gxxx`y|LO#>&hrX!%e0b3yHfX#AWUmw3rxn_pffCTo?rak<jAj<>4O62rTy~V<5
zyr7ISI1W(-8})S|E*@&hY^c*8O`F(twzO%Utij%1zJRSxIy+zf;x9+jt_36l_dy$l
zCa)QfZYG-XbWMVbN1Kh38F=PhNFBZgj@*#aQp*GsYX6zhj}&^w7y}G_;Lt-l8YaIV
zp%2S1eOkGs(l5Q$L2cPoW?X@Vo?(?-x^jU>%NL60TJ7&xRu8I<92_zkjTzjOy&p9l
z@|+fwH5WMigj%v_4aVXL_9o(v#Qml-owgL_yb?b?W?)YJWEm;<L(+!dGqaoq<K9k+
zWeLla^4(o>ooMpkejRs%@WI&acx0xtwC;ik_93~<zV#5$1)>Drerxjga>IbOJ&=U}
z1tHINY5ybhJ6?M-fjEKp4Otsean96>oZ=Sz@oZyyIV+&8jj(6)9l8b=w;|Bd!_@n5
zwSbQ0JLVM=tpA7PM!H3}7nmTs+sSl;SCj?$05&+)+hY%x=V^Q40!@aRlDq+-@`f??
zD0;G<>K=zUUnmWNH&i;A@(n%X)=fdTM0#`Z8e0|fG#qOgJp+2Qu+iOPQUOuqUV8J)
zC^nhygN(Ggh<yvMEq~As>7mLkj~8_BfJw)*jcs{3j(+|HIVUDBKKD<ieX@L6X&rfK
z?=BB$b!0r<9L92zN>O<xTKz)#F@lz<uC;dB9d52v&X=`0j>tiN1?3Q|n)<nVHK5}j
z7>W?xqw#)FxhATry1Ln}amwlSn0D5K33KR}Bskr3iwV@5=o@ab;-@|fb91STdCh}`
zG5mKGJ#(l`DBn?yE8fhAL{MfAz1E_#gBSfe6)Y-z;yBe{&ec2)J%iTAZm3U1FQU$0
z42edJHt@`O(X?PXp0EfGZTT<tzkRs7SAh~meKFH9w>X^`3^rQBzu+@~DB~4^%C&f|
zpS5hcm6D#po-EKzQUoF3s(-nR>xc#KTq1w)etLSk9*+X$F0h`A`4u9aL|yev)TXiw
zDon6s-@+d<F7NzTKD?^2>n59M4nwP`G})d+)^|jdD&`DJioV{HnP-Z!2gpT^jcBfu
z+16#^^QN$c+qgXH6}I>7ldPiZK^fF~rpY;mGSCCG%;OI3$>0fhj`(^>=Tm+$yczua
zvU2Cm2@ypidJ-`X8s~ON(}D~{Cl$qRgdo3hc{1$I@3@d-MbIKF5(4?)Ki2qhWWx*z
z-6)H?b=7`2R}CvNrl%y<;^UYXxVM=u>YQp(LeZI(Q?q9}i7<%;x8+cHUo);4EeILI
zKnpd58kDkwenldOzI{+^9x7I!2@^dZ{UQ)5Q|@^Qk~is6mu-K$dCy^G_7w}7(XHt=
zqv|KjVfm9R|9sCYkDUoc&fhpt<-2ET%l>P)D~_dNw*3_@<m@PWdgHcPwS(rClQXZ4
zARnK(Bxs*g1eumj#_rN8ZRlDe2p_rd;^AG;L7>sG#uJDBvL(M;r!;YDT}L0%Cy4Z8
z_n3$<*_C}>{(UGh=8Xcd`cxc1LAsTT8!?4iUv%SUlpccp;1lNwW%gg7#LG%oG%PWY
z(%*USdBq-#OX=@%4a4p;?5PVlZuME6>Q%I!HX^YxPd|dk+p{w}Tl?ERGkUGKq&yKv
z{MY8Gy`cBrYD%r7Q>JJjKM&8()Cs>yd7YR^9*3^*)EGgqMaMV(4meZs>X@zH?o#GQ
z$)qi8V$i^uZIs=|l~y-Ru}Q(!ZZzNaI$y-+0Md`XYQmMHeuEl1{>Z%gwktqx%uZ&{
zEduB_-oV1~#Bn-k`>UmG`{p%dbgAgIfu5eCt&%z=RXny?)aEncIUH-;Y{V6NlY-mx
zF-2zrz#K|Kl1v*If#7_0jZP?;vmYRyGSG|8U-O-|%)vq;>ECOW$O?k`Ot7uFPRZSI
zyJk|E!xeVe>M-YK7y9$S>Gsgv9;j6^4{cW<bPBt`^g6sL6<`hso<z9mrve1j$?>3O
zOh6WLY;O>Ly&Tpet}c~^*#kXp06AH*nB)>TkGm^?MERYPHt0i-_Gf`1^mr7$#WGuq
zLrh_da*>Uqu$G|XNfG>Lx9T)*I=%uMawm1yrqU=vBUNC_Xb<!$tNCgYD+k&nP8WXQ
zBMZ1!-oNU?()Y=XS5OBIy0fK=xuzpE4F_g7Oxn1eMQs-s)2wzS1zsbkfdA%p?sDmQ
z^VmweX}kJ50Hy_UhI6EM>lUhQGxg2;9xw_zFMLTLkxUVqc+Lyh!h0uR)QTG>EKN;K
zJ6kWF0#KC_@Se^pNEaZJZ8VGfr;B;eZW2Nf2-q`;&i7Y*Z6~ViEthIVE@GiXR}&e~
zRJQHmq`JmO09xS+WKSGDsBVM&?XIhDB!Clw0T{|BD?7u;hn5s4ecfE)G2i9S#4u5%
zNG?5sis3XF<Uilx*8zOBH7IM>;Rkl#|K*{9dcUaz&<Y<{0Vs2~HgGx9H{i-4fZa{>
zu8M7!?Ux5Kv4Ss*Ej(T#SL)|uzkdq76gJw#rh@|niKC%64Eq|i=KIJj9Z(3xG&X`x
zar{9>D4Nb*P?bg)=e`QM!3}{%eUF>jvpu?QV&OyR*}@d=?4V)urLP>TJp(q#0s)iC
zPa4+a5&S-Jb(m~vzn|8^p>}<ipr08B<l?Jy+Mo_BEGMmD=46dy(%8bpZYw~p_K`bP
z%wh_3Ah19ISjE0YmQE*nN-(SqAIhEO_xb4ERQO%lwZfdHDI=FJQ0H13IMf8^bW|oU
z|Gtjo=c=~nS99wSz$!+X1ARv;tXhYkpmrynVW8PliG5#cm8_gf3`3r9%f3QtQSFB;
z<v8XtTt%xTdiJTMWtO;sOP;n~Bop|6a14`_QR?lwr9L#<Vv@E@0_l-c$WJmxv3?X+
z0J?&Z?;YDSxfBpHGtB454%s&DG|0=J+Ynwr2r`!L4L)FZM!04kP2n%11TkiwrDnCg
z&aS?L(tBSHJE~`{-5_+Tmh0>$fFa=$h02>lBhoJJi@t!ljKO-7d7+7kE*CW{OwJfn
zl`3tl?UJt0u5f8*t&^TSZtp}JAZ#v`C9ylJ*<QV+QKj=)_#0?32uJ`d6gB)EV^<sK
z;!W<nT>o8~rwD|M^Sdxo{CCA(?%(utvUTY)1TTKRG*l0Y?LxL5lzl%1IJ5+uaBPT}
zvRj*lDcH$X-oqhGt{UHS$;w7w!EbI*R*X~Vpq0w<qcE|y=jt<S`~c7az00M`0c#U6
zeYSrKg4z{7doLgMkxt~*1%j*#$h@EO|IC5`Ixfh3NI;Kvcm5$uU+5u~x%3Aik4FRI
zKiqg&@An5KXKiSE0#^GU^nTb3956Fu7P`1U#AOcg{p<gHH}097888p3xV8DYKlJct
z3zxJ1xIW0W4_#lgILVOk8fgtVTf6lfPASJqM6ip=dtn7Y(5d7c31tNLc>pufv@c@t
ztzmZ4yGigq`oMZ)xt8brc2iSRPJvdYQ<?0#E_Zd2pD*#_7jg-KbT3iN*-tv8dZ)$Z
zqrXl!E@s-719a+z6Z@4(navI~aaL(y>{oTQM;m*4IXJ4O-!e3++}nG)%Dge1Qych~
zZ=A*67mv8!TZ67zAOVHpM&<h~JhS<<U2|ZmXWZf@qDxw=zFTZ>e3dyFuyecr$<C9(
zJ#-UirO1GO3HwZQTjwUov$3xk_M0lMnI=9R^L-UKv&-X7e$@8(<w7|A(>(&48^k~~
zd=iFt`w1tT-$A3DN!B<DflTOn!tt}^yC#TbKt+uXaGWy6*l1}A;xJ%(uGi{Z8q;~Z
z5Dw>g{+{_5;d=Jn+-(dtDU!&Dy5lF$`hsAWsSdwtv{#8Q=j3#i*|(eu06Jj=qNsYg
zAHWH4P2x^>=CJ)+0|B$e6orK#!1htku?1oeq(`HTFq^Y?Pi0@PcMm!2RocqNubXg)
zKMH#;cc17v7$t_3>eg;%9V~g|8X-(6S`!X&Cn7d5V%<LlVk3VV^VF3cpjyWr&hb%3
z+#qgg?O0Y#xt>w}$g^GMhL3Cur$`-2C4Rh;)$PPJe%hV*2BCrg1>x=&+5UBu%Z%d@
z^#69KNsQ<qOfa!2x=$Tk&Rn1VZYTTf)zEJwCf4LF?Yo%Tj&OL|hLmfct28H!AYn%_
zS`#Ae?m$?~M{&T!)n#OOH{<9bT5wAu1J+@9TPIa+!S~I+MXEu1MF|)yw)o^i71z3v
z%cRULpJ?m*CgN&f7f@Uq-xJ$sHrH!s21{W7_Y&9{+kWpD9~rx9{7lZCa6OPQC@vC_
zl5p*;PQH0Eg$hWnRu;F<W&UN8gzFg<>MS?q+1f2#p9)BYYnm7JZXHYeD_;nGe9Z(s
z`I^Z0U7fk^b2+z)WOuNqKt#ZTh#&>LYj0#Ea8k!DQ(%?p(MkLUJg(Frvf`xZN_d!x
z;Ov|$Tl`%D+J!KcuBM{vf$Z`@Wm*{ncc`#r1di+!9b-Sp*J>Kvg=!SQ7)b6SC<#6Q
z_3%}dSv}7`Py0U(|9^h`(_Z~yga7j@|5=j%oTERH!+(s&e=Ng)EW>{+!~dI>fi;Qf
aRF+60qV`#@@0$zYkJ>{mrE&%9=l=tnkqu@5

literal 0
HcmV?d00001


From 20b851c10f91e884b5174d7c6b2870e615a6cbbf Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Tue, 17 Jan 2017 13:40:58 -0800
Subject: [PATCH 58/88] fix according to comments

---
 doc/howto/usage/k8s/k8s_aws_en.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index 10f5a2ef2f..c144bab649 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -392,14 +392,14 @@ Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each no
 
 Distributed training job is represented by a [kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
 
-Kubernetes job is described by a job config file. The file contains lots of configuration information. For example, PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc. These information are passed into container for `pserver` and `trainer` to use as environment variables.
+Each Kuberentes job is described by a job config file, which specifies the information like the number of pods in the job and environment variables.
 
-In one distributed training job, we will:
+In a distributed training job, we would:
 
-1. Upload the pre-divided training data and configuration file onto EFS volume.
-1. Create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
+1. upload the partitioned training data and configuration file onto EFS volume, and
+1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
 
-#### Parameter Server and Trainer
+#### Parameter Servers and Trainers
 
 There are two roles in a PaddlePaddle cluster: `parameter server` and `trainer`. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
 
@@ -411,13 +411,13 @@ Parameter server and trainer are packaged into a same docker image. They will ru
 
 #### Trainer ID
 
-Trainer id is the index of trainer within all trainers of a job. Trainer needs this information to do things like reading the correct shared of data.
+Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
 
 #### Training
 
-After container gets started, it starts up the distributed training by using scripts. Each node will use job pod's name to query Kubernetes apiserver for information of all pods in current job.
+The entry-point of a container is a Python script. As it runs in a pod, it can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
 
-From pods information, script knows static ip addresses of pservers. And assign trainer it's own `trainer_id`. The workflow of the script is as follows:
+We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
 
 1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
 1. Copy the training data from EFS sharing volume into container.
@@ -550,7 +550,7 @@ efs
     └── recommendation
 ```
 
-The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
+The `paddle-cluster-job` directory is named after this training job. The job includes 3 PaddlePaddle nodes. We store the partitioned data under the `paddle-cluster-job/data` directory; the directories 0, 1, and 2 each correspond to one node's trainer_id. The training data is in the recommendation directory, and the training results and logs will be in the output directory.
 
 
 #### Create Kubernetes Job

From f4ff8d26d6cb50b6749264f8db460732fda1e37b Mon Sep 17 00:00:00 2001
From: Helin Wang <helinwang@baidu.com>
Date: Tue, 17 Jan 2017 13:41:45 -0800
Subject: [PATCH 59/88] fix according to comments

---
 doc/howto/usage/k8s/k8s_aws_en.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index c144bab649..a6422b9be0 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -390,7 +390,7 @@ Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each no
 
 #### Distributed Training Job
 
-Distributed training job is represented by a [kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
+A distributed training job is represented by a [kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
 
 Each Kuberentes job is described by a job config file, which specifies the information like the number of pods in the job and environment variables.
 

From b89f2f2f0c97e3f5328bb17061c4c551b8af91cd Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Wed, 18 Jan 2017 09:35:01 +0800
Subject: [PATCH 60/88] Update build doc and dockerfile

---
 cmake/external/swig.cmake                     | 23 ++++---------------
 .../build_and_install/build_from_source_en.md |  4 ++--
 paddle/scripts/docker/Dockerfile              |  4 ++--
 paddle/scripts/docker/Dockerfile.gpu          |  4 ++--
 4 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index 63e8bd2546..744c766ee7 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -38,14 +38,6 @@ IF(NOT SWIG_FOUND)
         SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
         SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
     ELSE(WIN32)
-        # From PCRE configure
-        ExternalProject_Add(pcre
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY https://github.com/svn2github/pcre.git
-            PREFIX ${SWIG_SOURCES_DIR}/pcre
-            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
-        )
-
         # swig uses bison find it by cmake and pass it down
         FIND_PACKAGE(BISON)
 
@@ -54,16 +46,11 @@ IF(NOT SWIG_FOUND)
             GIT_REPOSITORY      https://github.com/swig/swig.git
             GIT_TAG             rel-3.0.10
             PREFIX              ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig &&
-            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
-            ./configure
-                --prefix=${SWIG_INSTALL_DIR}
-                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
-            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
-            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
-            UPDATE_COMMAND  ""
-            DEPENDS pcre
+            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
+                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
+            BUILD_COMMAND       cd <SOURCE_DIR> && make
+            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
+            UPDATE_COMMAND      ""
         )
 
         SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 27b478a0fd..7e0942b211 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -101,8 +101,8 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential autotools-dev
-    sudo apt-get install -y python python-pip python-numpy libpython-dev automake
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
+    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
     sudo pip install 'protobuf==3.1.0.post1'
 
     # install cmake 3.4
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index d46dd48f74..79c4efbed0 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -6,11 +6,11 @@ ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 
 RUN apt-get update && \
-    apt-get install -y git python-pip python-dev openssh-server && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
     apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y autotools-dev automake && \
+    apt-get install -y automake clang-3.8 llvm-3.8 libclang-3.8-dev && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 58070b2ad9..6c1c2225d1 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -6,11 +6,11 @@ ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 
 RUN apt-get update && \
-    apt-get install -y git python-pip python-dev openssh-server && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
     apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
     apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
     apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y autotools-dev automake && \
+    apt-get install -y automake clang-3.8 llvm-3.8 libclang-3.8-dev && \
     apt-get clean -y
 
 RUN pip install --upgrade pip && \ 

From 9bc12034002d0c0ca5f30bd1f11b30188978e327 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 18 Jan 2017 12:45:00 +0800
Subject: [PATCH 61/88] Add more comments, also add __must_check.

---
 .../activations/ActivationFunction.cpp        | 55 +++++++++---------
 .../gserver/activations/ActivationFunction.h  |  4 +-
 paddle/gserver/layers/Layer.cpp               |  4 +-
 paddle/gserver/layers/MDLstmLayer.cpp         | 25 +++++----
 paddle/gserver/layers/NCELayer.cpp            |  6 +-
 paddle/gserver/layers/RecurrentLayer.cpp      | 21 +++----
 .../layers/SelectiveFullyConnectedLayer.cpp   |  3 +-
 paddle/gserver/tests/test_WarpCTCLayer.cpp    |  4 +-
 paddle/utils/Compiler.h                       | 25 +++++++++
 paddle/utils/Status.h                         | 56 +++++++++++++++++++
 10 files changed, 147 insertions(+), 56 deletions(-)

diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 8a938cf7e9..666c2e01c8 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -69,11 +69,11 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
 class IdentityActivation : public ActivationFunction {
 public:
   static const std::string name;
-  Status forward(Argument& act) {
+  Status __must_check forward(Argument& act) {
     (void)act;
     return Status();
   }
-  Status backward(Argument& act) {
+  Status __must_check backward(Argument& act) {
     (void)act;
     return Status();
   }
@@ -92,11 +92,11 @@ static InitFunction __reg_activation__identity([] {
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(sigmoid)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->sigmoid(*act.value);
   return Status();
 }
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->sigmoidDerivative(*act.value);
   return Status();
 }
@@ -115,12 +115,12 @@ MatrixPtr sftMaxDot_;
 MatrixPtr one_;
 
 public:
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->softmax(*act.value);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   MatrixPtr outputV = act.value;
   MatrixPtr outputG = act.grad;
 
@@ -167,7 +167,7 @@ ACTIVATION_CLASS_NAME(softmax) softmax_;
 Argument argument_;
 
 public:
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   if (act.value->getWidth() != 1UL) {
     return Status(
         "Input width for each timestep of sequence softmax should be 1");
@@ -191,7 +191,7 @@ Status forward(Argument& act) {
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   if (act.value->getWidth() != 1UL) {
     return Status(
         "Input width for each timestep of sequence softmax should be 1");
@@ -207,7 +207,8 @@ Status backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    softmax_.backward(argument_);
+    Status status = softmax_.backward(argument_);
+    if (!status.isOK()) return status;
   }
   return Status();
 }
@@ -224,12 +225,12 @@ END_DEFINE_ACTIVATION(sequence_softmax)
  *    0 otherwise.
  */
 BEGIN_DEFINE_ACTIVATION(relu)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->relu(*act.value);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->reluDerivative(*act.value);
   return Status();
 }
@@ -249,12 +250,12 @@ END_DEFINE_ACTIVATION(relu)
  * TODO(yuyang18): Remove magic number 24 or make it configuable.
  */
 BEGIN_DEFINE_ACTIVATION(brelu)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->brelu(*act.value);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->breluDerivative(*act.value);
   return Status();
 }
@@ -267,12 +268,12 @@ END_DEFINE_ACTIVATION(brelu)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(tanh)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->tanh(*act.value);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->tanhDerivative(*act.value);
   return Status();
 }
@@ -290,12 +291,12 @@ real a, b;
 
 public:
 ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->scaledTanh(*act.value, a, b);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->scaledTanhDerivative(*act.value, a, b);
   return Status();
 }
@@ -308,12 +309,12 @@ END_DEFINE_ACTIVATION(stanh)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(softrelu)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->softrelu(*act.value);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->softreluDerivative(*act.value);
   return Status();
 }
@@ -332,7 +333,7 @@ END_DEFINE_ACTIVATION(softrelu)
  *     0   if z=0
  */
 BEGIN_DEFINE_ACTIVATION(abs)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -345,7 +346,7 @@ Status forward(Argument& act) {
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->absDerivative(*act.in);
   return Status();
 }
@@ -358,7 +359,7 @@ END_DEFINE_ACTIVATION(abs)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(square)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -371,7 +372,7 @@ Status forward(Argument& act) {
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->squareDerivative(*act.in);
   return Status();
 }
@@ -384,12 +385,12 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   act.value->exp2(*act.value);
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->expDerivative(*act.value);
   return Status();
 }
@@ -402,7 +403,7 @@ END_DEFINE_ACTIVATION(exponential)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(log)
-Status forward(Argument& act) {
+Status __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -415,7 +416,7 @@ Status forward(Argument& act) {
   return Status();
 }
 
-Status backward(Argument& act) {
+Status __must_check backward(Argument& act) {
   act.grad->dotDiv(*act.grad, *act.in);
   return Status();
 }
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index ad395ac28d..737df2219d 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -49,7 +49,7 @@ public:
    *
    * Usually, act is Layer::output_
    */
-  virtual Status forward(Argument& act) = 0;
+  virtual Status __must_check forward(Argument& act) = 0;
 
   /**
    * @brief Backward propagaion
@@ -58,7 +58,7 @@ public:
    * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
    * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
    */
-  virtual Status backward(Argument& act) = 0;
+  virtual Status __must_check backward(Argument& act) = 0;
 
   virtual const std::string& getName() const = 0;
 };
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 06c936c3ae..f96070fe6e 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -336,7 +336,7 @@ void Layer::showOutputStats() {
 void Layer::forwardActivation() {
   /* activation */
   auto status = activation_->forward(output_);
-  CHECK(status.isOK()) << status.what();
+  status.check();
 
   /* dropout */
   if (config_.drop_rate() > 0) {
@@ -375,7 +375,7 @@ void Layer::backwardActivation() {
   }
 
   auto status = activation_->backward(output_);
-  CHECK(status.isOK()) << status.what();
+  status.check();
 }
 
 void Layer::forwardDropOut() {
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index fb41af5631..88d934d782 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -506,9 +506,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
           *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
     }
   }
-  activationGate_->forward(frameInputGate_[idxCurr]);
-  activationGate_->forward(frameForgetGate_[idxCurr]);
-  activation_->forward(frameInputNode_[idxCurr]);
+  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
+  status.check();
+  status = activationGate_->forward(frameForgetGate_[idxCurr]);
+  status.check();
+  status = activation_->forward(frameInputNode_[idxCurr]);
+  status.check();
 
   frameState_[idxCurr].value->zeroMem();
   for (int i = 0; i < numDims_; i++) {
@@ -530,10 +533,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
 
   frameOutputGate_[idxCurr].value->addDotMul(
       *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  activationGate_->forward(frameOutputGate_[idxCurr]);
+  status = activationGate_->forward(frameOutputGate_[idxCurr]);
+  status.check();
 
   framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  activationState_->forward(framePreOutput_[idxCurr]);
+  status = activationState_->forward(framePreOutput_[idxCurr]);
+  status.check();
 
   frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
                                       *frameOutputGate_[idxCurr].value);
@@ -640,12 +645,12 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
 
   framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                         *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]);
+  activationState_->backward(framePreOutput_[idxCurr]).check();
   frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
 
   frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                          *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]);
+  activationGate_->backward(frameOutputGate_[idxCurr]).check();
 
   frameState_[idxCurr].grad->addDotMul(
       *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
@@ -702,9 +707,9 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
     }
   }
 
-  activationGate_->backward(frameInputGate_[idxCurr]);
-  activationGate_->backward(frameForgetGate_[idxCurr]);
-  activation_->backward(frameInputNode_[idxCurr]);
+  activationGate_->backward(frameInputGate_[idxCurr]).check();
+  activationGate_->backward(frameForgetGate_[idxCurr]).check();
+  activation_->backward(frameInputNode_[idxCurr]).check();
 
   if (bias_->getWGrad()) {
     for (int i = 0; i < numDims_; i++) {
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index 5ab765247f..3542e739df 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -193,7 +193,8 @@ public:
       forwardOneInput(l);
     }
 
-    activation_->forward(sampleOut_);
+    auto status = activation_->forward(sampleOut_);
+    status.check();
 
     forwardCost();
   }
@@ -207,7 +208,8 @@ public:
 
     backwardCost();
 
-    activation_->backward(sampleOut_);
+    auto status = activation_->backward(sampleOut_);
+    status.check();
 
     if (biases_->getWGrad()) {
       backwardBias(callback);
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 55e0fdfb90..b843fa1265 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -217,21 +217,22 @@ void RecurrentLayer::forwardOneSequence(int start, int length) {
     if (prevOutput_) {
       frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
     }
-    activation_->forward(frameOutput_[start]);
+    activation_->forward(frameOutput_[start]).check();
+
     for (int i = 1; i < length; ++i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
     if (prevOutput_) {
       prevOutput_->assign(*frameOutput_[start + length - 1].value);
     }
   } else {
-    activation_->forward(frameOutput_[start + length - 1]);
+    activation_->forward(frameOutput_[start + length - 1]).check();
     for (int i = length - 2; i >= 0; --i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
   }
 }
@@ -280,11 +281,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
   MatrixPtr weightT = weight_->getW()->getTranspose();
   if (!reversed_) {
     for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i - 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start]);
+    activation_->backward(frameOutput_[start]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start, length - 1)->getTranspose(),
@@ -294,11 +295,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
     }
   } else {
     for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i + 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start + length - 1]);
+    activation_->backward(frameOutput_[start + length - 1]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
@@ -333,7 +334,7 @@ void RecurrentLayer::forwardBatch(int batchSize,
       }
       Argument arg;
       arg.value = batch2;
-      activation_->forward(arg);
+      activation_->forward(arg).check();
     }
   }
   batchValue_->copyBackSeq(*output_.value);
@@ -363,7 +364,7 @@ void RecurrentLayer::backwardBatch(int batchSize,
       Argument arg;
       arg.value = batch1;
       arg.grad = batch2;
-      activation_->backward(arg);
+      activation_->backward(arg).check();
 
       if (n != 0) {
         batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 5eacff6b71..d9a91de8a6 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -192,7 +192,8 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) {
                                nnz,
                                /*trans=*/false,
                                /*useGpu=*/useGpu_);
-    activation_->forward(arg);
+    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
+    activation_->forward(arg).check();
   } else /* train and test in train, not generating */ {
     // during training, this layer output value is *Matrix*, which is input of
     // eg. multi-class-cross-entropy
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 23ae95852e..55427e2f12 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -148,11 +148,11 @@ LayerPtr createCTCLayer(string name,
 
   ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
 
-  softmaxActivation->forward(dataLayer->getOutput());
+  softmaxActivation->forward(dataLayer->getOutput()).check();
   layer->forward(PASS_GC);
 
   layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput());
+  softmaxActivation->backward(dataLayer->getOutput()).check();
 
   return layer;
 }
diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
index e69de29bb2..22812e8398 100644
--- a/paddle/utils/Compiler.h
+++ b/paddle/utils/Compiler.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef __GNUC__
+#define GCC_VERSION \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION
+#endif
+
+#if GCC_VERSION >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
diff --git a/paddle/utils/Status.h b/paddle/utils/Status.h
index cb66e4b225..26329f8d19 100644
--- a/paddle/utils/Status.h
+++ b/paddle/utils/Status.h
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <glog/logging.h>
 #include <stdio.h>
 #include <memory>
 #include <string>
+#include "Compiler.h"
 
 namespace paddle {
 
@@ -29,8 +31,55 @@ namespace paddle {
  * There are two styles to return status in Paddle.
  *
  * 1. Return Status
+ *    When method return a status, the return must use `__must_check` attribute.
+ *    Example as below.
+ * @code{cpp}
+ * Status __must_check foo();
  *
+ * Status __must_check bar() {
+ *   // do something.
+ *   Status s = foo();  // invoke other method return status.
+ *   if (!s.isOK()) return s;
+ *   // do something else.
+ *   return Status();
+ * }
+ * @endcode{cpp}
  *
+ * 2. Return by parameter.
+ *    It is another way to return a status, by using a pointer parameter.
+ *    Example as below.
+ *
+ * @code{cpp}
+ * Status bar();
+ *
+ * int foo(Status* status) {
+ *   // Do something.
+ *   Status s = bar();
+ *   if (!s.isOK()) {
+ *     *status = s;
+ *     return 0;
+ *   }
+ *   // Do something else.
+ *   if (someInternalErrorHappend) {
+ *     status->setByPrintf("Some dimension is too large, %d", dimension);
+ *     return 0;
+ *   }
+ *   // End of method.
+ *   return someValue;
+ * }
+ *
+ * Status foobar() {
+ *   Status s;
+ *   // do something.
+ *   foo(&s);
+ *   if (!s.isOK()) return s;
+ * }
+ * @endcode{cpp}
+ *
+ *
+ * Currently there is a helper method 'check' in status, because Paddle always
+ * use log(FATAL) or CHECK to make program exit before. When we clean all
+ * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
  */
 class Status final : public std::exception {
 public:
@@ -92,6 +141,13 @@ public:
    */
   inline bool isOK() const noexcept { return errMsg_ == nullptr; }
 
+  /**
+   * @brief check this status by glog.
+   * @note It is a temp method used during cleaning Paddle code. It will be
+   *       removed later.
+   */
+  inline void check() const { CHECK(isOK()) << what(); }
+
 private:
   std::shared_ptr<std::string> errMsg_;
 };

From 8605544c0b0cbc5ad43d86a71402f3f4075b48e3 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 18 Jan 2017 12:49:00 +0800
Subject: [PATCH 62/88] Add some comments to compiler.h

---
 paddle/utils/Compiler.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
index 22812e8398..cebca5a2a3 100644
--- a/paddle/utils/Compiler.h
+++ b/paddle/utils/Compiler.h
@@ -10,7 +10,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+/**
+ * This header defines useful compiler-specific attributes. It serves as an
+ * abstraction layer over the differences between compilers.
+ */
 #ifdef __GNUC__
 #define GCC_VERSION \
   (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
@@ -18,6 +21,11 @@ limitations under the License. */
 #define GCC_VERSION
 #endif
 
+/**
+ * __must_check macro. It requires that the function's return value be used;
+ * otherwise the compiler raises a warning. Note that Paddle treats all compile
+ * warnings as errors.
+ */
 #if GCC_VERSION >= 30400
 #define __must_check __attribute__((warn_unused_result))
 #else

From ec790e1050b52b09a182eed95fc030c7879e5012 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 18 Jan 2017 13:24:02 +0800
Subject: [PATCH 63/88] Rename Status => Error.

* Also make ErrorF a global function.
---
 .../activations/ActivationFunction.cpp        | 110 +++++++++---------
 .../gserver/activations/ActivationFunction.h  |   6 +-
 paddle/gserver/layers/Layer.cpp               |   2 +-
 paddle/utils/{Status.h => Error.h}            |  92 ++++++++-------
 paddle/utils/tests/CMakeLists.txt             |   2 +-
 .../tests/{test_Status.cpp => test_Error.cpp} |  14 +--
 6 files changed, 114 insertions(+), 112 deletions(-)
 rename paddle/utils/{Status.h => Error.h} (70%)
 rename paddle/utils/tests/{test_Status.cpp => test_Error.cpp} (76%)

diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 666c2e01c8..f1f96fc67d 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -69,13 +69,13 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
 class IdentityActivation : public ActivationFunction {
 public:
   static const std::string name;
-  Status __must_check forward(Argument& act) {
+  Error __must_check forward(Argument& act) {
     (void)act;
-    return Status();
+    return Error();
   }
-  Status __must_check backward(Argument& act) {
+  Error __must_check backward(Argument& act) {
     (void)act;
-    return Status();
+    return Error();
   }
   const std::string& getName() const { return name; }
 };
@@ -92,13 +92,13 @@ static InitFunction __reg_activation__identity([] {
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(sigmoid)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->sigmoid(*act.value);
-  return Status();
+  return Error();
 }
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->sigmoidDerivative(*act.value);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(sigmoid)
 
@@ -115,12 +115,12 @@ MatrixPtr sftMaxDot_;
 MatrixPtr one_;
 
 public:
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->softmax(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   MatrixPtr outputV = act.value;
   MatrixPtr outputG = act.grad;
 
@@ -152,7 +152,7 @@ Status __must_check backward(Argument& act) {
 
     act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
   }
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(softmax)
 
@@ -167,9 +167,9 @@ ACTIVATION_CLASS_NAME(softmax) softmax_;
 Argument argument_;
 
 public:
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   if (act.value->getWidth() != 1UL) {
-    return Status(
+    return ErrorF(
         "Input width for each timestep of sequence softmax should be 1");
   }
 
@@ -188,12 +188,12 @@ Status __must_check forward(Argument& act) {
 
   auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   if (act.value->getWidth() != 1UL) {
-    return Status(
+    return ErrorF(
         "Input width for each timestep of sequence softmax should be 1");
   }
 
@@ -207,10 +207,10 @@ Status __must_check backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    Status status = softmax_.backward(argument_);
+    Error status = softmax_.backward(argument_);
     if (!status.isOK()) return status;
   }
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
@@ -225,14 +225,14 @@ END_DEFINE_ACTIVATION(sequence_softmax)
  *    0 otherwise.
  */
 BEGIN_DEFINE_ACTIVATION(relu)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->relu(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->reluDerivative(*act.value);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(relu)
 
@@ -250,14 +250,14 @@ END_DEFINE_ACTIVATION(relu)
  * TODO(yuyang18): Remove magic number 24 or make it configuable.
  */
 BEGIN_DEFINE_ACTIVATION(brelu)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->brelu(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->breluDerivative(*act.value);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(brelu)
 
@@ -268,14 +268,14 @@ END_DEFINE_ACTIVATION(brelu)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(tanh)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->tanh(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->tanhDerivative(*act.value);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(tanh)
 
@@ -291,14 +291,14 @@ real a, b;
 
 public:
 ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->scaledTanh(*act.value, a, b);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->scaledTanhDerivative(*act.value, a, b);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(stanh)
 
@@ -309,14 +309,14 @@ END_DEFINE_ACTIVATION(stanh)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(softrelu)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->softrelu(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->softreluDerivative(*act.value);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(softrelu)
 
@@ -333,7 +333,7 @@ END_DEFINE_ACTIVATION(softrelu)
  *     0   if z=0
  */
 BEGIN_DEFINE_ACTIVATION(abs)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -343,12 +343,12 @@ Status __must_check forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->abs2(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->absDerivative(*act.in);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(abs)
 
@@ -359,7 +359,7 @@ END_DEFINE_ACTIVATION(abs)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(square)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -369,12 +369,12 @@ Status __must_check forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->square2(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->squareDerivative(*act.in);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(square)
 
@@ -385,14 +385,14 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   act.value->exp2(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->expDerivative(*act.value);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(exponential)
 
@@ -403,7 +403,7 @@ END_DEFINE_ACTIVATION(exponential)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(log)
-Status __must_check forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -413,12 +413,12 @@ Status __must_check forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->log2(*act.value);
-  return Status();
+  return Error();
 }
 
-Status __must_check backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->dotDiv(*act.grad, *act.in);
-  return Status();
+  return Error();
 }
 END_DEFINE_ACTIVATION(log)
 
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 737df2219d..f208224e30 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
-#include "paddle/utils/Status.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -49,7 +49,7 @@ public:
    *
    * Usually, act is Layer::output_
    */
-  virtual Status __must_check forward(Argument& act) = 0;
+  virtual Error __must_check forward(Argument& act) = 0;
 
   /**
    * @brief Backward propagaion
@@ -58,7 +58,7 @@ public:
    * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
    * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
    */
-  virtual Status __must_check backward(Argument& act) = 0;
+  virtual Error __must_check backward(Argument& act) = 0;
 
   virtual const std::string& getName() const = 0;
 };
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index f96070fe6e..f76d41ad3e 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Error.h"
 #include "paddle/utils/Logging.h"
-#include "paddle/utils/Status.h"
 
 #include "AddtoLayer.h"
 #include "CRFLayer.h"
diff --git a/paddle/utils/Status.h b/paddle/utils/Error.h
similarity index 70%
rename from paddle/utils/Status.h
rename to paddle/utils/Error.h
index 26329f8d19..f1597f93d2 100644
--- a/paddle/utils/Status.h
+++ b/paddle/utils/Error.h
@@ -34,9 +34,9 @@ namespace paddle {
  *    When method return a status, the return must use `__must_check` attribute.
  *    Example as below.
  * @code{cpp}
- * Status __must_check foo();
+ * Error __must_check foo();
  *
- * Status __must_check bar() {
+ * Error __must_check bar() {
  *   // do something.
  *   Status s = foo();  // invoke other method return status.
  *   if (!s.isOK()) return s;
@@ -50,9 +50,9 @@ namespace paddle {
  *    Example as below.
  *
  * @code{cpp}
- * Status bar();
+ * Error bar();
  *
- * int foo(Status* status) {
+ * int foo(Error* status) {
  *   // Do something.
  *   Status s = bar();
  *   if (!s.isOK()) {
@@ -61,15 +61,15 @@ namespace paddle {
  *   }
  *   // Do something else.
  *   if (someInternalErrorHappend) {
- *     status->setByPrintf("Some dimension is too large, %d", dimension);
+ *     *status = ErrorF("Some dimension is too large, %d", dimension);
  *     return 0;
  *   }
  *   // End of method.
  *   return someValue;
  * }
  *
- * Status foobar() {
- *   Status s;
+ * Error foobar() {
+ *   Error s;
  *   // do something.
  *   foo(&s);
  *   if (!s.isOK()) return s;
@@ -81,48 +81,12 @@ namespace paddle {
  * use log(FATAL) or CHECK to make program exit before. When we clean all
  * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
  */
-class Status final : public std::exception {
+class Error final : public std::exception {
 public:
   /**
    * Default Status. OK
    */
-  Status() noexcept {}
-
-  /**
-   * @brief Create Status with error message
-   * @param msg
-   */
-  explicit Status(const std::string& msg) : errMsg_(new std::string(msg)) {}
-
-  /**
-   * @brief set a error message for status.
-   * @param msg
-   */
-  inline void set(const std::string& msg) noexcept {
-    errMsg_.reset(new std::string(msg));
-  }
-
-  /**
-   * @brief set a error message for status. Use C style printf
-   * @param fmt
-   */
-  template <typename... ARGS>
-  inline void setByPrintf(const char* fmt, ARGS... args) noexcept {
-    constexpr size_t kBufferSize = 1024;  // 1KB buffer
-    char buffer[kBufferSize];
-    snprintf(buffer, kBufferSize, fmt, args...);
-    errMsg_.reset(new std::string(buffer));
-  }
-
-  /**
-   * create a error status by C style printf.
-   */
-  template <typename... ARGS>
-  inline static Status printf(const char* fmt, ARGS... args) noexcept {
-    Status s;
-    s.setByPrintf(fmt, args...);
-    return s;
-  }
+  Error() noexcept {}
 
   /**
    * @brief what will return the error message. If status is OK, return nullptr.
@@ -148,8 +112,46 @@ public:
    */
   inline void check() const { CHECK(isOK()) << what(); }
 
+  /**
+   * friend method to create Error.
+   */
+  template <typename... ARGS>
+  friend Error __must_check ErrorF(const char* fmt, ARGS... args);
+
 private:
   std::shared_ptr<std::string> errMsg_;
 };
 
+/**
+ * ErrorF will create an Error by printf syntax.
+ *
+ * Specialize this method because clang will give a warning when use printf(fmt)
+ * without arguments.
+ */
+template <>
+inline Error __must_check ErrorF(const char* msg) {
+  Error e;
+  e.errMsg_.reset(new std::string(msg));
+  return e;
+}
+
+/**
+ * ErrorF will create an Error by printf syntax.
+ *
+ * Examples:
+ * @code{cpp}
+ * auto err = ErrorF("SomeError");
+ * auto err2 = ErrorF("SomeErrorWithParameter %f %d", real_val, int_val);
+ * @endcode{cpp}
+ */
+template <typename... ARGS>
+inline Error __must_check ErrorF(const char* fmt, ARGS... args) {
+  constexpr size_t kBufferSize = 1024;
+  char buffer[kBufferSize];
+  snprintf(buffer, kBufferSize, fmt, args...);
+  Error e;
+  e.errMsg_.reset(new std::string(buffer));
+  return e;
+}
+
 }  // namespace paddle
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index a1cc32668d..aa923b3553 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -4,7 +4,7 @@ add_simple_unittest(test_CustomStackTrace)
 add_simple_unittest(test_ThreadBarrier)
 add_simple_unittest(test_SpinLock)
 add_simple_unittest(test_SIMDFlags)
-add_simple_unittest(test_Status)
+add_simple_unittest(test_Error)
 
 add_executable(
     test_CustomStackTracePrint
diff --git a/paddle/utils/tests/test_Status.cpp b/paddle/utils/tests/test_Error.cpp
similarity index 76%
rename from paddle/utils/tests/test_Status.cpp
rename to paddle/utils/tests/test_Error.cpp
index 04cef09579..96115f7053 100644
--- a/paddle/utils/tests/test_Status.cpp
+++ b/paddle/utils/tests/test_Error.cpp
@@ -12,23 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Status.h"
+#include "paddle/utils/Error.h"
 
 #include <gtest/gtest.h>
 
 TEST(Status, testAll) {
-  paddle::Status status;
+  paddle::Error status;
   ASSERT_TRUE(status.isOK());
-  status.set("I'm the error");
+  status = paddle::ErrorF("I'm the error");
   ASSERT_FALSE(status.isOK());
   ASSERT_STREQ("I'm the error", status.what());
 
-  paddle::Status status2("error2");
-  ASSERT_FALSE(status2.isOK());
-  ASSERT_STREQ("error2", status2.what());
+  status = paddle::ErrorF("error2");
+  ASSERT_FALSE(status.isOK());
+  ASSERT_STREQ("error2", status.what());
 
   int i = 3;
-  auto status3 = paddle::Status::printf("error%d", i);
+  auto status3 = paddle::ErrorF("error%d", i);
   ASSERT_FALSE(status3.isOK());
   ASSERT_STREQ("error3", status3.what());
 }

From 699d18f11701aae1efe72c8bf6edc50723445050 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 18 Jan 2017 13:34:20 +0800
Subject: [PATCH 64/88] Change unittest variable name

---
 paddle/utils/tests/test_Error.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
index 96115f7053..e8643de9d2 100644
--- a/paddle/utils/tests/test_Error.cpp
+++ b/paddle/utils/tests/test_Error.cpp
@@ -16,19 +16,19 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-TEST(Status, testAll) {
-  paddle::Error status;
-  ASSERT_TRUE(status.isOK());
-  status = paddle::ErrorF("I'm the error");
-  ASSERT_FALSE(status.isOK());
-  ASSERT_STREQ("I'm the error", status.what());
+TEST(Error, testAll) {
+  paddle::Error error;
+  ASSERT_TRUE(error.isOK());
+  error = paddle::ErrorF("I'm the error");
+  ASSERT_FALSE(error.isOK());
+  ASSERT_STREQ("I'm the error", error.what());
 
-  status = paddle::ErrorF("error2");
-  ASSERT_FALSE(status.isOK());
-  ASSERT_STREQ("error2", status.what());
+  error = paddle::ErrorF("error2");
+  ASSERT_FALSE(error.isOK());
+  ASSERT_STREQ("error2", error.what());
 
   int i = 3;
-  auto status3 = paddle::ErrorF("error%d", i);
-  ASSERT_FALSE(status3.isOK());
-  ASSERT_STREQ("error3", status3.what());
+  auto error3 = paddle::ErrorF("error%d", i);
+  ASSERT_FALSE(error3.isOK());
+  ASSERT_STREQ("error3", error3.what());
 }

From b6dfd2891bd658e1212f5e38bf764cf0f901574f Mon Sep 17 00:00:00 2001
From: backyes <backyes@gmail.com>
Date: Wed, 18 Jan 2017 18:14:41 +0800
Subject: [PATCH 65/88] fix bug: * update function can not be inherited since
 updateImpl exists * remove update implementation in one inherited updater

---
 paddle/parameter/ParameterUpdaterBase.h | 2 +-
 paddle/trainer/ParameterUpdater.h       | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index b230e170c1..6265c828a1 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -55,7 +55,7 @@ public:
   // between startBatch() and finishBatch(), update() will be called
   // by the trainer multiple times, each time for updating one Parameter
   // with its gradient in PARAMETER_GRADIENT
-  virtual void update(Parameter* para) {
+  void update(Parameter* para) {
     SetDevice setDevice(para->getDeviceId());
     para->updateHook();
     this->updateImpl(para);
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index c3207e63ce..9e9e948b88 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -184,7 +184,6 @@ protected:
    * @param para
    */
   virtual void updateImpl(Parameter* para) {}
-  virtual void update(Parameter* para) {}
 };
 
 /**

From 5a15c70e167a83e6b06686e3152ca9b30ed7800e Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 18 Jan 2017 20:27:52 +0800
Subject: [PATCH 66/88] Make Error interface cleaner

---
 .../activations/ActivationFunction.cpp        |  6 +-
 paddle/utils/Error.h                          | 92 +++++++------------
 paddle/utils/tests/test_Error.cpp             | 20 ++--
 3 files changed, 45 insertions(+), 73 deletions(-)

diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f1f96fc67d..c541b72e10 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -169,7 +169,7 @@ Argument argument_;
 public:
 Error __must_check forward(Argument& act) {
   if (act.value->getWidth() != 1UL) {
-    return ErrorF(
+    return Error(
         "Input width for each timestep of sequence softmax should be 1");
   }
 
@@ -193,7 +193,7 @@ Error __must_check forward(Argument& act) {
 
 Error __must_check backward(Argument& act) {
   if (act.value->getWidth() != 1UL) {
-    return ErrorF(
+    return Error(
         "Input width for each timestep of sequence softmax should be 1");
   }
 
@@ -208,7 +208,7 @@ Error __must_check backward(Argument& act) {
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
     Error status = softmax_.backward(argument_);
-    if (!status.isOK()) return status;
+    if (!status) return status;
   }
   return Error();
 }
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index f1597f93d2..a8de56b980 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -23,14 +23,12 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * Status is Paddle error code. It only contain a std::string as error message.
- * Although Status inherits the std::exception, but do not throw it except you
- * know what you are doing.
+ * Error is Paddle's error code. It only contains a std::string as the error message.
  *
  *
- * There are two styles to return status in Paddle.
+ * There are two styles to return error in Paddle.
  *
- * 1. Return Status
+ * 1. Return Error
  *    When method return a status, the return must use `__must_check` attribute.
  *    Example as below.
  * @code{cpp}
@@ -39,29 +37,29 @@ namespace paddle {
  * Error __must_check bar() {
  *   // do something.
  *   Status s = foo();  // invoke other method return status.
- *   if (!s.isOK()) return s;
+ *   if (!s) return s;
  *   // do something else.
  *   return Status();
  * }
  * @endcode{cpp}
  *
  * 2. Return by parameter.
- *    It is another way to return a status, by using a pointer parameter.
+ *    It is another way to return an error, by using a pointer parameter.
  *    Example as below.
  *
  * @code{cpp}
  * Error bar();
  *
- * int foo(Error* status) {
+ * int foo(Error* error) {
  *   // Do something.
- *   Status s = bar();
- *   if (!s.isOK()) {
- *     *status = s;
+ *   Error s = bar();
+ *   if (!s) {
+ *     *error = s;
  *     return 0;
  *   }
  *   // Do something else.
  *   if (someInternalErrorHappend) {
- *     *status = ErrorF("Some dimension is too large, %d", dimension);
+ *     *error = Error("Some dimension is too large, %d", dimension);
  *     return 0;
  *   }
  *   // End of method.
@@ -72,7 +70,7 @@ namespace paddle {
  *   Error s;
  *   // do something.
  *   foo(&s);
- *   if (!s.isOK()) return s;
+ *   if (!s) return s;
  * }
  * @endcode{cpp}
  *
@@ -81,17 +79,31 @@ namespace paddle {
  * use log(FATAL) or CHECK to make program exit before. When we clean all
  * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
  */
-class Error final : public std::exception {
+class Error final {
 public:
   /**
    * Default Status. OK
    */
-  Error() noexcept {}
+  inline Error() {}
 
   /**
-   * @brief what will return the error message. If status is OK, return nullptr.
+   * @brief Create an Error use printf syntax.
    */
-  const char* what() const noexcept override {
+  inline explicit Error(const char* fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    constexpr size_t kBufferSize = 1024;
+    this->errMsg_.reset(new std::string(kBufferSize, 0));
+    auto sz = vsnprintf(&(*errMsg_)[0], kBufferSize, fmt, ap);
+    this->errMsg_->resize(sz);
+    this->errMsg_->shrink_to_fit();
+    va_end(ap);
+  }
+
+  /**
+   * @brief what will return the error message. If no error, return nullptr.
+   */
+  inline const char* msg() const {
     if (errMsg_) {
       return errMsg_->data();
     } else {
@@ -100,58 +112,18 @@ public:
   }
 
   /**
-   * @brief isOK
-   * @return true if OK.
+   * @brief operator bool, return True if there is no error.
    */
-  inline bool isOK() const noexcept { return errMsg_ == nullptr; }
-
+  inline operator bool() const { return !errMsg_; }
   /**
    * @brief check this status by glog.
    * @note It is a temp method used during cleaning Paddle code. It will be
    *       removed later.
    */
-  inline void check() const { CHECK(isOK()) << what(); }
-
-  /**
-   * friend method to create Error.
-   */
-  template <typename... ARGS>
-  friend Error __must_check ErrorF(const char* fmt, ARGS... args);
+  inline void check() const { CHECK(*this) << msg(); }
 
 private:
   std::shared_ptr<std::string> errMsg_;
 };
 
-/**
- * ErrorF will create an Error by printf syntax.
- *
- * Specialize this method because clang will give a warning when use printf(fmt)
- * without arguments.
- */
-template <>
-inline Error __must_check ErrorF(const char* msg) {
-  Error e;
-  e.errMsg_.reset(new std::string(msg));
-  return e;
-}
-
-/**
- * ErrorF will create an Error by printf syntax.
- *
- * Examples:
- * @code{cpp}
- * auto err = ErrorF("SomeError");
- * auto err2 = ErrorF("SomeErrorWithParameter %f %d", real_val, int_val);
- * @endcode{cpp}
- */
-template <typename... ARGS>
-inline Error __must_check ErrorF(const char* fmt, ARGS... args) {
-  constexpr size_t kBufferSize = 1024;
-  char buffer[kBufferSize];
-  snprintf(buffer, kBufferSize, fmt, args...);
-  Error e;
-  e.errMsg_.reset(new std::string(buffer));
-  return e;
-}
-
 }  // namespace paddle
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
index e8643de9d2..85156466e2 100644
--- a/paddle/utils/tests/test_Error.cpp
+++ b/paddle/utils/tests/test_Error.cpp
@@ -18,17 +18,17 @@ limitations under the License. */
 
 TEST(Error, testAll) {
   paddle::Error error;
-  ASSERT_TRUE(error.isOK());
-  error = paddle::ErrorF("I'm the error");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("I'm the error", error.what());
+  ASSERT_TRUE(error);
+  error = paddle::Error("I'm the error");
+  ASSERT_FALSE(error);
+  ASSERT_STREQ("I'm the error", error.msg());
 
-  error = paddle::ErrorF("error2");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("error2", error.what());
+  error = paddle::Error("error2");
+  ASSERT_FALSE(error);
+  ASSERT_STREQ("error2", error.msg());
 
   int i = 3;
-  auto error3 = paddle::ErrorF("error%d", i);
-  ASSERT_FALSE(error3.isOK());
-  ASSERT_STREQ("error3", error3.what());
+  auto error3 = paddle::Error("error%d", i);
+  ASSERT_FALSE(error3);
+  ASSERT_STREQ("error3", error3.msg());
 }

From 312326899e779d96811b7dfab3cfe5817b1b6fdb Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Wed, 18 Jan 2017 21:18:33 +0800
Subject: [PATCH 67/88] add api to doc

---
 doc/api/trainer_config_helpers/layers.rst      | 9 +++++++++
 python/paddle/trainer_config_helpers/layers.py | 1 +
 2 files changed, 10 insertions(+)

diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
index 4e429650e5..8b0e553eac 100644
--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@@ -382,6 +382,15 @@ sampling_id_layer
     :members: sampling_id_layer
     :noindex:
 
+Slicing and Joining Layers
+==========================
+
+pad_layer
+-----------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: pad_layer
+    :noindex:
+
 ..  _api_trainer_config_helpers_layers_cost_layers:
 
 Cost Layers
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 4e4744b518..66817fc93b 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -108,6 +108,7 @@ __all__ = [
     'print_layer',
     'priorbox_layer',
     'spp_layer',
+    'pad_layer',
 ]
 
 
From a6d4a31deae38f78e24ecaf198e9250927416041 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 19 Jan 2017 10:01:47 +0800
Subject: [PATCH 68/88] Follow comments

---
 paddle/utils/Error.h | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index a8de56b980..ff11541bbd 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -79,33 +79,32 @@ namespace paddle {
  * use log(FATAL) or CHECK to make program exit before. When we clean all
  * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
  */
-class Error final {
+class Error {
 public:
   /**
-   * Default Status. OK
+   * Construct an no-error value.
    */
-  inline Error() {}
+  Error() {}
 
   /**
    * @brief Create an Error use printf syntax.
    */
-  inline explicit Error(const char* fmt, ...) {
+  explicit Error(const char* fmt, ...) {
     va_list ap;
     va_start(ap, fmt);
     constexpr size_t kBufferSize = 1024;
-    this->errMsg_.reset(new std::string(kBufferSize, 0));
-    auto sz = vsnprintf(&(*errMsg_)[0], kBufferSize, fmt, ap);
-    this->errMsg_->resize(sz);
-    this->errMsg_->shrink_to_fit();
+    char buffer[kBufferSize];
+    vsnprintf(buffer, kBufferSize, fmt, ap);
+    this->msg_.reset(new std::string(buffer));
     va_end(ap);
   }
 
   /**
    * @brief what will return the error message. If no error, return nullptr.
    */
-  inline const char* msg() const {
-    if (errMsg_) {
-      return errMsg_->data();
+  const char* msg() const {
+    if (msg_) {
+      return msg_->c_str();
     } else {
       return nullptr;
     }
@@ -114,16 +113,16 @@ public:
   /**
    * @brief operator bool, return True if there is no error.
    */
-  inline operator bool() const { return !errMsg_; }
+  operator bool() const { return !msg_; }
   /**
    * @brief check this status by glog.
    * @note It is a temp method used during cleaning Paddle code. It will be
    *       removed later.
    */
-  inline void check() const { CHECK(*this) << msg(); }
+  void check() const { CHECK(*this) << msg(); }
 
 private:
-  std::shared_ptr<std::string> errMsg_;
+  std::shared_ptr<std::string> msg_;
 };
 
 }  // namespace paddle

From c88dec209f367cb3ac1bd3fe6964e63f7274d975 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 19 Jan 2017 10:08:33 +0800
Subject: [PATCH 69/88] Fix typo

---
 paddle/utils/Error.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index ff11541bbd..6fe7b6ea88 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -82,7 +82,7 @@ namespace paddle {
 class Error {
 public:
   /**
-   * Construct an no-error value.
+   * Construct a no-error value.
    */
   Error() {}
 
@@ -100,7 +100,7 @@ public:
   }
 
   /**
-   * @brief what will return the error message. If no error, return nullptr.
+   * @brief msg will return the error message. If no error, return nullptr.
    */
   const char* msg() const {
     if (msg_) {
@@ -114,6 +114,7 @@ public:
    * @brief operator bool, return True if there is no error.
    */
   operator bool() const { return !msg_; }
+
   /**
    * @brief check this status by glog.
    * @note It is a temp method used during cleaning Paddle code. It will be

From 42ea1376e25f7ed02ea55e11cb6114983c0d1e4c Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Thu, 19 Jan 2017 10:11:06 +0800
Subject: [PATCH 70/88] bug fix in GatedRecurrentLayer which only occurs in
 predicting or job=test mode.

---
 paddle/gserver/layers/GatedRecurrentLayer.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
index 930d9a0561..d3aeea9218 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -314,13 +314,13 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
 
   batchValue_->resizeOrCreate(*output_.value);
   batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_ && bias_->getWGrad()) {
+  if (bias_) {
     gate_.value->addBias(*(bias_->getW()), 1);
   }
 
   {
     int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
+    int curBatchSize = 0;
     AsyncGpuBlock asyncGpuBlock;
     for (int n = 0; n < numBatch; n++) {
       MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
@@ -330,16 +330,17 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
       gruValue.resetOutputValue =
           (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
 
-      batchSize = outputValueTmp->getHeight();
+      curBatchSize = outputValueTmp->getHeight();
       gruValue.prevOutValue =
-          (n == 0 ? nullptr
-                  : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
+          (n == 0
+               ? nullptr
+               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
 
       {
         if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), batchSize);
+          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
         } else {
-          GruCompute::forward<0>(gruValue, getSize(), batchSize);
+          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
         }
       }
     }

From 843fb2ea32d4f0b2d1f3667545487c8084229819 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 19 Jan 2017 15:12:42 +0800
Subject: [PATCH 71/88] Make code more readable

---
 paddle/utils/Error.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index 6fe7b6ea88..2b4fbef4e0 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <glog/logging.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <memory>
 #include <string>
@@ -113,7 +114,7 @@ public:
   /**
    * @brief operator bool, return True if there is no error.
    */
-  operator bool() const { return !msg_; }
+  operator bool() const { return msg_ == nullptr; }
 
   /**
    * @brief check this status by glog.

From 7ff8c8eec3f44497816ae4fe66516f3ca3ba7cb7 Mon Sep 17 00:00:00 2001
From: xuwei06 <xuwei06@baidu.com>
Date: Thu, 19 Jan 2017 01:33:28 -0800
Subject: [PATCH 72/88] Compile glog with WITH_GFLAGS=ON

Also initialize glog after gflags in Util.cpp initMain()

Change-Id: I09ff062b462aa76d9f7b5f97e883b92939e6dcb2
---
 cmake/external/glog.cmake | 9 +++++----
 paddle/utils/Util.cpp     | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 71e20c8527..e1eade601e 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,12 +29,13 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 ExternalProject_Add(
     glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS gflags
     GIT_REPOSITORY  "https://github.com/google/glog.git"
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=OFF
+    CMAKE_ARGS      -DWITH_GFLAGS=ON
     CMAKE_ARGS      -DBUILD_TESTING=OFF
 )
 
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 411a64aa8d..220aac1ff1 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -144,20 +144,20 @@ void runInitFunctions() {
 }
 
 void initMain(int argc, char** argv) {
-  initializeLogging(argc, argv);
   installLayerStackTracer();
   std::string line;
   for (int i = 0; i < argc; ++i) {
     line += argv[i];
     line += ' ';
   }
-  LOG(INFO) << "commandline: " << line;
 
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
+  initializeLogging(argc, argv);
+  LOG(INFO) << "commandline: " << line;
   CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
 
   installProfilerSwitch();

From 63118767bf0ede148a7404604ea966d6f0bf35c7 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Thu, 19 Jan 2017 17:36:00 +0800
Subject: [PATCH 73/88] Add detect redhat and unknown in CMake

---
 cmake/system.cmake | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index ab124a89dc..485cf0eea1 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -30,6 +30,10 @@ ELSE(WIN32)
                 SET(HOST_SYSTEM "debian")
             ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
                 SET(HOST_SYSTEM "ubuntu")
+            ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
+                SET(HOST_SYSTEM "redhat")
+            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
+                SET(HOST_SYSTEM "fedora")
             ENDIF()
         ENDIF(EXISTS "/etc/issue")
 
@@ -40,6 +44,10 @@ ELSE(WIN32)
             ENDIF()
         ENDIF(EXISTS "/etc/redhat-release")
 
+        IF(NOT HOST_SYSTEM)
+            SET(HOST_SYSTEM "unknown")
+        ENDIF()
+
     ENDIF(APPLE)
 ENDIF(WIN32)
 

From febf9a0b070a7c5036f2b32f12167952f1597727 Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Fri, 20 Jan 2017 00:11:08 +0800
Subject: [PATCH 74/88] Add comments and CMAKE_SYSTEM_NAME

---
 cmake/system.cmake | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 485cf0eea1..3e472da7e0 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -12,6 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Detects the OS and sets appropriate variables.
+# CMAKE_SYSTEM_NAME only gives us a coarse-grained name,
+# but a name like centos is necessary in some scenarios
+# to distinguish systems for customization.
+#
+# for instance, protobuf libs path is <install_dir>/lib64
+# on CentOS, but <install_dir>/lib on other systems.
+
 IF(WIN32)
     SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
@@ -45,7 +53,7 @@ ELSE(WIN32)
         ENDIF(EXISTS "/etc/redhat-release")
 
         IF(NOT HOST_SYSTEM)
-            SET(HOST_SYSTEM "unknown")
+            SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
         ENDIF()
 
     ENDIF(APPLE)

From 0cc3d829024a140a5d388437b021ded4d20d1661 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Fri, 20 Jan 2017 18:41:04 +0800
Subject: [PATCH 75/88] Add some comment of CrossMapNormalFunc

---
 paddle/function/CrossMapNormalOp.cpp | 54 ++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 10 deletions(-)

diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 8e7dc72524..568b8faef4 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,11 +112,31 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
 }
 
 /**
- * \brief {o_0, o_1} = calc(i_0)
+ * \brief Normalization with across maps.
  *
- * \param inputs[0] input value.
- * \param outputs[0] output value.
- * \param outputs[1] denoms.
+ * This Function comes from the paper
+ * "ImageNet Classification with Deep Convolutional Neural Networks".
+ *
+ * The original formula is:
+ *
+ *                                 Input(x, y)
+ * Output(x, y) = ------------------------------------------------
+ *                       alpha   /min(F, f-[N/2] + N)
+ *                  (1 + ----- * |    (Input(x, y))^2 ) ^ (beta)
+ *                         N     /max(0, f-[N/2])
+ *
+ * Argument in the Function:
+ * \param size_      represent N
+ * \param scale_     represent alpha / N
+ * \param pow_       represent beta
+ * \param inputs[0]  represent Input
+ * \param outputs[0] represent Output
+ * \param outputs[1] represent The denominator in the formula(except beta)
+ *
+ * note:
+ * outputs[1] is saved to simplify the backward calculation.
+ * So, if only the forward calculation is considered, we can optimize
+ * by removing outputs[1].
  */
 template <DeviceType Device>
 class CrossMapNormalFunc : public FunctionBase {
@@ -161,13 +181,27 @@ private:
 };
 
 /**
- * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ * \brief Backward calculation for normalization with across maps.
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * denoms ^ (-beta)
+ *    /
+ *  + | (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
+ *    /
  *
- * \param inputs[0] input value.
- * \param inputs[1] output value.
- * \param inputs[2] output grad.
- * \param inputs[3] denoms.
- * \param outputs[0] input grad.
+ * Argument in the Function:
+ * \param size_      represent N
+ * \param scale_     represent alpha / N
+ * \param pow_       represent beta
+ * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
+ * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
+ * \param inputs[2]  represent OutputGrad
+ * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
+ *                   This is the intermediate result that is
+ *                   preserved in the forward calculation.
+ * \param outputs[0] represent InputGrad
  */
 template <DeviceType Device>
 class CrossMapNormalGradFunc : public FunctionBase {

From 95dec805aab0ddaef551b367f0c2f4e42a393819 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Fri, 20 Jan 2017 18:48:42 +0800
Subject: [PATCH 76/88] Make external/glog use local gflags.

---
 cmake/external/glog.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index e1eade601e..ab105611c8 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -36,6 +36,7 @@ ExternalProject_Add(
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DWITH_GFLAGS=ON
+    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
     CMAKE_ARGS      -DBUILD_TESTING=OFF
 )
 

From a8583f0ccb230fb0564103a28a034a87156bac78 Mon Sep 17 00:00:00 2001
From: reyoung <reyoung@126.com>
Date: Fri, 20 Jan 2017 11:39:52 +0000
Subject: [PATCH 77/88] Correct Handle Python Proto2

---
 cmake/external/python.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 209e679f2c..6372a9a768 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -26,10 +26,10 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
     find_python_module(wheel REQUIRED)
     find_python_module(google.protobuf REQUIRED)
     FIND_PACKAGE(NumPy REQUIRED)
-    IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+    IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
         MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
-        "please use pip to upgrade protobuf.")
-    ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+        "please use pip to upgrade protobuf. pip install -U protobuf")
+    ENDIF()
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
     MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
     ##################################### PYTHON ########################################

From 51fa6baebec5c29f4ff0f1c2e72f6feab02bbe5e Mon Sep 17 00:00:00 2001
From: FoREacH <ikucherovskiy@gmail.com>
Date: Sat, 21 Jan 2017 00:24:38 +0200
Subject: [PATCH 78/88] Fix issue #1186

---
 cmake/external/warpctc.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index f5e4b3e1eb..f23a3969e9 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -54,6 +54,7 @@ ExternalProject_Add(
     CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
     CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
     CMAKE_ARGS      -DWITH_TORCH=OFF
+    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_TORCH=TRUE
     CMAKE_ARGS      -DBUILD_SHARED=ON
 )
 

From 7709b697ba85a99c9f784e5f2f8267f7a52c61fc Mon Sep 17 00:00:00 2001
From: F0REacH <ikucherovskiy@gmail.com>
Date: Sat, 21 Jan 2017 00:52:39 +0200
Subject: [PATCH 79/88] Update warpctc.cmake

Fix case-sensitiveness
---
 cmake/external/warpctc.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index f23a3969e9..172c318b35 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -54,7 +54,7 @@ ExternalProject_Add(
     CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
     CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
     CMAKE_ARGS      -DWITH_TORCH=OFF
-    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_TORCH=TRUE
+    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
     CMAKE_ARGS      -DBUILD_SHARED=ON
 )
 

From ed5624023549374259c0fc9c4849d5ae46347b1b Mon Sep 17 00:00:00 2001
From: FoREacH <ikucherovskiy@gmail.com>
Date: Sat, 21 Jan 2017 05:46:07 +0200
Subject: [PATCH 80/88] Set protobuf CMAKE_INSTALL_LIBDIR to fixed value lib

---
 cmake/external/protobuf.cmake | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 613614c0e3..84f459033f 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -29,17 +29,12 @@ IF(WIN32)
         "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
   SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
 ELSE(WIN32)
-  IF(${HOST_SYSTEM} STREQUAL "centos")
-    SET(LIB "lib64")
-  ELSE()
-    SET(LIB "lib")
-  ENDIF()
   SET(PROTOBUF_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
   SET(PROTOBUF_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
   SET(PROTOBUF_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
   SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
 ENDIF(WIN32)
 
@@ -58,6 +53,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     -DCMAKE_BUILD_TYPE=Release
     -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+    -DCMAKE_INSTALL_LIBDIR=lib
 )
 
 LIST(APPEND external_project_dependencies protobuf)

From fae5d82c367482e7236489bff41c875a230598fd Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Sun, 22 Jan 2017 10:29:55 +0800
Subject: [PATCH 81/88] Fix destroy error in test_ProtoServer.

---
 paddle/pserver/test/test_ProtoServer.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 9f86ee80f4..e70e1670d3 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-
 #include <gtest/gtest.h>
-
+#include <memory>
 #include "ParameterService.pb.h"
 #include "paddle/math/Vector.h"
 #include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
 DEFINE_string(server_addr, "127.0.0.1", "Server address");
 DEFINE_int64(dim, 50000000, "Data size");
@@ -163,17 +162,15 @@ int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
 
-  MyServer* server;
+  std::unique_ptr<MyServer> server;
   if (FLAGS_rdma_tcp == "rdma") {
-    server = new MyServer(FLAGS_port, 0);
+    server.reset(new MyServer(FLAGS_port, 0));
   } else {
-    server = new MyServer(FLAGS_port);
+    server.reset(new MyServer(FLAGS_port));
   }
 
   server->start();
   usleep(10000);
 
-  int ret = RUN_ALL_TESTS();
-
-  exit(ret);
+  return RUN_ALL_TESTS();
 }

From 7acd669ee291d42492dd9f6d5a9716ab2f5ac314 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Sun, 22 Jan 2017 16:25:19 +0800
Subject: [PATCH 82/88] Supplement the data format specifications for the
 parameters.

---
 paddle/function/CrossMapNormalOp.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 568b8faef4..2d2ff0ca86 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -126,6 +126,11 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  *                         N     /max(0, f-[N/2])
  *
  * Argument in the Function:
+ * Input is NCHW format, while input.shape.ndims() is equal 4.
+ * And the meaning of each dimension(0-3) is respectively batch size,
+ * feature maps, rows and columns.
+ * The above formula is for each image.
+ *
  * \param size_      represent N
  * \param scale_     represent alpha / N
  * \param pow_       represent beta
@@ -135,7 +140,7 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  *
  * note:
  * Save output[1] is to simplify the backward calculation.
- * So, if only consider the forward calculation, we can optimize to
+ * TODO, if only consider the forward calculation, we can optimize to
  * remove the output[1].
  */
 template <DeviceType Device>
@@ -192,6 +197,9 @@ private:
  *    /
  *
  * Argument in the Function:
+ * The data of inputs/outputs format is the same as the forward interface
+ * and is NCHW.
+ *
  * \param size_      represent N
  * \param scale_     represent alpha / N
  * \param pow_       represent beta

From 44e3c724f90f6944478de11cb0e0acbd71c6ce74 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Sun, 22 Jan 2017 16:38:35 +0800
Subject: [PATCH 83/88] Refine and comments

---
 paddle/function/PadOp.cpp                     | 161 +++++++++++-------
 paddle/function/PadOp.h                       |  87 ++++------
 paddle/function/PadOpGpu.cu                   |  36 ++--
 paddle/function/PadOpTest.cpp                 |  24 +--
 paddle/gserver/layers/PadLayer.cpp            |  24 +--
 .../paddle/trainer_config_helpers/layers.py   |  25 +++
 6 files changed, 197 insertions(+), 160 deletions(-)

diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index 75e64a8ee4..df44fd0fa6 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -24,20 +24,19 @@ void Pad<DEVICE_TYPE_CPU>(real* outputs,
                           const int inC,
                           const int inH,
                           const int inW,
-                          const int padc0,
-                          const int padc1,
-                          const int padh0,
-                          const int padh1,
-                          const int padw0,
-                          const int padw1) {
-  int outC = inC + padc0 + padc1;
-  int outH = inH + padh0 + padh1;
-  int outW = inW + padw0 + padw1;
+                          const PadConf& pad) {
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
   for (int i = 0; i < num; i++) {
     for (int c = 0; c < inC; c++) {
       for (int h = 0; h < inH; h++) {
         int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff = ((i * outC + c + padc0) * outH + h + padh0) * outW + padw0;
+        int outoff =
+            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
         memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
       }
     }
@@ -51,20 +50,19 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
                               const int inC,
                               const int inH,
                               const int inW,
-                              const int padc0,
-                              const int padc1,
-                              const int padh0,
-                              const int padh1,
-                              const int padw0,
-                              const int padw1) {
-  int outC = inC + padc0 + padc1;
-  int outH = inH + padh0 + padh1;
-  int outW = inW + padw0 + padw1;
+                              const PadConf& pad) {
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
   for (int i = 0; i < num; i++) {
     for (int c = 0; c < inC; c++) {
       for (int h = 0; h < inH; h++) {
         int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff = ((i * outC + c + padc0) * outH + h + padh0) * outW + padw0;
+        int outoff =
+            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
         CpuVector inG = CpuVector(inW, inGrad + inoff);
         CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
         inG += outG;
@@ -73,22 +71,71 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
   }
 }
 
+/**
+ * \brief Padding zeros to input according to the specify dimension.
+ *        The struct pad_ contains the padding size in each dimension.
+ *        The input and output is a 4D tensor. In PadFunc, we only
+ *        pad zeros to the 2nd to 4th dimension.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the padding size in each dimension.
+ *                It has six integers. The channelStart and channelEnd indicates
+ *                how many zeros to add before and after the input in channel
+ *                dimension. And the heightStart and heightEnd indicates padding
+ *                in height dimension. The widthStart and widthEnd indicates the
+ *                padding in width dimension.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after padding.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the shape is (2,2,2,3)
+ *
+ * pad_: if channelStart = channelEnd = 1, others are 0.
+ * Output(2,4,2,3) = [
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]],
+ *                      [[0,0,0], [0,0,0]] ],
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]],
+ *                      [[0,0,0], [0,0,0]] ]
+ *                   ] # the shape is (2,4,2,3)
+ *
+ * pad_: if widthStart = 1, widthEnd = 2, others are 0.
+ * Output(2,2,2,6) = [
+ *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
+ *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
+ *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
+ *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
+ *                   ] # the shape is (2,2,2,6)
+ *
+ * pad_: if heightStart = 1, heightEnd = 1, others are 0.
+ * Output(2,2,4,3) = [
+ *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
+ *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
+ *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
+ *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
+ *                   ] # the shape is (2,2,4,3)
+ */
+
 template <DeviceType Device>
 class PadFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
-    padc0_ = config.get<int>("padc0");
-    padc1_ = config.get<int>("padc1");
-    padh0_ = config.get<int>("padh0");
-    padh1_ = config.get<int>("padh1");
-    padw0_ = config.get<int>("padw0");
-    padw1_ = config.get<int>("padw1");
+    pad_.channelStart = config.get<int>("cstart");
+    pad_.channelEnd = config.get<int>("cend");
+    pad_.heightStart = config.get<int>("hstart");
+    pad_.heightEnd = config.get<int>("hend");
+    pad_.widthStart = config.get<int>("wstart");
+    pad_.widthEnd = config.get<int>("wend");
   }
 
-  /**
-   * \param inputs[0] input value.
-   * \param outputs[0] output value.
-   */
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1UL, inputs.size());
     CHECK_EQ(1UL, outputs.size());
@@ -108,39 +155,35 @@ public:
                 inC,
                 inH,
                 inW,
-                padc0_,
-                padc1_,
-                padh0_,
-                padh1_,
-                padw0_,
-                padw1_);
+                pad_);
   }
 
 private:
-  int padc0_;
-  int padc1_;
-  int padh0_;
-  int padh1_;
-  int padw0_;
-  int padw1_;
+  PadConf pad_;
 };
 
+/**
+ * \brief The backward propagation of padding Function. Remove the elements
+ *        in the padding positions of forward.
+ *
+ * Argument in this Function:
+ * \param pad_    The same meaning as it in PadFunc.
+ * \param inputs  The gradient with respect to the output value of PadFunc.
+ * \param outputs The gradient with respect to the input value of PadFunc.
+ */
+
 template <DeviceType Device>
 class PadGradFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
-    padc0_ = config.get<int>("padc0");
-    padc1_ = config.get<int>("padc1");
-    padh0_ = config.get<int>("padh0");
-    padh1_ = config.get<int>("padh1");
-    padw0_ = config.get<int>("padw0");
-    padw1_ = config.get<int>("padw1");
+    pad_.channelStart = config.get<int>("cstart");
+    pad_.channelEnd = config.get<int>("cend");
+    pad_.heightStart = config.get<int>("hstart");
+    pad_.heightEnd = config.get<int>("hend");
+    pad_.widthStart = config.get<int>("wstart");
+    pad_.widthEnd = config.get<int>("wend");
   }
 
-  /**
-   * \param inputs[0] output grad.
-   * \param inouts[0] input grad.
-   */
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1UL, inputs.size());
     CHECK_EQ(1UL, outputs.size());
@@ -163,21 +206,11 @@ public:
                     inC,
                     inH,
                     inW,
-                    padc0_,
-                    padc1_,
-                    padh0_,
-                    padh1_,
-                    padw0_,
-                    padw1_);
+                    pad_);
   }
 
 private:
-  int padc0_;
-  int padc1_;
-  int padh0_;
-  int padh1_;
-  int padw0_;
-  int padw1_;
+  PadConf pad_;
 };
 
 REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
index 4a5e8fe338..7b5c730a6a 100644
--- a/paddle/function/PadOp.h
+++ b/paddle/function/PadOp.h
@@ -18,29 +18,34 @@ limitations under the License. */
 
 namespace paddle {
 
+struct PadConf {
+  /// how many values to add before the data along channel dimension.
+  int channelStart;
+  /// how many values to add after the data along channel dimension.
+  int channelEnd;
+  /// how many values to add before the data along height dimension.
+  int heightStart;
+  /// how many values to add after the data along height dimension.
+  int heightEnd;
+  /// how many values to add before the data along width dimension.
+  int widthStart;
+  /// how many values to add after the data along width dimension.
+  int widthEnd;
+};
+
 /**
  * \brief  This funtion pads zeros to inputs according to the specify dimension.
- *         The data structure of image data is NCHW.
- *
- * \param[out]  outputs  save results.
- * \param[in]   inputs   input data.
- * \param[in]   num      batch size of input data.
- * \param[in]   inC      channel number of input data.
- * \param[in]   inH      height of input data.
- * \param[in]   inH      with of input data.
- * \param[in]   padc0    how many values to add before the data in dimension of
- * channel.
- * \param[in]   padc1    how many values to add after the data in dimension of
- * channel.
- * \param[in]   padh0    how many values to add before the data in dimension of
- * height.
- * \param[in]   padh1    how many values to add after the data in dimension of
- * height.
- * \param[in]   padw0    how many values to add before the data in dimension of
- * width.
- * \param[in]   padw1    how many values to add after the data in dimension of
- * width.
+ *         The input and output is a 4D tensor. Padding zeros from the 2nd to
+ *         the 4th dimension according to the pad argument.
  *
+ * \param[out] outputs save results.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  pad     the padding config, contains the size along the
+ *                     specify dimension.
  */
 template <DeviceType Device>
 void Pad(real* outputs,
@@ -49,36 +54,19 @@ void Pad(real* outputs,
          const int inC,
          const int inH,
          const int inW,
-         const int padc0,
-         const int padc1,
-         const int padh0,
-         const int padh1,
-         const int padw0,
-         const int padw1);
+         const PadConf& pad);
 
 /**
  * \brief   Padding operation backward.
- *          The data structure of image data is NCHW.
- *
- * \param[out]  inGrad   gradients of previous layer.
- * \param[in]   outGrad  output gradients.
- * \param[in]   num      batch size of input data.
- * \param[in]   inC      channel number of input data.
- * \param[in]   inH      height of input data.
- * \param[in]   inH      with of input data.
- * \param[in]   padc0    how many values to add before the data in dimension of
- * channel.
- * \param[in]   padc1    how many values to add after the data in dimension of
- * channel.
- * \param[in]   padh0    how many values to add before the data in dimension of
- * height.
- * \param[in]   padh1    how many values to add after the data in dimension of
- * height.
- * \param[in]   padw0    how many values to add before the data in dimension of
- * width.
- * \param[in]   padw1    how many values to add after the data in dimension of
- * width.
  *
+ * \param[out] inGrad  gradients of previous layer.
+ * \param[in]  outGrad output gradients.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  pad     the padding config, contains the size along the
+ *                     specify dimension.
  */
 template <DeviceType Device>
 void PadGrad(real* inGrad,
@@ -87,10 +75,5 @@ void PadGrad(real* inGrad,
              const int inC,
              const int inH,
              const int inW,
-             const int padc0,
-             const int padc1,
-             const int padh0,
-             const int padh1,
-             const int padw0,
-             const int padw1);
+             const PadConf& pad);
 }  // namespace paddle
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
index 578d6e86d7..9104b1aca5 100644
--- a/paddle/function/PadOpGpu.cu
+++ b/paddle/function/PadOpGpu.cu
@@ -40,20 +40,18 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
                           const int inC,
                           const int inH,
                           const int inW,
-                          const int padc0,
-                          const int padc1,
-                          const int padh0,
-                          const int padh1,
-                          const int padw0,
-                          const int padw1) {
+                          const PadConf& pad) {
   size_t nth = num * inC * inH * inW;
   int blockSize = 1024;
   int gridSize = (nth + 1024 - 1) / 1024;
-  int outC = inC + padc0 + padc1;
-  int outH = inH + padh0 + padh1;
-  int outW = inW + padw0 + padw1;
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
   KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (outputs, inputs, inC, inH, inW, padc0, padh0, padw0,
+    (outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
   CHECK_SYNC("Pad");
 }
@@ -81,20 +79,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
                               const int inC,
                               const int inH,
                               const int inW,
-                              const int padc0,
-                              const int padc1,
-                              const int padh0,
-                              const int padh1,
-                              const int padw0,
-                              const int padw1) {
+                              const PadConf& pad) {
   int nth = num * inC * inH * inW;
   int blockSize = 1024;
   int gridSize = (nth + 1024 - 1) / 1024;
-  int outC = inC + padc0 + padc1;
-  int outH = inH + padh0 + padh1;
-  int outW = inW + padw0 + padw1;
+  int cstart = pad.channelStart, cend = pad.channelEnd;
+  int hstart = pad.heightStart, hend = pad.heightEnd;
+  int wstart = pad.widthStart, wend = pad.widthEnd;
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
   KePadDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (inGrad, outGrad, inC, inH, inW, padc0, padh0, padw0,
+    (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
   CHECK_SYNC("PadGrad");
 }
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index dce2bac3e9..cd22d91135 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -27,12 +27,12 @@ TEST(Pad, real) {
 
           FunctionCompare compare("Pad",
                                   FuncConfig()
-                                      .set("padc0", 2)
-                                      .set("padc1", 3)
-                                      .set("padh0", 1)
-                                      .set("padh1", 2)
-                                      .set("padw0", 3)
-                                      .set("padw1", 2));
+                                      .set("cstart", 2)
+                                      .set("cend", 3)
+                                      .set("hstart", 1)
+                                      .set("hend", 2)
+                                      .set("wstart", 3)
+                                      .set("wend", 2));
           TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
           TensorShape outDims{
               numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
@@ -54,12 +54,12 @@ TEST(PadGrad, real) {
                   << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
           FunctionCompare compare("PadGrad",
                                   FuncConfig()
-                                      .set("padc0", 2)
-                                      .set("padc1", 3)
-                                      .set("padh0", 1)
-                                      .set("padh1", 2)
-                                      .set("padw0", 3)
-                                      .set("padw1", 2));
+                                      .set("cstart", 2)
+                                      .set("cend", 3)
+                                      .set("hstart", 1)
+                                      .set("hend", 2)
+                                      .set("wstart", 3)
+                                      .set("wend", 2));
           TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
           TensorShape outDims{
               numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
index a2a469ff92..bb618c09f9 100644
--- a/paddle/gserver/layers/PadLayer.cpp
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -49,21 +49,21 @@ bool PadLayer::init(const LayerMap& layerMap,
   createFunction(forward_,
                  "Pad",
                  FuncConfig()
-                     .set("padc0", padc_[0])
-                     .set("padc1", padc_[1])
-                     .set("padh0", padh_[0])
-                     .set("padh1", padh_[1])
-                     .set("padw0", padw_[0])
-                     .set("padw1", padw_[1]));
+                     .set("cstart", padc_[0])
+                     .set("cend", padc_[1])
+                     .set("hstart", padh_[0])
+                     .set("hend", padh_[1])
+                     .set("wstart", padw_[0])
+                     .set("wend", padw_[1]));
   createFunction(backward_,
                  "PadGrad",
                  FuncConfig()
-                     .set("padc0", padc_[0])
-                     .set("padc1", padc_[1])
-                     .set("padh0", padh_[0])
-                     .set("padh1", padh_[1])
-                     .set("padw0", padw_[0])
-                     .set("padw1", padw_[1]));
+                     .set("cstart", padc_[0])
+                     .set("cend", padc_[1])
+                     .set("hstart", padh_[0])
+                     .set("hend", padh_[1])
+                     .set("wstart", padw_[0])
+                     .set("wend", padw_[1]));
 
   return true;
 }
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 66817fc93b..85a28e14ae 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3617,6 +3617,31 @@ def pad_layer(input,
     input data and 3 zeros after the input data in channel dimension.
     pad_h means padding zeros in height dimension. pad_w means padding zeros
     in width dimension.
+    
+    For example,
+    
+    .. code-block::
+
+      input(2,2,2,3)  = [
+                          [ [[1,2,3], [3,4,5]],
+                            [[2,3,5], [1,6,7]] ],
+                          [ [[4,3,1], [1,8,7]],
+                            [[3,8,9], [2,3,5]] ]
+                        ]
+ 
+      pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
+      output(2,4,2,3) = [
+                          [ [[0,0,0], [0,0,0]],
+                            [[1,2,3], [3,4,5]],
+                            [[2,3,5], [1,6,7]],
+                            [[0,0,0], [0,0,0]] ],
+                          [ [[0,0,0], [0,0,0]],
+                            [[4,3,1], [1,8,7]],
+                            [[3,8,9], [2,3,5]],
+                            [[0,0,0], [0,0,0]] ]
+                        ]
+
+    The simple usage is:
 
     .. code-block:: python
 

From 4426573a6ef42dee2877b5b9bfb162c2485f9345 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Sun, 22 Jan 2017 16:51:54 +0800
Subject: [PATCH 84/88] follow comments

---
 paddle/function/CrossMapNormalOp.cpp | 29 +++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 2d2ff0ca86..e795567cd1 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -120,25 +120,36 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  * The original formula is:
  *
  *                                 Input(x, y)
- * Output(x, y) = ------------------------------------------------
- *                       alpha   /min(F, f-[N/2] + N)
- *                  (1 + ----- * |    (Input(x, y))^2 ) ^ (beta)
- *                         N     /max(0, f-[N/2])
+ * Output(x, y) = ---------------------------------------------
+ *                               -- upper
+ *                  (k + alpha * >  (Input(x, y))^2) ^ (beta)
+ *                               -- lower
  *
- * Argument in the Function:
- * Input is NCHW format, while input.shape.ndims() is equal 4.
+ * upper is `min(F, f-[N/2] + N)`
+ * lower if `max(0, f-[N/2])`
+ *
+ * Function implementation:
+ *
+ * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
  * And the meaning of each dimension(0-3) is respectively batch size,
  * feature maps, rows and columns.
- * The above formula is for each image.
+ *
+ * Input and Output in the above formula is for each map of one image, and
+ * Input(x, y), Output(x, y) represents an element in an image.
+ *
+ * In the implementation of Function, k is equal to 1,
+ * so Function has no argument for k.
+ *
+ * Function Arguments:
  *
  * \param size_      represent N
- * \param scale_     represent alpha / N
+ * \param scale_     represent alpha
  * \param pow_       represent beta
  * \param inputs[0]  represent Input
  * \param outputs[0] represent Output
  * \param outputs[1] represent The denominator in the formula(except beta)
  *
- * note:
+ * Note:
  * Save output[1] is to simplify the backward calculation.
  * TODO, if only consider the forward calculation, we can optimize to
  * remove the output[1].

From abeb3aae84027a296cf310034e398f8a9271c1e1 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Sun, 22 Jan 2017 16:55:26 +0800
Subject: [PATCH 85/88] Minor changes

---
 paddle/function/PadOp.cpp        | 6 +++---
 paddle/gserver/layers/PadLayer.h | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index df44fd0fa6..f1a0d2a1a9 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -79,10 +79,10 @@ void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
  *
  * Argument in this Function:
  * \param pad_    A struct object contains the padding size in each dimension.
- *                It has six integers. The channelStart and channelEnd indicates
+ *                It has six integers. The channelStart and channelEnd indicate
  *                how many zeros to add before and after the input in channel
- *                dimension. And the heightStart and heightEnd indicates padding
- *                in height dimension. The widthStart and widthEnd indicates the
+ *                dimension. And the heightStart and heightEnd indicate padding
+ *                in height dimension. The widthStart and widthEnd indicate the
  *                padding in width dimension.
  * \param inputs  A 4D tensor, only one input.
  * \param outputs A 4D tensor, the output value after padding.
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
index 1f5a4a54a0..3e3a21a997 100644
--- a/paddle/gserver/layers/PadLayer.h
+++ b/paddle/gserver/layers/PadLayer.h
@@ -19,8 +19,9 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * @brief response normalization across feature maps
- * namely normalize in number of size_ channels
+ * \brief  This layer pads zeros to inputs according to the specified
+ *         dimensions. The input and output are 4D tensors. Zeros are padded
+ *         in the 2nd to 4th dimensions according to padc_, padh_ and padw_.
  */
 class PadLayer : public Layer {
 public:

From 5b9450ae0869673ae11000c89adeec4d237c4ccd Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Sun, 22 Jan 2017 17:46:28 +0800
Subject: [PATCH 86/88] follow comments

---
 paddle/function/CrossMapNormalOp.cpp | 38 ++++++++++++++++++----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index e795567cd1..5c0bdd933b 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -119,14 +119,14 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  *
  * The original formula is:
  *
- *                                 Input(x, y)
- * Output(x, y) = ---------------------------------------------
- *                               -- upper
- *                  (k + alpha * >  (Input(x, y))^2) ^ (beta)
- *                               -- lower
+ *                                Input(i, x, y)
+ * Output(i, x, y) = ----------------------------------------------
+ *                                 -- upper
+ *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+ *                                 -- j = lower
  *
- * upper is `min(F, f-[N/2] + N)`
- * lower if `max(0, f-[N/2])`
+ * upper is `min(C, i + N/2)`
+ * lower is `max(0, i - N/2)`
  *
  * Function implementation:
  *
@@ -134,8 +134,12 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  * And the meaning of each dimension(0-3) is respectively batch size,
  * feature maps, rows and columns.
  *
- * Input and Output in the above formula is for each map of one image, and
- * Input(x, y), Output(x, y) represents an element in an image.
+ * Input and Output in the above formula are for each map(i) of one image, and
+ * Input(i, x, y), Output(i, x, y) represent an element in an image.
+ *
+ * C is the number of feature maps of one image, and N is a hyper-parameter
+ * that is configured when Function is initialized. The sum in the denominator
+ * is the sum of the same position in the neighboring maps.
  *
  * In the implementation of Function, k is equal to 1,
  * so Function has no argument for k.
@@ -199,20 +203,26 @@ private:
 /**
  * \brief Backward calculation for normalization with across maps.
  *
+ * Function implementation:
+ *
  * The implementation of this Function is derived from the
  * CrossMapNormalFunc implementation.
  *
  * InputGrad = OutputGrad * denoms ^ (-beta)
- *    /
- *  + | (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
- *    /
+ *    -- upper
+ *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
+ *    -- lower
  *
- * Argument in the Function:
  * The data of inputs/outputs format is the same as the forward interface
  * and is NCHW.
  *
+ * The upper and lower are the same as in the forward calculation. The logic
+ * of the sum is also the same as in the forward calculation.
+ *
+ * Function Arguments:
+ *
  * \param size_      represent N
- * \param scale_     represent alpha / N
+ * \param scale_     represent alpha
  * \param pow_       represent beta
  * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
  * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc

From a3f0aed00d304d3ac43735dcc83460b7b724f0a9 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Mon, 23 Jan 2017 18:44:51 +0800
Subject: [PATCH 87/88] add python v2 package

---
 python/CMakeLists.txt        |  4 +++-
 python/paddle/v2/__init__.py | 13 +++++++++++++
 python/setup.py.in           |  3 ++-
 3 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 python/paddle/v2/__init__.py

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1cda4762eb..ee7a5bff84 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,11 +4,13 @@ set(OUTPUT_DIR
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
+file(GLOB V2_PY_FILES . ./paddle/v2/*.py)
 
 set(PY_FILES paddle/__init__.py
              ${TRAINER_PY_FILES}
              ${HELPERS_PY_FILES}
-             ${UTILS_PY_FILES})
+             ${UTILS_PY_FILES}
+             ${V2_PY_FILES})
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
new file mode 100644
index 0000000000..f662d68263
--- /dev/null
+++ b/python/paddle/v2/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/setup.py.in b/python/setup.py.in
index b66a42e87c..1e1324eea8 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -4,7 +4,8 @@ packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
-          'paddle.utils']
+          'paddle.utils',
+          'paddle.v2']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',

From 23c8ad859c4acbf133fd00ec65643fffa27754df Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Tue, 24 Jan 2017 13:51:32 +0800
Subject: [PATCH 88/88] Make MyServer as a stack variable

---
 paddle/pserver/test/test_ProtoServer.cpp | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index e70e1670d3..04236fda2f 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -161,15 +161,8 @@ TEST(ProtoServer, extended) {
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
-
-  std::unique_ptr<MyServer> server;
-  if (FLAGS_rdma_tcp == "rdma") {
-    server.reset(new MyServer(FLAGS_port, 0));
-  } else {
-    server.reset(new MyServer(FLAGS_port));
-  }
-
-  server->start();
+  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
+  server.start();
   usleep(10000);
 
   return RUN_ALL_TESTS();