From 60f706a1d6f497088f1957354910176e649059e8 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Tue, 10 Oct 2017 19:04:29 +0800
Subject: [PATCH 1/3] add SQRT strategy for sequence_pool_op

---
 paddle/operators/sequence_pool_op.cc            | 14 +++++------
 paddle/operators/sequence_pool_op.h             |  8 ++++++
 .../v2/framework/tests/test_seq_pool.py         | 25 +++++++++++++++++++
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 06c00d31ea..9b8d86b404 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -36,11 +36,9 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
   SequencePoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "A float LoDTensor, the variable-length input of SequencePoolOp");
-    AddOutput(
-        "Out",
-        "A float LoDTensor, the variable-length output of SequencePoolOp.");
+    AddInput("X", "A LoDTensor, the variable-length input of SequencePoolOp");
+    AddOutput("Out",
+              "A LoDTensor, the variable-length output of SequencePoolOp.");
     AddAttr<int>(
         "strategy",
         "(int, default AVERAGE) the pooling strategy of SequencePoolOp.")
@@ -49,13 +47,13 @@
     AddComment(R"DOC(
     SequencePoolOp pools features of all time-steps of each instance.

-    For a mini-batch of 3 variable lengths sentences, containing 2, 3, and 2 time-steps:
+    For a mini-batch of 3 variable-length sentences, containing 2, 3, and 2 time-steps:

-    Assume X is a [7,M,N] float LoDTensor, and X->lod()[0] = [0, 2, 5, 7].
+    Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
     Besides, for the sake of simplicity, we assume M=1 and N=1, and the value of X = [[1, 3], [2, 4, 6], [5, 1]].

-    Thus, Out is a [3,1,1] float LoDTensor, but Out->lod() is nullptr.
+    Thus, Out is a [3,1,1] LoDTensor, but Out->lod() is nullptr.
     And for different strategy, the value of Out is as follows:

     - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index 752d714125..fd056b71cf 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -77,6 +77,10 @@ class SequencePoolKernel : public framework::OpKernel<T> {
       case SUM:
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
         break;
+      case SQRT:
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
+                              std::sqrt(static_cast<T>(h));
+        break;
       default:
         PADDLE_THROW("unsupported pooling strategy");
     }
@@ -115,6 +119,10 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       case SUM:
         in_g_e.device(place) = (out_g_e).broadcast(bcast);
         break;
+      case SQRT:
+        in_g_e.device(place) =
+            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
+        break;
       default:
         PADDLE_THROW("unsupported pooling strategy");
     }
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index 211086e5f4..fbcf6dac93 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -82,5 +82,30 @@ class TestSeqSumPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))


+class TestSeqSqrtPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SQRT}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            len = lod[0][i + 1] - lod[0][i]
+            out[i] = sub_x.sum(axis=0) / np.sqrt(len)
+
+
+class TestSeqSqrtPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SQRT}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            len = lod[0][i + 1] - lod[0][i]
+            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.06)
+
+
 if __name__ == '__main__':
     unittest.main()

From 393c748c89049a7d9b8991266eeec09558395cc5 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Thu, 12 Oct 2017 19:35:46 +0800
Subject: [PATCH 2/3] add seqlastin/seqfirstin for seq_pool op

---
 paddle/operators/sequence_pool_op.h                | 17 ++++++++
 python/paddle/v2/framework/tests/test_seq_pool.py  | 40 +++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index fd056b71cf..8bfb80c33f 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -15,6 +15,7 @@ limitations under the License.
 */
 #pragma once
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"

 namespace paddle {
 namespace operators {
@@ -81,6 +82,12 @@ class SequencePoolKernel : public framework::OpKernel<T> {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
         break;
+      case LAST:
+        out_e.device(place) = in_e.chip(h - 1, 0);
+        break;
+      case FIRST:
+        out_e.device(place) = in_e.chip(0, 0);
+        break;
       default:
         PADDLE_THROW("unsupported pooling strategy");
     }
@@ -102,6 +109,10 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     int64_t w = in->numel() / dims[0];
     in_g->mutable_data<T>(context.GetPlace());
+    if (strategy > 2) {
+      // set X@Grad be zero at first when strategy is LAST/FIRST/MAX
+      math::SetConstant<Place, T>(context.device_context(), in_g, 0);
+    }
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       auto in_g_t = in_g->Slice(static_cast<int>(lod[i]),
@@ -123,6 +134,12 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
         break;
+      case LAST:
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e;
+        break;
+      case FIRST:
+        in_g_e.chip(0, 0).device(place) = out_g_e;
+        break;
       default:
         PADDLE_THROW("unsupported pooling strategy");
     }
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index fbcf6dac93..0ebf78bf8f 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -107,5 +107,45 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
         self.check_grad(["X"], "Out", max_relative_error=0.06)


+class TestSeqLastPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.LAST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x[-1, :]
+
+
+class TestSeqLastPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.LAST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x[-1, :], (3, 17))
+
+
+class TestSeqFirstPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.FIRST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x[0, :]
+
+
+class TestSeqFirstPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.FIRST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x[0, :], (3, 17))
+
+
 if __name__ == '__main__':
     unittest.main()

From 6a4282a20f1f9c110ea5aef5035a0b733da6db19 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Mon, 16 Oct 2017 20:02:04 +0800
Subject: [PATCH 3/3] refine comments of sequence_pool_op

---
 paddle/operators/sequence_pool_op.cc | 7 ++++---
 paddle/operators/sequence_pool_op.h  | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 9b8d86b404..8dc4a59ba8 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -36,9 +36,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
   SequencePoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "A LoDTensor, the variable-length input of SequencePoolOp");
+    AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
     AddOutput("Out",
-              "A LoDTensor, the variable-length output of SequencePoolOp.");
+              "(Tensor), output of SequencePoolOp, which does not contain LoD "
+              "information.");
     AddAttr<int>(
         "strategy",
         "(int, default AVERAGE) the pooling strategy of SequencePoolOp.")
@@ -53,7 +54,7 @@
     Besides, for the sake of simplicity, we assume M=1 and N=1, and the value of X = [[1, 3], [2, 4, 6], [5, 1]].

-    Thus, Out is a [3,1,1] LoDTensor, but Out->lod() is nullptr.
+    Thus, Out is a [3,1,1] Tensor without LoD information.

     And for different strategy, the value of Out is as follows:

     - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index 8bfb80c33f..ce68204d41 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -109,8 +109,8 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
     int64_t w = in->numel() / dims[0];
     in_g->mutable_data<T>(context.GetPlace());
-    if (strategy > 2) {
-      // set X@Grad be zero at first when strategy is LAST/FIRST/MAX
+    if (strategy == LAST || strategy == FIRST) {
+      // set X@Grad be zero at first when strategy is LAST/FIRST
       math::SetConstant<Place, T>(context.device_context(), in_g, 0);
     }
     auto place = context.GetEigenDevice<Place>();
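
Note: as a quick sanity check on the DOC example above (X = [[1, 3], [2, 4, 6], [5, 1]], X->lod()[0] = [0, 2, 5, 7]), here is a minimal NumPy sketch of the pooling strategies touched by these patches. It is illustrative only and not part of the patch: the seq_pool helper and the string strategy names are made up for this example, while the real operator selects the strategy through the integer 'strategy' attribute.

import numpy as np

# Flattened input of 3 sequences with lengths 2, 3 and 2 (M = N = 1),
# exactly the example in the SequencePoolOp DOC comment.
x = np.array([1., 3., 2., 4., 6., 5., 1.])
lod = [0, 2, 5, 7]  # sequence offsets, as in X->lod()[0]


def seq_pool(x, lod, strategy):
    # Reference pooling over each slice x[lod[i]:lod[i + 1]].
    out = []
    for i in range(len(lod) - 1):
        seq = x[lod[i]:lod[i + 1]]
        h = len(seq)  # number of time-steps in sequence i
        if strategy == 'AVERAGE':
            out.append(seq.sum(axis=0) / h)
        elif strategy == 'SUM':
            out.append(seq.sum(axis=0))
        elif strategy == 'SQRT':
            # mirrors: in_e.sum(...) / std::sqrt(static_cast<T>(h))
            out.append(seq.sum(axis=0) / np.sqrt(h))
        elif strategy == 'LAST':
            out.append(seq[-1])  # mirrors in_e.chip(h - 1, 0)
        elif strategy == 'FIRST':
            out.append(seq[0])   # mirrors in_e.chip(0, 0)
        else:
            raise ValueError('unsupported pooling strategy')
    return np.array(out)


print(seq_pool(x, lod, 'AVERAGE'))  # [2. 4. 3.], matching the DOC comment
print(seq_pool(x, lod, 'SQRT'))     # [4/sqrt(2), 12/sqrt(3), 6/sqrt(2)]
print(seq_pool(x, lod, 'LAST'))     # [3. 6. 1.]
print(seq_pool(x, lod, 'FIRST'))    # [1. 2. 5.]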
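
The LAST/FIRST branches also motivate the new SetConstant call in SequencePoolGradKernel: only one time-step per sequence receives the output gradient, so X@Grad has to be zero-filled before the per-sequence scatter. A matching NumPy sketch of the LAST backward rule, again illustrative only and using the same lod layout:

import numpy as np

# Backward of the LAST strategy for 3 sequences with offsets [0, 2, 5, 7].
lod = [0, 2, 5, 7]
d_out = np.array([10., 20., 30.])  # one gradient value per sequence

d_x = np.zeros(7)  # zero-fill first, as the kernel does for X@Grad
for i in range(len(lod) - 1):
    # only the last time-step of sequence i gets the gradient;
    # FIRST would write to index lod[i] instead of lod[i + 1] - 1
    d_x[lod[i + 1] - 1] = d_out[i]

print(d_x)  # [ 0. 10.  0.  0. 20.  0. 30.]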