From 23b0388f46e959d2334ba561ed04eefda257edf6 Mon Sep 17 00:00:00 2001
From: wanghaox
Date: Thu, 9 Nov 2017 17:56:55 +0800
Subject: [PATCH 1/4] add sub sequence operator code and unittest

---
 paddle/operators/sub_sequence_op.cc                |  99 ++++++++++++
 paddle/operators/sub_sequence_op.cu                |  25 +++
 paddle/operators/sub_sequence_op.h                 | 156 +++++++++++++++++++
 .../v2/framework/tests/test_sub_sequence_op.py     |  40 +++++
 4 files changed, 320 insertions(+)
 create mode 100755 paddle/operators/sub_sequence_op.cc
 create mode 100755 paddle/operators/sub_sequence_op.cu
 create mode 100755 paddle/operators/sub_sequence_op.h
 create mode 100755 python/paddle/v2/framework/tests/test_sub_sequence_op.py

diff --git a/paddle/operators/sub_sequence_op.cc b/paddle/operators/sub_sequence_op.cc
new file mode 100755
index 0000000000..f1e1c862a0
--- /dev/null
+++ b/paddle/operators/sub_sequence_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sub_sequence_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SubSequenceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SubSequenceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SubSequenceOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+
+    auto offsets = ctx->Attrs().Get<std::vector<int>>("offset");
+    auto sizes = ctx->Attrs().Get<std::vector<int>>("size");
+
+    auto dim_0 = 0;
+    for (size_t i = 0; i < sizes.size(); ++i) {
+      dim_0 += sizes[i];
+    }
+
+    framework::DDim out_dims = input_dims;
+    out_dims[0] = dim_0;
+    ctx->SetOutputDim("Out", out_dims);
+  }
+};
+
+class SubSequenceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+};
+
+class SubSequenceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SubSequenceOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor), "
+             "the variable-length input of SubSequenceOp");
+    AddAttr<std::vector<int>>(
+        "offset",
+        "A list that describes the offset of each sub-sequence item.");
+    AddAttr<std::vector<int>>(
+        "size",
+        "A list that describes the size of each sub-sequence item.");
+    AddOutput("Out",
+              "(LoDTensor), the variable-length output of "
+              "SubSequenceOp.");
+    AddComment(R"DOC(
+Sub Sequence operator
+
+The operator crops a subsequence from a given sequence with the given start offset and subsequence size.
+It only supports sequences (LoD tensors whose LoD level is 1).
+- Case:
+    LoD(x) = {{0, 3, 6, 10}}; Dims(x) = (10, 3, 2)
+    offset = (0, 1, 1); size = (2, 1, 2)
+    LoD(Out) = {{0, 2, 3, 5}}; Dims(Out) = (5, 3, 2)
+NOTE: The input, offset, and size should have the same number of sequences. Offsets start from 0.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sub_sequence, ops::SubSequenceOp, ops::SubSequenceOpMaker,
+            sub_sequence_grad, ops::SubSequenceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sub_sequence,
+    ops::SubSequenceOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sub_sequence_grad,
+    ops::SubSequenceGradOpKernel<paddle::platform::CPUPlace, float>);
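For reference, the semantics the DOC comment describes can be sketched in a few lines of NumPy. This is an illustration of the intended behaviour, not part of the patch; the helper name sub_sequence_ref is made up for the example.

    import numpy as np

    def sub_sequence_ref(x, lod0, offsets, sizes):
        # For each sequence i, copy x[lod0[i] + offsets[i] :
        # lod0[i] + offsets[i] + sizes[i]] and accumulate the output LoD.
        outs, out_lod = [], [0]
        for i in range(len(lod0) - 1):
            start = lod0[i] + offsets[i]
            outs.append(x[start:start + sizes[i]])
            out_lod.append(out_lod[-1] + sizes[i])
        return np.concatenate(outs, axis=0), out_lod

    # The case from the DOC comment: sequences of lengths 3, 3, 4.
    x = np.random.random((10, 3, 2)).astype('float32')
    out, out_lod = sub_sequence_ref(x, [0, 3, 6, 10], [0, 1, 1], [2, 1, 2])
    assert out.shape == (5, 3, 2) and out_lod == [0, 2, 3, 5]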
diff --git a/paddle/operators/sub_sequence_op.cu b/paddle/operators/sub_sequence_op.cu
new file mode 100755
index 0000000000..d4127347cb
--- /dev/null
+++ b/paddle/operators/sub_sequence_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/sub_sequence_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sub_sequence,
+    ops::SubSequenceOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sub_sequence_grad,
+    ops::SubSequenceGradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sub_sequence_op.h b/paddle/operators/sub_sequence_op.h
new file mode 100755
index 0000000000..cd291a382b
--- /dev/null
+++ b/paddle/operators/sub_sequence_op.h
@@ -0,0 +1,156 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+template <typename T>
+LoD subsequenceLoD(const T* in, const std::vector<int> offsets,
+                   const std::vector<int> sizes) {
+  auto out_lod = in->lod();
+  size_t lod_offset = 0;
+
+  auto n = in->lod()[0].size() - 1;
+  out_lod[0][0] = 0;
+  for (size_t i = 0; i < n; ++i) {
+    lod_offset += sizes[i];
+    out_lod[0][i + 1] = lod_offset;
+  }
+  return out_lod;
+}
+
+template <typename Place, typename T>
+class SubSequenceOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    std::vector<int> offsets = ctx.Attr<std::vector<int>>("offset");
+    std::vector<int> sizes = ctx.Attr<std::vector<int>>("size");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto offset_len = offsets.size();
+    auto size_len = sizes.size();
+
+    auto lod = in->lod();
+    auto n = lod[0].size() - 1;
+
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(n, offset_len,
+                      "The lengths of the input and offset should be the same")
+    PADDLE_ENFORCE_EQ(n, size_len,
+                      "The lengths of the input and size should be the same")
+
+    for (size_t i = 0; i < n; ++i) {
+      auto offset = offsets[i];
+      auto size = sizes[i];
+      PADDLE_ENFORCE_LE(lod[0][i] + offset + size, lod[0][i + 1],
+                        "The target tensor's length overflows")
+    }
+
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_lod = subsequenceLoD(in, offsets, sizes);
+    out->set_lod(out_lod);
+
+    auto in_stride = framework::stride(in->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    size_t out_offset = 0;
+    for (size_t i = 0; i < n; ++i) {
+      auto offset = offsets[i];
+      auto size = sizes[i];
+
+      Tensor in_t = in->Slice(static_cast<int>(lod[0][i] + offset),
+                              static_cast<int>(lod[0][i] + offset + size));
+
+      StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(),
+                       in_stride, in_t.dims(), out_stride,
+                       out->data<T>() + out_offset);
+      out_offset += size * in_stride[0];
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SubSequenceGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    std::vector<int> offsets = ctx.Attr<std::vector<int>>("offset");
+    std::vector<int> sizes = ctx.Attr<std::vector<int>>("size");
+    auto* out_grad =
+        ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<LoDTensor>(framework::GradVarName("X"));
+
+    auto offset_len = offsets.size();
+    auto size_len = sizes.size();
+
+    auto lod = in->lod();
+    auto n = lod[0].size() - 1;
+
+    // check input data format
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(n, offset_len,
+                      "The lengths of the input and offset should be the same")
+    PADDLE_ENFORCE_EQ(n, size_len,
+                      "The lengths of the input and size should be the same")
+
+    for (size_t i = 0; i < n; ++i) {
+      auto offset = offsets[i];
+      auto size = sizes[i];
+      PADDLE_ENFORCE_LE(lod[0][i] + offset + size, lod[0][i + 1],
+                        "The target tensor's length overflows")
+    }
+
+    auto out_lod = subsequenceLoD(in, offsets, sizes);
+
+    x_grad->set_lod(lod);
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    auto temp = framework::EigenVector<T>::Flatten(*x_grad);
+    temp.device(ctx.GetEigenDevice<Place>()) = temp.constant(static_cast<T>(0));
+
+    auto out_grad_stride = framework::stride(out_grad->dims());
+
+    for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
+      Tensor out_grad_t =
+          out_grad->Slice(static_cast<int>(out_lod[0][i]),
+                          static_cast<int>(out_lod[0][i + 1]));
+      auto out_grad_stride = framework::stride(out_grad_t.dims());
+
+      auto x_grad_stride = framework::stride(x_grad->dims());
+
+      auto offset = offsets[i];
+      auto size = sizes[i];
+
+      Tensor x_grad_t = x_grad->Slice(static_cast<int>(lod[0][i] + offset),
+                                      static_cast<int>(lod[0][i] + offset + size));
+
+      StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>(),
+                       out_grad_stride, out_grad_t.dims(), x_grad_stride,
+                       x_grad_t.data<T>());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
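The two kernels above are a strided gather and its scatter adjoint: the forward pass copies each slice into a packed output, and the backward pass zero-fills dX and scatters dOut back. A minimal NumPy sketch of the same logic, assuming a one-level LoD (function names are illustrative, not part of the patch):

    import numpy as np

    def sub_sequence_forward(x, lod0, offsets, sizes):
        # Mirrors SubSequenceOpKernel: copy each sliced segment into Out.
        out = np.empty((sum(sizes),) + x.shape[1:], dtype=x.dtype)
        dst = 0
        for i in range(len(lod0) - 1):
            src = lod0[i] + offsets[i]
            out[dst:dst + sizes[i]] = x[src:src + sizes[i]]
            dst += sizes[i]
        return out

    def sub_sequence_backward(x_shape, lod0, offsets, sizes, out_grad):
        # Mirrors SubSequenceGradOpKernel: zero-fill dX, then scatter each
        # slice of dOut back to the positions it was copied from.
        x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
        src = 0
        for i in range(len(lod0) - 1):
            dst = lod0[i] + offsets[i]
            x_grad[dst:dst + sizes[i]] = out_grad[src:src + sizes[i]]
            src += sizes[i]
        return x_grad

    x = np.random.rand(10, 3, 2).astype('float32')
    g = np.ones((5, 3, 2), dtype='float32')
    dx = sub_sequence_backward(x.shape, [0, 3, 6, 10], [0, 1, 1], [2, 1, 2], g)
    assert dx.sum() == g.sum()  # every dOut element lands exactly once in dX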
diff --git a/python/paddle/v2/framework/tests/test_sub_sequence_op.py b/python/paddle/v2/framework/tests/test_sub_sequence_op.py
new file mode 100755
index 0000000000..73d81947bb
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_sub_sequence_op.py
@@ -0,0 +1,40 @@
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+
+class TestSubSequenceOp(OpTest):
+    def set_data(self):
+        # only support one level LoD
+        x = np.random.random((100, 3, 2)).astype('float32')
+        lod = [[0, 20, 40, 60, 80, 100]]
+        offsets = np.array([1, 2, 3, 4, 5]).flatten()
+        sizes = np.array([10, 8, 6, 4, 2]).flatten()
+
+        self.inputs = {'X': (x, lod)}
+        self.attrs = {'offset': offsets, 'size': sizes}
+        outs = []
+        out_lod = [[0]]
+        out_lod_offset = 0
+        for i in range(len(offsets)):
+            sub_x = x[lod[0][i] + offsets[i]: lod[0][i] + offsets[i] + sizes[i], :]
+            outs.append(sub_x)
+            out_lod_offset = out_lod_offset + len(sub_x)
+            out_lod[0].append(out_lod_offset)
+
+        outs = np.concatenate(outs, axis=0)
+        self.outputs = {'Out': outs}
+
+    def setUp(self):
+        self.op_type = "sub_sequence"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+if __name__ == '__main__':
+    unittest.main()
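The expected output LoD built in set_data() is just the running sum of the slice sizes. A quick sanity check of the values this test implies (illustrative only):

    import numpy as np

    sizes = [10, 8, 6, 4, 2]
    out_lod0 = np.concatenate([[0], np.cumsum(sizes)]).tolist()
    assert out_lod0 == [0, 10, 18, 24, 28, 30]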
From f23d6cc4c871b35dbaede482464aa28470f0eb1a Mon Sep 17 00:00:00 2001
From: wanghaox
Date: Tue, 14 Nov 2017 11:41:29 +0800
Subject: [PATCH 2/4] update the sub_sequence_op to sequence_slice_op code

---
 paddle/operators/{sub_sequence_op.cc => sequence_slice_op.cc}        | 0
 paddle/operators/{sub_sequence_op.cu => sequence_slice_op.cu}        | 0
 paddle/operators/{sub_sequence_op.h => sequence_slice_op.h}          | 0
 .../tests/{test_sub_sequence_op.py => test_sequence_slice_op.py}     | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename paddle/operators/{sub_sequence_op.cc => sequence_slice_op.cc} (100%)
 rename paddle/operators/{sub_sequence_op.cu => sequence_slice_op.cu} (100%)
 rename paddle/operators/{sub_sequence_op.h => sequence_slice_op.h} (100%)
 rename python/paddle/v2/framework/tests/{test_sub_sequence_op.py => test_sequence_slice_op.py} (100%)

diff --git a/paddle/operators/sub_sequence_op.cc b/paddle/operators/sequence_slice_op.cc
similarity index 100%
rename from paddle/operators/sub_sequence_op.cc
rename to paddle/operators/sequence_slice_op.cc
diff --git a/paddle/operators/sub_sequence_op.cu b/paddle/operators/sequence_slice_op.cu
similarity index 100%
rename from paddle/operators/sub_sequence_op.cu
rename to paddle/operators/sequence_slice_op.cu
diff --git a/paddle/operators/sub_sequence_op.h b/paddle/operators/sequence_slice_op.h
similarity index 100%
rename from paddle/operators/sub_sequence_op.h
rename to paddle/operators/sequence_slice_op.h
diff --git a/python/paddle/v2/framework/tests/test_sub_sequence_op.py b/python/paddle/v2/framework/tests/test_sequence_slice_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sub_sequence_op.py
rename to python/paddle/v2/framework/tests/test_sequence_slice_op.py
From b24afd819a48685cc3e25e1124bf5c1192ce774e Mon Sep 17 00:00:00 2001
From: wanghaox
Date: Tue, 14 Nov 2017 12:08:49 +0800
Subject: [PATCH 3/4] update the sub_sequence_op to sequence_slice_op code

---
 paddle/operators/sequence_slice_op.cc              |  98 +++++++------
 paddle/operators/sequence_slice_op.cu              |  12 +-
 paddle/operators/sequence_slice_op.h               | 119 ++++++++-------
 .../framework/tests/test_sequence_slice_op.py      |  24 ++--
 4 files changed, 140 insertions(+), 113 deletions(-)

diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
index f1e1c862a0..a7e659b763 100755
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -12,37 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/sub_sequence_op.h"
+#include "paddle/operators/sequence_slice_op.h"
 
 namespace paddle {
 namespace operators {
 
-class SubSequenceOp : public framework::OperatorWithKernel {
+class SequenceSliceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SubSequenceOp should not be null.");
+                   "Input(X) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Offset"),
+                   "Input(Offset) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Length"),
+                   "Input(Length) of SequenceSliceOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SubSequenceOp should not be null.");
+                   "Output(Out) of SequenceSliceOp should not be null.");
     auto input_dims = ctx->GetInputDim("X");
 
-    auto offsets = ctx->Attrs().Get<std::vector<int>>("offset");
-    auto sizes = ctx->Attrs().Get<std::vector<int>>("size");
-
-    auto dim_0 = 0;
-    for (size_t i = 0; i < sizes.size(); ++i) {
-      dim_0 += sizes[i];
-    }
-
-    framework::DDim out_dims = input_dims;
-    out_dims[0] = dim_0;
-    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Out", input_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
-class SubSequenceGradOp : public framework::OperatorWithKernel {
+class SequenceSliceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -53,34 +55,50 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
         "The gradient of X should not be null.");
     ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
-class SubSequenceOpMaker : public framework::OpProtoAndCheckerMaker {
+class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SubSequenceOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  SequenceSliceOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor), "
-             "the variable-length input of SubSequenceOp");
-    AddAttr<std::vector<int>>(
-        "offset",
-        "A list that describes the offset of each sub-sequence item.");
-    AddAttr<std::vector<int>>(
-        "size",
-        "A list that describes the size of each sub-sequence item.");
+    AddInput("X",
+             "(LoDTensor), "
+             "the input of SequenceSliceOp.");
+    AddInput("Offset",
+             "(Tensor), "
+             "A vector that describes the offset of each sub-sequence item.");
+    AddInput("Length",
+             "(Tensor), "
+             "A vector that describes the length of each sub-sequence item.");
     AddOutput("Out",
-              "(LoDTensor), the variable-length output of "
-              "SubSequenceOp.");
+              "(LoDTensor), the output of SequenceSliceOp.");
     AddComment(R"DOC(
-Sub Sequence operator
-
-The operator crops a subsequence from a given sequence with the given start offset and subsequence size.
+Sequence slice operator
+The operator crops a subsequence from a given sequence with the given start offset and subsequence length.
 It only supports sequences (LoD tensors whose LoD level is 1).
 - Case:
-    LoD(x) = {{0, 3, 6, 10}}; Dims(x) = (10, 3, 2)
-    offset = (0, 1, 1); size = (2, 1, 2)
-    LoD(Out) = {{0, 2, 3, 5}}; Dims(Out) = (5, 3, 2)
-NOTE: The input, offset, and size should have the same number of sequences. Offsets start from 0.
+    X = [[a1, a2;
+          b1, b2;
+          c1, c2]
+         [d1, d2;
+          e1, e2]]
+    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 1, 2)
+    Offset = (0, 1); Length = (2, 1)
+
+    Out = [[a1, a2;
+            b1, b2]
+           [e1, e2]]
+    LoD(Out) = {{0, 2, 3}}
+NOTE: The input, offset, and length should have the same number of sequences. Offsets start from 0.
     )DOC");
   }
 };
@@ -89,11 +107,11 @@ NOTE: The input, offset, and size should have the same number of sequences.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sub_sequence, ops::SubSequenceOp, ops::SubSequenceOpMaker,
-            sub_sequence_grad, ops::SubSequenceGradOp);
+REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
+            sequence_slice_grad, ops::SequenceSliceGradOp);
 REGISTER_OP_CPU_KERNEL(
-    sub_sequence,
-    ops::SubSequenceOpKernel<paddle::platform::CPUPlace, float>);
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    sub_sequence_grad,
-    ops::SubSequenceGradOpKernel<paddle::platform::CPUPlace, float>);
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::CPUPlace, float>);
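With this patch the offsets and lengths arrive as int64 input tensors rather than compile-time attributes, so other operators can produce them at runtime. A hedged sketch of what feeding the documented Case would look like in NumPy terms (variable names are illustrative, not an actual Paddle API call):

    import numpy as np

    x = np.random.random((5, 1, 2)).astype('float32')
    lod = [[0, 3, 5]]
    offset = np.array([0, 1]).astype('int64')   # one entry per sequence
    length = np.array([2, 1]).astype('int64')

    # Expected result for the DOC case: rows [0:2] of sequence 0 and
    # row [1:2] of sequence 1; the output LoD would be {{0, 2, 3}}.
    out = np.concatenate([x[0 + 0:0 + 2], x[3 + 1:3 + 2]], axis=0)
    assert out.shape == (3, 1, 2)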
diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu
index d4127347cb..a9f59dadba 100755
--- a/paddle/operators/sequence_slice_op.cu
+++ b/paddle/operators/sequence_slice_op.cu
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
-
-#include "paddle/operators/sub_sequence_op.h"
+#include "paddle/operators/sequence_slice_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    sub_sequence,
-    ops::SubSequenceOpKernel<paddle::platform::GPUPlace, float>);
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    sub_sequence_grad,
-    ops::SubSequenceGradOpKernel<paddle::platform::GPUPlace, float>);
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h
index cd291a382b..7599a0abf4 100755
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 #include "paddle/operators/strided_memcpy.h"
 
 namespace paddle {
@@ -25,109 +25,124 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;
 
 template <typename T>
-LoD subsequenceLoD(const T* in, const std::vector<int> offsets,
-                   const std::vector<int> sizes) {
-  auto out_lod = in->lod();
+LoD SequenceSliceLoD(const T& in, const int64_t* offset_data,
+                     const int64_t* length_data) {
+  auto out_lod = in.lod();
   size_t lod_offset = 0;
 
-  auto n = in->lod()[0].size() - 1;
+  auto n = in.lod()[0].size() - 1;
   out_lod[0][0] = 0;
   for (size_t i = 0; i < n; ++i) {
-    lod_offset += sizes[i];
+    lod_offset += length_data[i];
     out_lod[0][i + 1] = lod_offset;
   }
   return out_lod;
 }
 
 template <typename Place, typename T>
-class SubSequenceOpKernel : public framework::OpKernel<T> {
+class SequenceSliceOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<LoDTensor>("X");
-    std::vector<int> offsets = ctx.Attr<std::vector<int>>("offset");
-    std::vector<int> sizes = ctx.Attr<std::vector<int>>("size");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
    auto* out = ctx.Output<LoDTensor>("Out");
 
-    auto offset_len = offsets.size();
-    auto size_len = sizes.size();
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      framework::Tensor offset_cpu;
+      offset_cpu.mutable_data<int64_t>(offset->dims(), platform::CPUPlace());
+      offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context());
+      offset_data = offset_cpu.data<int64_t>();
+
+      framework::Tensor length_cpu;
+      length_cpu.mutable_data<int64_t>(length->dims(), platform::CPUPlace());
+      length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context());
+      length_data = length_cpu.data<int64_t>();
+    }
 
     auto lod = in->lod();
     auto n = lod[0].size() - 1;
 
     PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(n, offset_len,
-                      "The lengths of the input and offset should be the same")
-    PADDLE_ENFORCE_EQ(n, size_len,
-                      "The lengths of the input and size should be the same")
+    PADDLE_ENFORCE_EQ(offset->dims().size(), 1UL,
+                      "The offset should be a 1-D tensor.");
+    PADDLE_ENFORCE_EQ(length->dims().size(), 1UL,
+                      "The length should be a 1-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        n, length->dims()[0],
+        "The size of the input sequence and the length array should be the same")
+    PADDLE_ENFORCE_EQ(
+        n, offset->dims()[0],
+        "The size of the input sequence and the offset array should be the same")
 
     for (size_t i = 0; i < n; ++i) {
-      auto offset = offsets[i];
-      auto size = sizes[i];
-      PADDLE_ENFORCE_LE(lod[0][i] + offset + size, lod[0][i + 1],
-                        "The target tensor's length overflows")
+      PADDLE_ENFORCE_LE(0, offset_data[i],
+                        "The offset must be greater than or equal to zero")
+      PADDLE_ENFORCE_LT(0, length_data[i],
+                        "The length must be greater than zero")
+      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
+                        lod[0][i + 1], "The target tensor's length overflows")
     }
 
     out->mutable_data<T>(ctx.GetPlace());
-    auto out_lod = subsequenceLoD(in, offsets, sizes);
+    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
     out->set_lod(out_lod);
+    math::SetConstant<Place, T> set_zero;
+    set_zero(ctx.device_context(), out, static_cast<T>(0));
 
     auto in_stride = framework::stride(in->dims());
     auto out_stride = framework::stride(out->dims());
 
     size_t out_offset = 0;
     for (size_t i = 0; i < n; ++i) {
-      auto offset = offsets[i];
-      auto size = sizes[i];
-
-      Tensor in_t = in->Slice(static_cast<int>(lod[0][i] + offset),
-                              static_cast<int>(lod[0][i] + offset + size));
+      Tensor in_t =
+          in->Slice(static_cast<int>(lod[0][i] + offset_data[i]),
+                    static_cast<int>(lod[0][i] + offset_data[i] +
+                                     length_data[i]));
 
       StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(),
                        in_stride, in_t.dims(), out_stride,
                        out->data<T>() + out_offset);
-      out_offset += size * in_stride[0];
+      out_offset += length_data[i] * in_stride[0];
     }
   }
 };
 
 template <typename Place, typename T>
-class SubSequenceGradOpKernel : public framework::OpKernel<T> {
+class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<LoDTensor>("X");
-    std::vector<int> offsets = ctx.Attr<std::vector<int>>("offset");
-    std::vector<int> sizes = ctx.Attr<std::vector<int>>("size");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
     auto* out_grad =
         ctx.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x_grad =
         ctx.Output<LoDTensor>(framework::GradVarName("X"));
 
-    auto offset_len = offsets.size();
-    auto size_len = sizes.size();
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
 
-    auto lod = in->lod();
-    auto n = lod[0].size() - 1;
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      framework::Tensor offset_cpu;
+      offset_cpu.mutable_data<int64_t>(offset->dims(), platform::CPUPlace());
+      offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context());
+      offset_data = offset_cpu.data<int64_t>();
 
-    // check input data format
-    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
-    PADDLE_ENFORCE_EQ(n, offset_len,
-                      "The lengths of the input and offset should be the same")
-    PADDLE_ENFORCE_EQ(n, size_len,
-                      "The lengths of the input and size should be the same")
-
-    for (size_t i = 0; i < n; ++i) {
-      auto offset = offsets[i];
-      auto size = sizes[i];
-      PADDLE_ENFORCE_LE(lod[0][i] + offset + size, lod[0][i + 1],
-                        "The target tensor's length overflows")
+      framework::Tensor length_cpu;
+      length_cpu.mutable_data<int64_t>(length->dims(), platform::CPUPlace());
+      length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context());
+      length_data = length_cpu.data<int64_t>();
     }
 
-    auto out_lod = subsequenceLoD(in, offsets, sizes);
+    auto lod = in->lod();
+    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
 
     x_grad->set_lod(lod);
     x_grad->mutable_data<T>(ctx.GetPlace());
-    auto temp = framework::EigenVector<T>::Flatten(*x_grad);
-    temp.device(ctx.GetEigenDevice<Place>()) = temp.constant(static_cast<T>(0));
+    math::SetConstant<Place, T> set_zero;
+    set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
 
     auto out_grad_stride = framework::stride(out_grad->dims());
 
@@ -139,11 +154,9 @@ class SubSequenceGradOpKernel : public framework::OpKernel<T> {
 
       auto x_grad_stride = framework::stride(x_grad->dims());
 
-      auto offset = offsets[i];
-      auto size = sizes[i];
-
-      Tensor x_grad_t = x_grad->Slice(static_cast<int>(lod[0][i] + offset),
-                                      static_cast<int>(lod[0][i] + offset + size));
+      Tensor x_grad_t = x_grad->Slice(
+          static_cast<int>(lod[0][i] + offset_data[i]),
+          static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
 
       StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>(),
                        out_grad_stride, out_grad_t.dims(), x_grad_stride,
                       x_grad_t.data<T>());
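Because Offset and Length are now runtime tensors, the boundary checks move into the kernel, and on the GPU path the two vectors are first copied back to the CPU before they can be read. A small Python sketch of the per-sequence checks, assuming a one-level LoD (the helper name is made up for the example):

    import numpy as np

    def validate_slice_args(lod0, offset, length):
        # Mirrors the kernel's runtime checks: the slice window of each
        # sequence must stay inside that sequence's [lod0[i], lod0[i+1]) range.
        n = len(lod0) - 1
        assert len(offset) == n and len(length) == n
        for i in range(n):
            assert offset[i] >= 0, "the offset must be non-negative"
            assert length[i] > 0, "the length must be greater than zero"
            assert lod0[i] + offset[i] + length[i] <= lod0[i + 1], \
                "the slice overflows sequence %d" % i

    validate_slice_args([0, 3, 5], np.array([0, 1]), np.array([2, 1]))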
diff --git a/python/paddle/v2/framework/tests/test_sequence_slice_op.py b/python/paddle/v2/framework/tests/test_sequence_slice_op.py
index 73d81947bb..47b616b743 100755
--- a/python/paddle/v2/framework/tests/test_sequence_slice_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_slice_op.py
@@ -3,31 +3,29 @@ import numpy as np
 import sys
 from op_test import OpTest
 
-class TestSubSequenceOp(OpTest):
+class TestSequenceSliceOp(OpTest):
     def set_data(self):
         # only support one level LoD
         x = np.random.random((100, 3, 2)).astype('float32')
         lod = [[0, 20, 40, 60, 80, 100]]
-        offsets = np.array([1, 2, 3, 4, 5]).flatten()
-        sizes = np.array([10, 8, 6, 4, 2]).flatten()
+        offset = np.array([1, 2, 3, 4, 5]).flatten().astype("int64")
+        length = np.array([10, 8, 6, 4, 2]).flatten().astype("int64")
 
-        self.inputs = {'X': (x, lod)}
-        self.attrs = {'offset': offsets, 'size': sizes}
-        outs = []
+        self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
+        outs = np.zeros((100, 3, 2)).astype('float32')
         out_lod = [[0]]
         out_lod_offset = 0
-        for i in range(len(offsets)):
-            sub_x = x[lod[0][i] + offsets[i]: lod[0][i] + offsets[i] + sizes[i], :]
-            outs.append(sub_x)
+        for i in range(len(offset)):
+            sub_x = x[lod[0][i] + offset[i]: lod[0][i] + offset[i] + length[i], :]
             out_lod_offset = out_lod_offset + len(sub_x)
+            outs[out_lod[0][i]: out_lod_offset, :] = sub_x
             out_lod[0].append(out_lod_offset)
 
-        outs = np.concatenate(outs, axis=0)
-        self.outputs = {'Out': outs}
+        self.outputs = {'Out': (outs, out_lod)}
 
     def setUp(self):
-        self.op_type = "sub_sequence"
+        self.op_type = "sequence_slice"
         self.set_data()
 
     def test_check_output(self):
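At this intermediate stage InferShape still reports Out with X's first dimension, so the test builds a zero-padded expected output and relies on the output LoD for the real lengths. A sketch of that padding scheme (values are illustrative):

    import numpy as np

    x = np.arange(20, dtype='float32').reshape(10, 2)
    lod0, offset, length = [0, 4, 10], [1, 2], [2, 3]
    outs = np.zeros_like(x)
    dst = 0
    for i in range(len(offset)):
        sub_x = x[lod0[i] + offset[i]: lod0[i] + offset[i] + length[i]]
        outs[dst:dst + len(sub_x)] = sub_x
        dst += len(sub_x)
    # Rows 0..4 hold the slices; rows 5..9 stay zero until PATCH 4/4
    # resizes Out to the summed slice length.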
From 9a18e78e69928299d06dc6ae9973f86faefb0f2b Mon Sep 17 00:00:00 2001
From: wanghaox
Date: Tue, 14 Nov 2017 19:17:16 +0800
Subject: [PATCH 4/4] update sequence slice op, fix some errors

---
 paddle/operators/sequence_slice_op.cc              | 15 ++++++------
 paddle/operators/sequence_slice_op.h               |  5 +++--
 .../tests/test_sequence_slice_op.py                | 21 ++++++++++-------
 3 files changed, 26 insertions(+), 15 deletions(-)
 rename python/paddle/v2/{framework => fluid}/tests/test_sequence_slice_op.py (60%)

diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
index a7e659b763..a5928e4cfe 100755
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -75,14 +75,17 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
              "the input of SequenceSliceOp.");
     AddInput("Offset",
              "(Tensor), "
-             "A vector that describes the offset of each sub-sequence item.");
+             "a vector that describes the slice offset of every input "
+             "sequence.");
     AddInput("Length",
              "(Tensor), "
-             "A vector that describes the length of each sub-sequence item.");
+             "a vector that describes the slice length of every input "
+             "sequence.");
     AddOutput("Out",
               "(LoDTensor), the output of SequenceSliceOp.");
     AddComment(R"DOC(
 Sequence slice operator
+
 The operator crops a subsequence from a given sequence with the given start offset and subsequence length.
 It only supports sequences (LoD tensors whose LoD level is 1).
 - Case:
     X = [[a1, a2;
           b1, b2;
           c1, c2]
          [d1, d2;
           e1, e2]]
-    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 1, 2)
-    Offset = (0, 1); Length = (2, 1)
+    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2)
+    Offset = [0, 1]; Length = [2, 1]
 
     Out = [[a1, a2;
             b1, b2]
            [e1, e2]]
-    LoD(Out) = {{0, 2, 3}}
+    LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2)
 NOTE: The input, offset, and length should have the same number of sequences. Offsets start from 0.
     )DOC");
   }
)DOC"); } diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 7599a0abf4..8717413197 100755 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -87,9 +87,10 @@ class SequenceSliceOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); + auto out_dims = in->dims(); + out_dims[0] = out_lod[0][out_lod[0].size() - 1]; + out->Resize(out_dims); out->set_lod(out_lod); - math::SetConstant set_zero; - set_zero(ctx.device_context(), out, static_cast(0)); auto in_stride = framework::stride(in->dims()); auto out_stride = framework::stride(out->dims()); diff --git a/python/paddle/v2/framework/tests/test_sequence_slice_op.py b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py similarity index 60% rename from python/paddle/v2/framework/tests/test_sequence_slice_op.py rename to python/paddle/v2/fluid/tests/test_sequence_slice_op.py index 47b616b743..80f4bfbdd1 100755 --- a/python/paddle/v2/framework/tests/test_sequence_slice_op.py +++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py @@ -5,25 +5,32 @@ from op_test import OpTest class TestSequenceSliceOp(OpTest): def set_data(self): + self.init_test_case() # only supprot one level LoD - x = np.random.random((100, 3, 2)).astype('float32') - lod = [[0, 20, 40, 60, 80, 100]] - offset = np.array([1, 2, 3, 4, 5]).flatten().astype("int64") - length = np.array([10, 8, 6, 4, 2]).flatten().astype("int64") + x = np.random.random(self.x_dim).astype('float32') + lod = self.x_lod + offset = np.array(self.offset).flatten().astype("int64") + length = np.array(self.length).flatten().astype("int64") self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length} - outs = np.zeros((100, 3, 2)).astype('float32') + outs = [] #np.zeros((100, 3, 2)).astype('float32') out_lod = [[0]] out_lod_offset = 0 for i in range(len(offset)): sub_x = x[lod[0][i] + offset[i]: lod[0] [i] + offset[i] + length[i], :] out_lod_offset = out_lod_offset + len(sub_x) - outs[out_lod[0][i]: out_lod_offset, :] = sub_x + outs.append(sub_x) out_lod[0].append(out_lod_offset) - + outs = np.concatenate(outs, axis=0) self.outputs = {'Out': (outs, out_lod)} + def init_test_case(self): + self.x_dim = (100, 3, 2) + self.x_lod = [[0, 20, 40, 60, 80, 100]] + self.offset = [1, 2, 3, 4, 5] + self.length = [10, 8, 6, 4, 2] + def setUp(self): self.op_type = "sequence_slice" self.set_data()