Merge branch 'develop' into cross_entropy_over_beam

8 years ago · 74d3ca8bab
parent 5e59ca7ccc ab6b3c481a
commit 74d3ca8bab
48 changed files with 1563 additions and 276 deletions
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -257,6 +257,11 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
    :noindex:

+seq_slice
+---------
+..  autoclass:: paddle.v2.layer.seq_slice
+    :noindex:
+
 kmax_sequence_score
 -------------------
 ..  autoclass:: paddle.v2.layer.kmax_sequence_score
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -15,6 +15,7 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()

 if(WITH_C_API)
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)

 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
-
-cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)

 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD

 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
-
-if(WITH_PYTHON)
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-    sgd_op
-    add_op
-    mul_op
-    rowwise_add_op
-    sigmoid_op
-    softmax_op
-    mean_op
-    cross_entropy_op
-    recurrent_op
-    uniform_random_op
-    gaussian_random_op
-    fill_zeros_like_op)
-endif(WITH_PYTHON)
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@ -72,8 +72,8 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {

 class FcOp : public operators::NetOp {
 public:
-  FcOp(const std::string &type, const VarNameMap &inputs,
-       const VarNameMap &outputs, const AttributeMap &attrs)
+  FcOp(const std::string &type, const VariableNameMap &inputs,
+       const VariableNameMap &outputs, const AttributeMap &attrs)
      : NetOp(type, inputs, outputs, attrs) {
    AppendOp(OpRegistry::CreateOp("mul",
                                  {{"X", {Input("X")}}, {"Y", {Input("W")}}},
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@ -20,13 +20,13 @@ namespace framework {
 enum class OpArgType { IN, OUT };

 static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, OperatorBase::VarNameMap* vars) {
+                       bool is_grad, VariableNameMap* vars) {
  const auto& src_inout =
      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
  auto& dst_inout = *vars;
-  const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_;
+  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto->inputs() : proto->outputs();
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
  for (const auto& arg : src_arg_list) {
    if (arg.not_in_gradient() && !is_grad) continue;
    const std::string src_name = arg.name();
@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
 }

 OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto it = OpRegistry::op_info_map().find(op->Type());
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
-                 "'%s' has not been registered.", op->Type());
-  PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.",
-                 op->Type());
-  std::string grad_op_type = it->second.grad_op_type_;
-  PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.",
-                 op->Type());
+  auto& info = OpInfoMap::Instance().Get(op->Type());
+  PADDLE_ENFORCE(info.HasGradientOp());

-  OperatorBase::VarNameMap inputs;
-  OperatorBase::VarNameMap outputs;
+  VariableNameMap inputs;
+  VariableNameMap outputs;
  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG

-  it = OpRegistry::op_info_map().find(grad_op_type);
-  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
-                 "'%s' has not been registered.", grad_op_type);
-  return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs());
+  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
+  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
 }

 }  // namespace framework
--- a/paddle/framework/op_info.cc
+++ b/paddle/framework/op_info.cc
@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_info.h"
+
+namespace paddle {
+namespace framework {
+
+static OpInfoMap* g_op_info_map = nullptr;
+
+OpInfoMap& OpInfoMap::Instance() {
+  if (g_op_info_map == nullptr) {
+    g_op_info_map = new OpInfoMap();
+  }
+  return *g_op_info_map;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/framework/attribute.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+struct OpInfo {
+  OpCreator creator_;
+  std::string grad_op_type_;
+  OpProto* proto_;
+  OpAttrChecker* checker_;
+
+  bool HasOpProtoAndChecker() const {
+    return proto_ != nullptr && checker_ != nullptr;
+  }
+
+  const OpProto& Proto() const {
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE(proto_->IsInitialized(),
+                   "Operator Proto must be initialized in op info");
+    return *proto_;
+  }
+
+  const OpAttrChecker& Checker() const {
+    PADDLE_ENFORCE_NOT_NULL(checker_,
+                            "Operator Checker has not been registered");
+    return *checker_;
+  }
+
+  const OpCreator& Creator() const {
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
+    return creator_;
+  }
+
+  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+};
+
+class OpInfoMap {
+ public:
+  static OpInfoMap& Instance();
+
+  OpInfoMap(const OpInfoMap& o) = delete;
+  OpInfoMap(OpInfoMap&& o) = delete;
+  OpInfoMap& operator=(const OpInfoMap& o) = delete;
+  OpInfoMap& operator=(OpInfoMap&& o) = delete;
+
+  bool Has(const std::string& op_type) const {
+    return map_.find(op_type) != map_.end();
+  }
+
+  void Insert(const std::string& type, const OpInfo& info) {
+    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    map_.insert({type, info});
+  }
+
+  const OpInfo& Get(const std::string& type) const {
+    auto it = map_.find(type);
+    PADDLE_ENFORCE(it != map_.end(), "Operator %s are not found", type);
+    return it->second;
+  }
+
+  template <typename Callback>
+  void IterAllInfo(Callback callback) {
+    for (auto& it : map_) {
+      callback(it.first, it.second);
+    }
+  }
+
+ private:
+  OpInfoMap() = default;
+  std::unordered_map<std::string, const OpInfo> map_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@ -19,32 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const std::string& type,
-                                                   const VarNameMap& inputs,
-                                                   const VarNameMap& outputs,
-                                                   AttributeMap attrs) {
-  auto it = op_info_map().find(type);
-  PADDLE_ENFORCE(it != op_info_map().end(),
-                 "Operator '%s' has not been registered.", type);
-  it->second.checker_->Check(attrs);
-  auto op = it->second.creator_(type, inputs, outputs, attrs);
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, AttributeMap attrs) {
+  auto& info = OpInfoMap::Instance().Get(type);
+  info.Checker().Check(attrs);
+  auto op = info.Creator()(type, inputs, outputs, attrs);
  return std::unique_ptr<OperatorBase>(op);
 }

-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
-  VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
-  VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
-  AttributeMap attrs;
-  for (auto& attr : op_desc.attrs()) {
-    attrs[attr.name()] = GetAttrValue(attr);
-  }
-
-  return CreateOp(op_desc.type(), inputs, outputs, attrs);
-}
-
-OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
+static VariableNameMap ConvertOpDescVarsToVarNameMap(
    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
-  VarNameMap ret_val;
+  VariableNameMap ret_val;
  for (auto& var : op_desc_vars) {
    auto& var_names = ret_val[var.parameter()];
    auto& var_names_in_proto = var.arguments();
@ -55,6 +41,17 @@ OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
  return ret_val;
 }

+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
+  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
+  AttributeMap attrs;
+  for (auto& attr : op_desc.attrs()) {
+    attrs[attr.name()] = GetAttrValue(attr);
+  }
+
+  return CreateOp(op_desc.type(), inputs, outputs, attrs);
+}
+
 std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"

@ -30,28 +31,16 @@ namespace paddle {
 namespace framework {

 class OpRegistry {
-  using VarNameMap = OperatorBase::VarNameMap;
-  using OpCreator = std::function<OperatorBase*(
-      const std::string& /*type*/, const VarNameMap& /*inputs*/,
-      const VarNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
-
 public:
-  struct OpInfo {
-    OpCreator creator_;
-    std::string grad_op_type_;
-    OpProto* proto_;
-    OpAttrChecker* checker_;
-  };
-
  template <typename OpType, typename ProtoMakerType, typename GradOpType>
  static void RegisterOp(const std::string& op_type,
                         const std::string& grad_op_type) {
-    PADDLE_ENFORCE(op_info_map().count(op_type) == 0,
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
                   "'%s' is registered more than once.", op_type);
    OpInfo op_info;
-    op_info.creator_ = [](const std::string& type, const VarNameMap& inputs,
-                          const VarNameMap& outputs,
-                          const AttributeMap& attrs) {
+    op_info.creator_ = [](
+        const std::string& type, const VariableNameMap& inputs,
+        const VariableNameMap& outputs, const AttributeMap& attrs) {
      return new OpType(type, inputs, outputs, attrs);
    };
    op_info.grad_op_type_ = grad_op_type;
@ -70,7 +59,7 @@ class OpRegistry {
      op_info.proto_ = nullptr;
      op_info.checker_ = nullptr;
    }
-    op_info_map().insert(std::make_pair(op_type, op_info));
+    OpInfoMap::Instance().Insert(op_type, op_info);
    // register gradient op
    if (!grad_op_type.empty()) {
      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
@ -78,21 +67,13 @@ class OpRegistry {
  }

  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
-                                                const VarNameMap& inputs,
-                                                const VarNameMap& outputs,
+                                                const VariableNameMap& inputs,
+                                                const VariableNameMap& outputs,
                                                AttributeMap attrs);

  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);

-  static VarNameMap ConvertOpDescVarsToVarNameMap(
-      const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars);
-
  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
-
-  static std::unordered_map<std::string, const OpInfo>& op_info_map() {
-    static std::unordered_map<std::string, const OpInfo> op_info_map_;
-    return op_info_map_;
-  }
 };

 class Registrar {
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name,
 }

 OperatorBase::OperatorBase(const std::string& type,
-                           const OperatorBase::VarNameMap& inputs,
-                           const OperatorBase::VarNameMap& outputs,
+                           const VariableNameMap& inputs,
+                           const VariableNameMap& outputs,
                           const AttributeMap& attrs)
    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
  static std::atomic<size_t> gUniqId(0UL);
@ -141,18 +141,10 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
    }
    return ret_val;
  }
-  auto it = OpRegistry::op_info_map().find(type_);
-  PADDLE_ENFORCE(
-      it != OpRegistry::op_info_map().end(),
-      "Operator %s not registered, cannot figure out intermediate outputs",
-      type_);
-  PADDLE_ENFORCE(
-      it->second.proto_ != nullptr,
-      "Operator %s has no OpProto, cannot figure out intermediate outputs",
-      type_);
+  auto& info = OpInfoMap::Instance().Get(Type());

  // get all OpProto::Var for outputs
-  for (auto& o : it->second.proto_->outputs()) {
+  for (auto& o : info.Proto().outputs()) {
    // ignore all intermediate output
    if (o.intermediate()) continue;
    auto out = outputs_.find(o.name());
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>

+#include "op_info.h"
 #include "paddle/framework/attribute.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/scope.h"
@ -62,10 +63,8 @@ class ExecutionContext;
 */
 class OperatorBase {
 public:
-  using VarNameMap = std::map<std::string, std::vector<std::string>>;
-
-  OperatorBase(const std::string& type, const VarNameMap& inputs,
-               const VarNameMap& outputs, const AttributeMap& attrs);
+  OperatorBase(const std::string& type, const VariableNameMap& inputs,
+               const VariableNameMap& outputs, const AttributeMap& attrs);

  virtual ~OperatorBase() {}

@ -93,8 +92,8 @@ class OperatorBase {
  /// rename inputs outputs name
  void Rename(const std::string& old_name, const std::string& new_name);

-  const VarNameMap& Inputs() const { return inputs_; }
-  const VarNameMap& Outputs() const { return outputs_; }
+  const VariableNameMap& Inputs() const { return inputs_; }
+  const VariableNameMap& Outputs() const { return outputs_; }
  //! Get a input with argument's name described in `op_proto`
  const std::string& Input(const std::string& name) const;
  //! Get a input which has multiple variables.
@ -122,30 +121,32 @@ class OperatorBase {
  // I (Inputs)opear
  // O (Outputs)
  // OG (Output Gradients)
-  VarNameMap inputs_;
+  VariableNameMap inputs_;

  // NOTE: in case of OpGrad, outputs_ contains
  // IG (Inputs Gradients)
-  VarNameMap outputs_;
+  VariableNameMap outputs_;
  AttributeMap attrs_;
 };

 // Macro for define a clone method.
 // If you are writing an kernel operator, `Clone` will be defined when you
 // register it. i.e. `Clone` method is not needed to define by yourself.
-#define DEFINE_OP_CLONE_METHOD(CLS)                       \
+#define DEFINE_OP_CLONE_METHOD(cls)                       \
  std::unique_ptr<OperatorBase> Clone() const final {     \
-    return std::unique_ptr<OperatorBase>(new CLS(*this)); \
+    return std::unique_ptr<OperatorBase>(new cls(*this)); \
  }

 // Macro for define a default constructor for Operator.
 // You can also use
 //   using PARENT_CLASS::PARENT_CLASS;
 // to use parent's constructor.
-#define DEFINE_OP_CONSTRUCTOR(CLS, PARENT_CLS)                                 \
-  CLS(const std::string& type, const VarNameMap& inputs,                       \
-      const VarNameMap& outputs, const paddle::framework::AttributeMap& attrs) \
-      : PARENT_CLS(type, inputs, outputs, attrs) {}
+#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)             \
+  cls(const std::string& type,                             \
+      const ::paddle::framework::VariableNameMap& inputs,  \
+      const ::paddle::framework::VariableNameMap& outputs, \
+      const paddle::framework::AttributeMap& attrs)        \
+      : parent_cls(type, inputs, outputs, attrs) {}

 class NOP : public OperatorBase {
 public:
@ -389,8 +390,8 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;

-  OperatorWithKernel(const std::string& type, const VarNameMap& inputs,
-                     const VarNameMap& outputs, const AttributeMap& attrs)
+  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
+                     const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void InferShape(const Scope& scope) const override {
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@ -23,8 +23,8 @@ static int op_run_num = 0;

 class OpWithoutKernelTest : public OperatorBase {
 public:
-  OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs,
-                      const VarNameMap& outputs, const AttributeMap& attrs)
+  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
+                      const VariableNameMap& outputs, const AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
@ -249,8 +249,9 @@ TEST(OpKernel, multi_inputs) {
 class OperatorClone : public paddle::framework::OperatorBase {
 public:
  DEFINE_OP_CLONE_METHOD(OperatorClone);
-  OperatorClone(const std::string& type, const VarNameMap& inputs,
-                const VarNameMap& outputs,
+  OperatorClone(const std::string& type,
+                const paddle::framework::VariableNameMap& inputs,
+                const paddle::framework::VariableNameMap& outputs,
                const paddle::framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}
  void InferShape(const paddle::framework::Scope& scope) const override {}
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -105,7 +105,10 @@ class Tensor {
  template <typename T>
  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;

-  platform::Place place() const { return holder_->place(); }
+  platform::Place place() const {
+    PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
+    return holder_->place();
+  }

 private:
  template <typename T>
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() {
                           /* width */ resultNum,
                           false,
                           /* useGpu */ false);
-    Matrix::resizeOrCreate(generator_.outArg.value,
-                           /* height */ maxGenWordCount,
-                           /* width */ 1,
-                           false,
-                           /* useGpu */ false);
  }
  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                numSequences + 1,
@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
  } else {
    oneWaySearch(numSequences);
  }
-  if (dataArgsSize_) createDataOutlink(batchMachineIdVec_);
+  if (dataArgsSize_) createDataOutlink();

  size_t size = generator_.ids.size();
  generator_.outArg.ids->resize(size);
@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
  }

  batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  generator_.ids.clear();
@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
    finalPaths_[i].resize(minFinalPathsSize);
  }

-  batchMachineIdVec_.clear();
  generator_.ids.clear();
  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
  starts[0] = 0;
  if (numResults > 1) {
-    real* probs = generator_.outArg.in->getData();
+    int idsProbSaveSize = 0;
+    for (auto inSeq : finalPaths_) {
+      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
    real* idsProb = generator_.outArg.value->getData();
+
+    real* probs = generator_.outArg.in->getData();
    size_t curPos = 0;
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() {
        curPos += genLen;
        idsProb[curPos++] = -1.0;
        probs[i * numResults + j] = path.logProb;
-
-        if (!j && dataArgsSize_) {
-          // in beam search, here only reserved the top 1 generated result
-          // for out_links that are not the generated word indices.
-          batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                                    path.machineIdVec.begin(),
-                                    path.machineIdVec.end());
-        }
      }
      starts[i + 1] = generator_.ids.size();
    }
  } else {
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      CHECK(!finalPaths_[i].empty());
-      generator_.ids.insert(generator_.ids.begin(),
-                            finalPaths_[i][0].ids.begin(),
-                            finalPaths_[i][0].ids.end());
-      starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size();
+      Path& path = finalPaths_[i][0];
+      generator_.ids.insert(
+          generator_.ids.begin(), path.ids.begin(), path.ids.end());
+      starts[i + 1] = starts[i] + path.ids.size();
    }
  }
 }
@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
  }
 }

-void RecurrentGradientMachine::createDataOutlink(
-    std::vector<int>& machineIdVec) {
-  size_t seqNum =
-      getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
-  std::vector<int> starts(seqNum + 1, 0);
-  for (size_t i = 0; i < seqNum; ++i) {
-    size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size()
-                                        : finalPaths_[0][i].ids.size();
-    starts[i + 1] = starts[i] + seqLen;
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
+    bool isSeq, std::vector<Argument>& outArgs) {
+  batchMachineIdVec_.clear();
+
+  size_t seqIdx = 0;
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
+      if (isSeq) {
+        for (size_t i = 0; i < machineIdVec.size(); ++i) {
+          size_t rowId = machineIdVec[i];
+          int* seqPos =
+              outArgs[i].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (int i = 0; i < finalPaths_.size(); ++i) {
+      for (int j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
+                                            : starts[j + 1] - starts[j];
+        batchMachineStartPos_[seqId + 1] =
+            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+        seqId++;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
  }
+}

+void RecurrentGradientMachine::createDataOutlink() {
  for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
+
    dataArgs_[i].concat(dataArgsFrame_[i],
-                        machineIdVec,
-                        starts,
+                        batchMachineIdVec_,
+                        batchMachineStartPos_,
+                        copySize,
                        useGpu_,
                        HPPL_STREAM_1,
                        PASS_TEST);
-
    auto dataAgent =
        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
    CHECK_NOTNULL(dataAgent);
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@ -190,7 +190,7 @@ public:
    std::vector<int> ids;

    /**
-     * @brief idsProb, log probability of each generated words.
+     * @brief idsProb, log probability of each generated word.
     */
    std::vector<real> idsProb;

@ -472,15 +472,43 @@ private:
  void copyDataOutlinkFrame(size_t machineCur);

  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlinks
-   * except the first one are data outlinks. This function creates the data
-   * outlinks.
-   * @note In beam search, only one generated sequence with the hightest log
-   * probabilites are retained.
-   * @param machineIdVec : select a row of output matrix in each frame
-   * that the generation process expanded.
+   * @brief In generation, if the layer group has more than 1 outlink, outlink
+   * except the first one is a data outlink. In RecurrentLayerGroup, each time
+   * step is a separate Network, outputs of a layer inside the
+   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
+   * specified as an outlink of RecurrentLayerGroup. This function will
+   * collect outputs in each time step of each generated sequence which are
+   * dispersed in separate Arguments to form a new single Argument as output of
+   * RecurrentLayerGroup.
   */
-  void createDataOutlink(std::vector<int>& machineIdVec);
+  void createDataOutlink();
+
+  /*
+   * @brief decide to select how many rows from the Matrix stored the forward
+   * pass results from a start position.
+   *
+   * @param isSeq: a flag indicating whetehr the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, number of rows to select from the
+   * Matrix stored the forward pass results from a start position.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+
+  /*
+   * @brief decide index of the start row for each time step of a generated
+   * sequence in Matrix stored the entire beam search batch's forward pass
+   * results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   */
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);

  /*
   * @brief used in beam search, connect previous frame to form recurrent link
@ -543,6 +571,7 @@ private:
  std::vector<int> topIds_;
  std::vector<int> seqIds_;
  std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
  std::vector<std::vector<Path>> finalPaths_;
  std::vector<real> minFinalPathLogProb_;
  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@ -80,13 +80,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
      << "input of " << getName()
      << " must be a sequence or a nested sequence.";
  CHECK_EQ(input.value->getWidth(), 1UL)
-      << "input of " << getName()
-      << " is score over a sequence or a nested sequence, so its width "
-      << " must be 1.";
+      << "input of " << getName() << " are scores over a sequence or "
+      << "a nested sequence, so its width must be 1.";

  if (useGpu_) {
-    // this Layer runs only in CPU, if the model is runing on GPU,
-    // then copy the input to this layer from GPU to CPU.
+    /*
+     * currently, this Layer only runs in CPU, if the other part of the model is
+     * runing on GPU, then copy the input to this layer from GPU to CPU.
+     */
    Matrix::resizeOrCreate(scores_,
                           inputScore->getHeight(),
                           1,
@ -97,6 +98,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
    scores_ = inputScore;
  }

+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but output of this layer which is some selected indices of the give
+   * sequence are actually filled with int types so that storing int types
+   * information in a real number matrix is dangerous, since real numbers will
+   * be convered to int types.
+   */
  Matrix::resizeOrCreate(
      output_.value,
      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SequenceSliceLayer : public Layer {
+public:
+  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but the second and the (optional) third input which are some
+   * selected indices of the give sequence to trim the sequence, are actually
+   * filled with int types so that storing int types information in real number
+   * matrices is very dangerous, since real numbers will be convered to int
+   * types. If a user fills this matrix himself, invalid data may occor.
+   */
+
+  MatrixPtr startIdsOnCpu_;
+  MatrixPtr endIdsOnCpu_;
+
+  std::vector<int> selectedRows_;
+  IVectorPtr rowIndice_;
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+  std::vector<int> outSubSeqStartPos_;
+  std::vector<int> outSeqStartPos_;
+
+  void checkInputs();
+  void copySliceIdsToCpu();
+  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
+};
+
+REGISTER_LAYER(seq_slice, SequenceSliceLayer);
+
+bool SequenceSliceLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_GE(inputLayers_.size(), 2U);
+  CHECK_LE(inputLayers_.size(), 3U);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceSliceLayer::checkInputs() {
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
+                           << "must be a sequence.";
+  const MatrixPtr indices1 = getInputValue(1);
+  CHECK_EQ(static_cast<size_t>(indices1->getHeight()),
+           inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                : inputSeq.getNumSequences())
+      << "Height of the second input should be equal to number of sequence "
+      << "in the first input.";
+  if (inputLayers_.size() == 3) {
+    const MatrixPtr indices2 = getInputValue(2);
+    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
+        << "start indices and end indices should have the same height.";
+    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
+        << "start indices and end indices should have the same Width.";
+  }
+}
+
+void SequenceSliceLayer::copySliceIdsToCpu() {
+  const MatrixPtr indices1 = getInputValue(1);
+  if (inputLayers_.size() == 2U) {
+    if (config_.select_first()) {
+      Matrix::resizeOrCreate(startIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      startIdsOnCpu_->copyFrom(*indices1);
+      endIdsOnCpu_ = nullptr;
+    } else {
+      Matrix::resizeOrCreate(endIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      endIdsOnCpu_->copyFrom(*indices1);
+      startIdsOnCpu_ = nullptr;
+    }
+  } else if (inputLayers_.size() == 3U) {
+    Matrix::resizeOrCreate(startIdsOnCpu_,
+                           indices1->getHeight(),
+                           indices1->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    startIdsOnCpu_->copyFrom(*indices1);
+
+    const MatrixPtr indices2 = getInputValue(2);
+    Matrix::resizeOrCreate(endIdsOnCpu_,
+                           indices2->getHeight(),
+                           indices2->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    endIdsOnCpu_->copyFrom(*indices2);
+  }
+}
+
+void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
+                                         const MatrixPtr ends) {
+  CHECK(starts || ends) << "At least one of the start or end indices "
+                        << "should be given.";
+
+  outSeqStartPos_.resize(1, 0);
+  outSubSeqStartPos_.resize(1, 0);
+  selectedRows_.clear();
+
+  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
+  size_t rowIdx = 0;
+  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
+    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
+      for (size_t k = 0; k < beamSize; ++k) {
+        if (starts && starts->getElement(rowIdx, k) == -1.) break;
+        if (ends && ends->getElement(rowIdx, k) == -1.) break;
+
+        int begPos = inputSeqInfoVec_[i][j];
+        if (starts) begPos += starts->getElement(rowIdx, k);
+
+        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
+        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
+
+        int seqLen = endPos - begPos + 1;
+        CHECK_GT(seqLen, 0U);
+        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
+        inputSeqInfoVec_.size() > 1
+            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
+            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
+      }
+      rowIdx++;
+    }
+    if (inputSeqInfoVec_.size() > 1)
+      outSeqStartPos_.push_back(outSubSeqStartPos_.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
+
+  if (inputSeqInfoVec_.size() > 1) {
+    ICpuGpuVector::resizeOrCreate(
+        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
+    output_.subSequenceStartPositions->copyFrom(
+        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
+  }
+}
+
+void SequenceSliceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+
+  const Argument& inputSeq = getInput(0);
+  inputSeqInfoVec_.clear();
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  if (!useGpu_) {
+    if (inputLayers_.size() == 2U) {
+      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
+      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
+    } else if (inputLayers_.size() == 3U) {
+      startIdsOnCpu_ = getInputValue(1);
+      endIdsOnCpu_ = getInputValue(2);
+    }
+  } else
+    copySliceIdsToCpu();
+
+  // calculate the selected row indices in a batch,
+  // and build the output sequence information.
+  calSelectedRows(startIdsOnCpu_ ? startIdsOnCpu_ : nullptr,
+                  endIdsOnCpu_ ? endIdsOnCpu_ : nullptr);
+
+  resetOutput(selectedRows_.size(), getSize());
+
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SequenceSliceLayer::backward(const UpdateCallback& callback) {
+  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@ -52,23 +52,34 @@ private:
   *   ]
   *
   * ths output is saved to private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
-   *  16,17,18,19,20,21,22,23,24,25,26,27]
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
   */

-  void calSelectedCols(const MatrixPtr selectedIndices,
+  void calSelectedRows(const MatrixPtr selectedIndices,
                       const std::vector<std::vector<int>>& inputSeqInfo);

-  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but the second is some selected indices of the give sequence to trim
+   * the nested sequence, are actually filled with int types so that storing
+   * int types information in real number matrices is very dangerous, since
+   * real numbers will be convered to int types. If a user fills this matrix
+   * himself, invalid data may occor.
+   *
+   * if the second input of this layer is on GPU memory, copy it to CPU memory.
+   */
  MatrixPtr selIdsCpu_;

-  // reorganized sequenceStartPositions and subSequenceStartPositions
-  // into a 2d vector to facilitate the sequence selection process.
+  /*
+   * reorganize sequenceStartPositions and subSequenceStartPositions
+   * into a 2d vector to facilitate the sequence selection process.
+   */
  std::vector<std::vector<int>> inputSeqInfoVec_;

-  // the final selected row indices in a batch,
-  // rowIdx_ and selectedRows_ actually share a same memory.
+  /* store the final selected row indices in a batch */
  IVectorPtr rowIndice_;
+  /* rowIndice_ and selectedRows_ actually share a same memory. */
  std::vector<int> selectedRows_;
 };

@ -83,7 +94,7 @@ bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
  return true;
 }

-void SubNestedSequenceLayer::calSelectedCols(
+void SubNestedSequenceLayer::calSelectedRows(
    const MatrixPtr selectedIndices,
    const std::vector<std::vector<int>>& inputSeqInfo) {
  selectedRows_.clear();
@ -160,7 +171,7 @@ void SubNestedSequenceLayer::forward(PassType passType) {
  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
                              inputSeq.subSequenceStartPositions,
                              inputSeqInfoVec_);
-  calSelectedCols(selIdsCpu_, inputSeqInfoVec_);
+  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);

  resetOutput(selectedRows_.size(), getSize());
  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@ -41,6 +41,13 @@ add_unittest_without_exec(test_CrossEntropyOverBeam
 add_test(NAME test_CrossEntropyOverBeam
    COMMAND test_CrossEntropyOverBeam)

+################ test_SeqSliceLayerGrad ####################
+add_unittest_without_exec(test_SeqSliceLayerGrad
+    test_SeqSliceLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_SeqSliceLayerGrad
+    COMMAND test_SeqSliceLayerGrad)
+
 add_unittest_without_exec(test_ActivationGrad
    test_ActivationGrad.cpp
    LayerGradUtil.cpp)
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@ -0,0 +1,223 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+
+  srand((size_t)(time(NULL)));
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // layer size is not crutial for this layer,
+  // so here use a small layer size in the unittest.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+
+  std::vector<bool> mode = {false};
+#ifndef PADDLE_ONLY_CPU
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@ -43,6 +43,7 @@ endfunction()

 add_subdirectory(math)
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+op_library(gather_op SRCS gather_op.cc gather_op.cu)

 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)

@ -68,3 +69,4 @@ op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
    DEPS framework_proto tensor op_registry operator net_op)
 op_library(uniform_random_op
        SRCS uniform_random_op.cc uniform_random_op.cu)
+op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@ -17,6 +17,7 @@ limitations under the License. */
 #include <cstring>

 #include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/place.h"

@ -25,13 +26,13 @@ namespace operators {

 // Implementation of CPU copy
 template <typename T>
-void CPUGather(const T* params, const int* indices, const int slice_size,
+void CPUGather(const T* src, const int* indices, const int slice_size,
               const int index_size, T* output) {
  const size_t slice_bytes = slice_size * sizeof(T);

  for (int i = 0; i < index_size; ++i) {
    int index_ = indices[i];
-    memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
+    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
  }
 }

@ -55,7 +56,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
  int index_size = index->dims()[0];

  auto src_dims = src->dims();
-  paddle::framework::DDim output_dims(src_dims);
+  framework::DDim output_dims(src_dims);
  output_dims[0] = index_size;

  // slice size
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gather_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class GatherOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
+    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
+    framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
+    output_dims[0] = batch_size;
+    ctx.Output<Tensor>("Out")->Resize(output_dims);
+  }
+};
+
+class GatherGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X = ctx.Input<Tensor>("X");
+
+    X_grad->Resize(X->dims());
+  }
+};
+
+class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GatherOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The source input of gather op");
+    AddInput("Index", "The index input of gather op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Gather Operator by selecting from the first axis, 
+
+Out = X[Index]
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
+            ops::GatherGradOp);
+REGISTER_OP_CPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    gather_grad,
+    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/gather_op.cu
+++ b/paddle/operators/gather_op.cu
@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gather_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather,
+                       ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class GatherOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Y = ctx.Output<Tensor>("Out");
+
+    Y->mutable_data<T>(ctx.GetPlace());
+    Gather<T>(ctx.GetPlace(), X, Index, Y);
+  }
+};
+
+template <typename Place, typename T>
+class GatherGradientOpKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    dX->mutable_data<T>(ctx.GetPlace());
+    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/Show More
+++ b/Show More