From f2c4bb679bb3f247c12888b878ec4dddff358e49 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Sun, 21 Jan 2018 09:24:40 -0800
Subject: [PATCH 1/8] Add lstm with recurrent projection operator

---
 paddle/operators/lstmp_op.cc                  | 296 ++++++++++++++
 paddle/operators/lstmp_op.cu.cc               |  24 ++
 paddle/operators/lstmp_op.h                   | 384 ++++++++++++++++++
 python/paddle/v2/fluid/tests/test_lstmp_op.py | 314 ++++++++++++++
 4 files changed, 1018 insertions(+)
 create mode 100644 paddle/operators/lstmp_op.cc
 create mode 100644 paddle/operators/lstmp_op.cu.cc
 create mode 100644 paddle/operators/lstmp_op.h
 create mode 100644 python/paddle/v2/fluid/tests/test_lstmp_op.py

diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
new file mode 100644
index 0000000000..4c7f7713ee
--- /dev/null
+++ b/paddle/operators/lstmp_op.cc
@@ -0,0 +1,296 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LSTMPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTMP should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Projection"),
+                   "Output(Projection) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                   "Output(Cell) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(BatchGate) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+                   "Output(BatchGate) of LSTMP should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2.");
+
+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(C0) and Input(H0) of LSTMP should not "
+                     "be null at the same time.");
+      auto h_dims = ctx->GetInputDim("H0");
+      auto c_dims = ctx->GetInputDim("C0");
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+    }
+
+    int frame_size = in_dims[1] / 4;
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto proj_dims = ctx->GetInputDim("ProjWeight");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+                      "The rank of Input(Weight) should be 2.");
+    PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1],
+                      "The first dimension of Input(Weight) "
+                      "should be %d.",
+                      proj_dims[1]);
+    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Weight) "
+                      "should be 4 * %d.",
+                      frame_size);
+
+    PADDLE_ENFORCE_EQ(proj_dims.size(), 2,
+                      "The rank of Input(ProjWeight) should be 2.");
+    PADDLE_ENFORCE_EQ(proj_dims[0], frame_size,
+                      "The first dimension of Input(ProjWeight) "
+                      "should be %d.",
+                      frame_size);
+
+    auto b_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+    PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                      "The first dimension of Input(Bias) should be 1.");
+
+    if (ctx->Attrs().Get<bool>("use_peepholes")) {
+      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "7 * %d if enable peepholes connection",
+                        frame_size);
+    } else {
+      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "4 * %d if disable peepholes connection",
+                        frame_size);
+    }
+
+    framework::DDim out_dims({in_dims[0], frame_size});
+    framework::DDim proj_out_dims({in_dims[0], proj_dims[1]});
+    ctx->SetOutputDim("Projection", proj_out_dims);
+    ctx->SetOutputDim("Cell", out_dims);
+    ctx->SetOutputDim("BatchGate", in_dims);
+    ctx->SetOutputDim("BatchCellPreAct", out_dims);
+    ctx->ShareLoD("Input", "Projection");
+    ctx->ShareLoD("Input", "Cell");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) the first input is a LodTensor, which support "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTensor is a matrix with shape (T X 4D), where T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) the initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size and D is the hidden size.")
+        .AsDispensable();
+    AddInput("C0",
+             "(Tensor, optional) the initial cell state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size. `H0` and `C0` can be NULL but only at the same time")
+        .AsDispensable();
+    AddInput("Weight",
+             "(Tensor) the learnable hidden-hidden weights."
+             " - The shape is (P x 4D), where P is the recurrent projection "
+             "layer size and  D is the hidden size. "
+             " - Weight = {W_cr, W_ir, W_fr, W_or}");
+    AddInput("ProjWeight",
+             "(Tensor) the learnable weight `W_rh` of the projection layer."
+             " - The shape is (D x P), where P is the recurrent projection "
+             "layer size and  D is the hidden size.");
+    AddInput("Bias",
+             "(Tensor) the learnable weights, which contains two parts: "
+             "input-hidden bias weight and peephole connections weight if "
+             "setting `use_peepholes` True. "
+             "1. `use_peepholes = False` "
+             " - The shape is (1 x 4D). "
+             " - Bias = {b_c, b_i, b_f, b_o}."
+             "2. `use_peepholes = True` "
+             " - The shape is (1 x 7D). "
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+    AddOutput("Projection",
+              "(LoDTensor) the projection of the hidden state of LSTMP "
+              "operator. The shape is (T x P), and lod is the same with the "
+              "`Input`.");
+    AddOutput("Cell",
+              "(LoDTensor) the cell state of LSTMP operator. "
+              "The shape is (T x D), and lod is the same with the `Input`.");
+    AddOutput("BatchGate",
+              "(LoDTensor) This LoDTensor contains input gate, forget gate "
+              "and output gate after the nonlinear computation. This "
+              "LoDTensor has the same shape as the reorganized input, which "
+              "is also be called batch input. The LoD size is 2. The first "
+              "LoD is the batch offsets and the second LoD contains the "
+              "indexes, which denote the position of reorganized sequence "
+              "in the raw input.")
+        .AsIntermediate();
+    AddOutput("BatchCellPreAct",
+              "(LoDTensor) This LoDTensor is obtained in the forward and used "
+              "in the backward.")
+        .AsIntermediate();
+    AddAttr<bool>("use_peepholes",
+                  "(bool, defalut: True) "
+                  "whether to enable diagonal/peephole connections.")
+        .SetDefault(true);
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed LSTMP.")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default: sigmoid)"
+        "The activation for input gate, forget gate and output "
+        "gate, `sigmoid` by default.")
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("cell_activation",
+                         "(string, default: tanh)"
+                         "The activation for cell output, `tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("candidate_activation",
+                         "(string, default: tanh)"
+                         "The activation for candidate hidden state, "
+                         "`tanh` by default.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddComment(R"DOC(
+Long-Short Term Memory with Recurrent Projection (LSTMP) Operator.
+
+LATMP is stand LSTM appended by a recurrent projection layer to reduce the
+number of parameters, espeacially when the output size is relative large. 
+The formula is as follows:
+
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ih}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
+
+f_t = \sigma(W_{fx}x_{t} + W_{fh}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
+
+c_t = f_t \odot c_{t-1} + i_t \odot act_g(W_{cx}x_t + W_{ch}r_{t-1} + b_c) \\
+
+o_t = \sigma(W_{ox}x_{t} + W_{oh}r_{t-1} + W_{oc}c_t + b_o) \\
+
+h_t = o_t \odot act_h(c_t)
+
+r_t = W_{rh}h_t
+$$
+
+where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to reprenset these diagonal weight matrices. The b terms
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
+is the non-line activations, such as logistic sigmoid function, and
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
+the cell output activation vector $h$. $r$ denotes the recurrent projection 
+layer.
+
+The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
+are the cell input and cell output activation functions and `tanh` is usually
+used for them.
+
+Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
+operations on the input $x_{t}$ are NOT included in this operator.
+Users can choose to use fully-connect operator before LSTMP operator.
+
+)DOC");
+  }
+};
+
+class LSTMPGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(Hidden) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Cell"),
+                   "Input(Cell) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTMP should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(BatchGate) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
+                   "Input(BatchGate) of LSTMP should not be null.");
+
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name))
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("Weight");
+    SetOutGradDim("Bias");
+    SetOutGradDim("H0");
+    SetOutGradDim("C0");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
+            ops::LSTMPGradOp);
+REGISTER_OP_CPU_KERNEL(
+    lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstmp_grad, ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.cu.cc b/paddle/operators/lstmp_op.cu.cc
new file mode 100644
index 0000000000..7fcbcfecc8
--- /dev/null
+++ b/paddle/operators/lstmp_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lstmp, ops::LSTMPKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMPKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lstmp_grad,
+    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
new file mode 100644
index 0000000000..f5a38b2ff5
--- /dev/null
+++ b/paddle/operators/lstmp_op.h
@@ -0,0 +1,384 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
+
+template <typename DeviceContext, typename T>
+class LSTMPKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+    auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(ctx.GetPlace());
+    auto* proj_out = ctx.Output<LoDTensor>("Projection");
+    proj_out->mutable_data<T>(ctx.GetPlace());
+    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+
+    auto in_dims = input->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    framework::DDim dims({in_dims[0], frame_size});
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+
+    if (bias) {
+      Tensor b = *bias;
+      b.Resize({bias->numel(), 1});
+      Tensor gate_bias = b.Slice(0, 4 * frame_size);
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
+    }
+
+    math::LstmMetaValue<T> lstmp_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      // the code style in LstmpMetaValue will be updated later.
+
+      lstmp_value.check_ig = bias_data + 4 * frame_size;
+      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+    } else {
+      lstmp_value.check_ig = nullptr;
+      lstmp_value.check_fg = nullptr;
+      lstmp_value.check_og = nullptr;
+    }
+    lstmp_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (cell_t0) {
+      // Since the batch computing for LSTMP reorders the input sequence
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
+                                         &ordered_c0, true);
+      lstmp_value.prev_state_value = ordered_c0.data<T>();
+    }
+
+    // Use the local variable as here.
+    LoDTensor batch_hidden, batch_proj, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_hidden.mutable_data<T>(dims, ctx.GetPlace());     // T x D
+    batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace());  // T x P
+    batch_cell.mutable_data<T>(dims, ctx.GetPlace());       // T x D
+    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden.Slice(bstart, bend);
+      Tensor proj_t = batch_proj.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // If n == 0 and there is no initialized hidden state, that is to say
+        // the H0 is zeros, the calculation W_h * H0 will be skiped.
+        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
+
+        // Since the batch computing for LSTMP reorders the input sequence
+        // according to their length. The initialized hidden state also needs
+        // to reorder.
+        Tensor ordered_h0, ordered_proj0;
+        ordered_proj0.Resize({1, proj_weight->dims()[1]});
+        ordered_proj0.mutable_data<T>(ctx.GetPlace());
+        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
+                                       *proj_weight, false, static_cast<T>(1.0),
+                                       &ordered_proj0, static_cast<T>(0.0));
+        math::matmul<DeviceContext, T>(device_ctx, ordered_proj0, false,
+                                       *weight, false, static_cast<T>(1.0),
+                                       &gate_t, static_cast<T>(1.0));
+      }
+
+      lstmp_value.gate_value = gate_t.data<T>();
+      lstmp_value.output_value = hidden_t.data<T>();
+      lstmp_value.state_value = cell_t.data<T>();
+      lstmp_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
+          cell_act, cand_act);
+      lstmp_value.prev_state_value = lstmp_value.state_value;
+      math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
+                                     false, static_cast<T>(1.0), &proj_t,
+                                     static_cast<T>(0.0));
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_proj.set_lod(batch_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(device_ctx, batch_proj, *proj_out);
+
+    batch_cell.set_lod(batch_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(device_ctx, batch_cell, *cell_out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LSTMPGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* proj_out = ctx.Input<LoDTensor>("Projection");
+    auto* cell_out = ctx.Input<LoDTensor>("Cell");
+
+    auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
+    auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+
+    auto* hidden_g = ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
+
+    auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto* h0 = ctx.Input<Tensor>("H0");
+    auto* c0 = ctx.Input<Tensor>("C0");
+
+    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
+    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (weight_g) {
+      weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, weight_g, static_cast<T>(0.0));
+    }
+
+    // ordered_h0/c0 is the reordered hidden/cell initialization.
+    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+    // initialization.
+    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (c0) {
+      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
+                                         true);
+    }
+    if (c0 && c0_g) {
+      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
+    }
+
+    auto in_dims = input->dims();
+    auto out_dims = hidden_g->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
+
+    math::LstmMetaValue<T> lstmp_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      lstmp_value.check_ig = bias_data + 4 * frame_size;
+      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+    } else {
+      lstmp_value.check_ig = nullptr;
+      lstmp_value.check_fg = nullptr;
+      lstmp_value.check_og = nullptr;
+    }
+
+    math::LstmMetaGrad<T> lstmp_grad;
+
+    if (bias && bias_g) {
+      bias_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, bias_g, static_cast<T>(0.0));
+    }
+    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_g_data = bias_g->data<T>();
+      lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+      lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size;
+      lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size;
+    } else {
+      lstmp_grad.check_ig_grad = nullptr;
+      lstmp_grad.check_fg_grad = nullptr;
+      lstmp_grad.check_og_grad = nullptr;
+    }
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+
+    auto ToBatch = [&batch_gate, &to_batch](
+        const DeviceContext& ctx, const framework::LoDTensor& src,
+        const framework::DDim& dims, framework::LoDTensor& dst) {
+      dst.mutable_data<T>(dims, ctx.GetPlace());
+      dst.set_lod(batch_gate->lod());
+      to_batch(ctx, src, dst, false);
+    };
+
+    LoDTensor batch_proj, batch_proj_g, batch_cell;
+    ToBatch(device_ctx, *proj_out, out_dims, batch_proj);
+    ToBatch(device_ctx, *hidden_g, out_dims, batch_proj_g);
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
+
+    LoDTensor batch_cell_g, batch_gate_g;
+    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    // TODO(qingqing) support the case output cell has gradient.
+    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
+    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
+    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
+    batch_gate_g.set_lod(batch_gate->lod());
+
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate = batch_gate->Slice(bstart, bend);
+      Tensor cell = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
+      lstmp_value.gate_value = gate.data<T>();
+      lstmp_value.state_value = cell.data<T>();
+      lstmp_value.state_active_value = cell_pre_act.data<T>();
+
+      Tensor out_g = batch_proj_g.Slice(bstart, bend);
+      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
+      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
+      lstmp_grad.state_grad = cell_g.data<T>();
+      lstmp_grad.gate_grad = gate_g.data<T>();
+      lstmp_grad.output_grad = out_g.data<T>();
+
+      if (n > 0) {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
+        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
+        lstmp_value.prev_state_value = cell_pre.data<T>();
+        lstmp_grad.prev_state_grad = cell_pre_g.data<T>();
+      } else {
+        lstmp_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
+        lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
+      }
+
+      int cur_batch_size = bend - bstart;
+      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
+          device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
+          gate_act, cell_act, cand_act);
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
+                                       static_cast<T>(1.0), &pre_proj_g,
+                                       static_cast<T>(1.0));
+        if (weight_g) {
+          /* backward weight */
+          auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
+          math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
+        }
+      } else {
+        if (h0 && weight_g) {
+          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
+                                             &ordered_h0, true);
+          math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
+        }
+        if (h0 && h0_g) {
+          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
+                                         true, static_cast<T>(1.0),
+                                         &ordered_h0_g, static_cast<T>(0.0));
+        }
+      }
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    if (in_g) {
+      /* backward data */
+      in_g->mutable_data<T>(ctx.GetPlace());
+      to_seq(device_ctx, batch_gate_g, *in_g);
+    }
+    if (bias && bias_g) {
+      /* backward bias */
+      Tensor b_g = *bias_g;
+      b_g.Resize({bias_g->numel(), 1});
+      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
+      math::ColwiseSum<DeviceContext, T> col_sum;
+      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
+    }
+
+    if (h0 && h0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
+                                         false);
+    }
+    if (c0 && c0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
+                                         false);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
new file mode 100644
index 0000000000..e35712ec06
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -0,0 +1,314 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+SIGMOID_THRESHOLD_MIN = -40.0
+SIGMOID_THRESHOLD_MAX = 13.0
+EXP_MAX_INPUT = 40.0
+
+
+def identity(x):
+    return x
+
+
+def sigmoid(x):
+    y = np.copy(x)
+    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
+    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
+    return 1. / (1. + np.exp(-y))
+
+
+def tanh(x):
+    y = -2. * x
+    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
+    return (2. / (1. + np.exp(y))) - 1.
+
+
+def relu(x):
+    return np.maximum(x, 0)
+
+
+ACTVATION = {
+    'identity': identity,
+    'sigmoid': sigmoid,
+    'tanh': tanh,
+    'relu': relu
+}
+
+
+# LSTM with recurrent projection Layer
+def lstmp(
+        input,  # T x 4D
+        lod,  # 1 x N
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_r=None,  # P x 5D
+        w_rh=None,  # D x P
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand):
+        g = np.dot(r_pre, w_r)  # 1 x 4D
+        g = g + x
+        g = np.reshape(g, (1, g.size))
+        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
+        if w_c is None:
+            g_i = act_gate(g_i)  # 1 x D
+            g_f = act_gate(g_f)  # 1 x D
+        else:
+            w_ic, w_fc, _ = np.split(w_c, 3, axis=1)
+            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
+            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
+        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
+
+        if w_c is None:
+            g_o = act_gate(g_o)  # 1 x D
+        else:
+            _, _, w_oc = np.split(w_c, 3, axis=1)
+            g_o = act_gate(g_o + w_oc * c)  # 1 x D
+        h = g_o * act_cell(c)
+        # projection
+        r = np.dot(h, w_rh)
+        return r, c
+
+    def _reverse(x, lod):
+        y = np.zeros_like(x)
+        for i in range(len(lod) - 1):
+            b, e = lod[i], lod[i + 1]
+            y[b:e, :] = np.flip(x[b:e, :], 0)
+        return y
+
+    offset = lod[0]
+    batch_size = len(offset) - 1
+    # recurrent projection state
+    projection = []
+    cell = []
+    input = _reverse(input, offset) if is_reverse else input
+    if w_b is not None:
+        input = input + np.tile(w_b, (offset[-1], 1))
+    for i in range(batch_size):
+        # compute one sequence
+        seq_len = offset[i + 1] - offset[i]
+        x = input[offset[i]:offset[i + 1], :]
+        r_pre = np.dot(h0[i], w_rh)  # 1 x P
+        c_pre = c0[i]  # 1 x D
+        for j in range(seq_len):
+            # compute one step
+            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
+                                 act_cell, act_cand)
+            projection.append(r_pre.flatten())
+            cell.append(c_pre.flatten())
+
+    projection = np.array(projection).astype('float64')
+    cell = np.array(cell).astype('float64')
+
+    projection = _reverse(projection, offset) if is_reverse else projection
+    cell = _reverse(cell, offset) if is_reverse else cell
+
+    assert projection.shape == (input.shape[0], w_r.shape[0])  # T x P
+    assert cell.shape == (input.shape[0], input.shape[1] / 4)  # T x D
+    return projection, cell
+
+
+class TestLstmOp(OpTest):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        # hidden size
+        self.D = 16
+        # projection size
+        self.P = 10
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
+        self.is_reverse = False
+        self.use_peepholes = True
+
+    def setUp(self):
+        self.set_argument()
+        self.op_type = 'lstmp'
+
+        T = self.lod[0][-1]
+        N = len(self.lod[0]) - 1
+
+        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            c0 = np.random.normal(size=(N, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((N, self.D)).astype('float64')
+            c0 = np.zeros((N, self.D)).astype('float64')
+        w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+
+        w_b = b[:, 0:4 * self.D]
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+        w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
+        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
+                     ACTVATION[self.act_gate], ACTVATION[self.act_cell],
+                     ACTVATION[self.act_cand])
+
+        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
+
+        self.inputs['Bias'] = b
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Projection': (r, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
+        }
+
+    def test_check_output(self):
+        self.check_output(atol=1e-8)
+
+    """
+    def test_check_grad(self):
+        # TODO(qingqing) remove folowing lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
+    """
+
+
+"""
+class TestLstmOpHasInitial(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = True
+        self.is_reverse = True
+        self.use_peepholes = True
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove folowing lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+            max_relative_error=5e-4)
+
+    def test_check_grad_ingore_bias(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Bias'))
+
+    def test_check_grad_ingore_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Weight'))
+
+    def test_check_grad_ingore_input(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Weight', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Input'))
+
+    def test_check_grad_ingore_h0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('H0'))
+
+    def test_check_grad_ingore_c0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('C0'))
+"""
+
+
+class TestLstmOpRerverse(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+        self.P = 10
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
+        self.is_reverse = True
+        self.use_peepholes = True
+
+
+class TestLstmOpNotUsePeepholes(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+        self.P = 10
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
+        self.is_reverse = True
+        self.use_peepholes = False
+
+
+if __name__ == '__main__':
+    unittest.main()

From 552c901204744003f0653dbcc9afe615d5a66334 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Tue, 23 Jan 2018 04:04:06 -0800
Subject: [PATCH 2/8] Enable backward computation in lstmp_op

---
 paddle/operators/lstmp_op.cc                  |  53 ++++--
 .../operators/{lstmp_op.cu.cc => lstmp_op.cu} |   0
 paddle/operators/lstmp_op.h                   | 151 +++++++++++++++---
 python/paddle/v2/fluid/tests/test_lstmp_op.py |  59 ++++---
 4 files changed, 206 insertions(+), 57 deletions(-)
 rename paddle/operators/{lstmp_op.cu.cc => lstmp_op.cu} (100%)

diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
index 4c7f7713ee..266612294c 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/operators/lstmp_op.cc
@@ -39,21 +39,12 @@ class LSTMPOp : public framework::OperatorWithKernel {
                    "Output(BatchGate) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
                    "Output(BatchGate) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(BatchHidden) of LSTMP should not be null.");
 
     auto in_dims = ctx->GetInputDim("Input");
     PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2.");
 
-    if (ctx->HasInput("H0")) {
-      PADDLE_ENFORCE(ctx->HasInput("C0"),
-                     "Input(C0) and Input(H0) of LSTMP should not "
-                     "be null at the same time.");
-      auto h_dims = ctx->GetInputDim("H0");
-      auto c_dims = ctx->GetInputDim("C0");
-      PADDLE_ENFORCE(h_dims == c_dims,
-                     "The dimension of Input(H0) and Input(C0) "
-                     "should be the same.");
-    }
-
     int frame_size = in_dims[1] / 4;
     auto w_dims = ctx->GetInputDim("Weight");
     auto proj_dims = ctx->GetInputDim("ProjWeight");
@@ -75,6 +66,18 @@ class LSTMPOp : public framework::OperatorWithKernel {
                       "should be %d.",
                       frame_size);
 
+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(C0) and Input(H0) of LSTMP should not "
+                     "be null at the same time.");
+      auto h_dims = ctx->GetInputDim("H0");
+      auto c_dims = ctx->GetInputDim("C0");
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
+    }
+
     auto b_dims = ctx->GetInputDim("Bias");
     PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
     PADDLE_ENFORCE_EQ(b_dims[0], 1,
@@ -98,6 +101,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Cell", out_dims);
     ctx->SetOutputDim("BatchGate", in_dims);
     ctx->SetOutputDim("BatchCellPreAct", out_dims);
+    ctx->SetOutputDim("BatchHidden", out_dims);
     ctx->ShareLoD("Input", "Projection");
     ctx->ShareLoD("Input", "Cell");
   }
@@ -169,6 +173,15 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor) This LoDTensor is obtained in the forward and used "
               "in the backward.")
         .AsIntermediate();
+    AddOutput("BatchHidden",
+              "(LoDTensor) This LoDTensor is obtained in the forward and used "
+              "in the backward.")
+        .AsIntermediate();
+    AddOutput("OrderedP0",
+              "(Tensor) the projection of the initial hidden state "
+              "H0. This is a tensor with shape (N x P), where N is the "
+              "batch size and P is the hidden size.")
+        .AsIntermediate();
     AddAttr<bool>("use_peepholes",
                   "(bool, defalut: True) "
                   "whether to enable diagonal/peephole connections.")
@@ -177,6 +190,12 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed LSTMP.")
         .SetDefault(false);
+    AddAttr<bool>("share_cell_act",
+                  "(bool, defalut: True) "
+                  "whether to share activation with cell output. "
+                  "If false, the projection would be linear, else "
+                  "through an activation same with the cell output.")
+        .SetDefault(true);
     AddAttr<std::string>(
         "gate_activation",
         "(string, default: sigmoid)"
@@ -213,7 +232,7 @@ o_t = \sigma(W_{ox}x_{t} + W_{oh}r_{t-1} + W_{oc}c_t + b_o) \\
 
 h_t = o_t \odot act_h(c_t)
 
-r_t = W_{rh}h_t
+r_t = act_h'(W_{rh}h_t)
 $$
 
 where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
@@ -229,7 +248,8 @@ layer.
 
 The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
 are the cell input and cell output activation functions and `tanh` is usually
-used for them.
+used for them. If `share_cell_act` setted to `False`, $act_h'$ will be linear
+else will be same with $act_h$.
 
 Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
 operations on the input $x_{t}$ are NOT included in this operator.
@@ -246,12 +266,14 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
                    "Input(Input) of LSTMP should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
-                   "Input(Hidden) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Projection"),
+                   "Input(Projection) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cell"),
                    "Input(Cell) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Weight"),
                    "Input(Weight) of LSTMP should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Bias"),
                    "Input(Bias) of LSTMP should not be null.");
 
@@ -268,6 +290,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
 
     SetOutGradDim("Input");
     SetOutGradDim("Weight");
+    SetOutGradDim("ProjWeight");
     SetOutGradDim("Bias");
     SetOutGradDim("H0");
     SetOutGradDim("C0");
diff --git a/paddle/operators/lstmp_op.cu.cc b/paddle/operators/lstmp_op.cu
similarity index 100%
rename from paddle/operators/lstmp_op.cu.cc
rename to paddle/operators/lstmp_op.cu
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
index f5a38b2ff5..9467ccdb5a 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -13,18 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/op_registry.h"
+#include "paddle/operators/activation_op.h"
 #include "paddle/operators/math/detail/activation_functions.h"
 #include "paddle/operators/math/lstm_compute.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/sequence2batch.h"
 
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
                              const framework::Tensor& src, const size_t* index,
@@ -37,6 +44,21 @@ inline void ReorderInitState(const DeviceContext& ctx,
 template <typename DeviceContext, typename T>
 class LSTMPKernel : public framework::OpKernel<T> {
  public:
+  template <typename Device, typename X, typename Y>
+  void ActCompute(const math::detail::ActivationType act_type, const Device& d,
+                  X x, Y y) const {
+    if (act_type == math::detail::ActivationType::kIdentity)
+      y.device(d) = x;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluFunctor<T>()(d, x, y);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* weight = ctx.Input<Tensor>("Weight");
@@ -44,6 +66,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     auto* bias = ctx.Input<Tensor>("Bias");
 
     auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
     auto* cell_t0 = ctx.Input<Tensor>("C0");
 
     auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
@@ -97,12 +120,13 @@ class LSTMPKernel : public framework::OpKernel<T> {
     }
 
     // Use the local variable as here.
-    LoDTensor batch_hidden, batch_proj, batch_cell;
+    LoDTensor batch_proj, batch_cell;
     auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
-    batch_hidden.mutable_data<T>(dims, ctx.GetPlace());     // T x D
+    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+    auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(dims, ctx.GetPlace());    // T x D
     batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace());  // T x P
     batch_cell.mutable_data<T>(dims, ctx.GetPlace());       // T x D
-    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
@@ -112,13 +136,15 @@ class LSTMPKernel : public framework::OpKernel<T> {
         ctx.Attr<std::string>("cell_activation"));
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
+    auto share_cell_act = ctx.Attr<bool>("share_cell_act");
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
 
       Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor hidden_t = batch_hidden.Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
       Tensor proj_t = batch_proj.Slice(bstart, bend);
       Tensor cell_t = batch_cell.Slice(bstart, bend);
       Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
@@ -140,15 +166,19 @@ class LSTMPKernel : public framework::OpKernel<T> {
         // Since the batch computing for LSTMP reorders the input sequence
         // according to their length. The initialized hidden state also needs
         // to reorder.
-        Tensor ordered_h0, ordered_proj0;
-        ordered_proj0.Resize({1, proj_weight->dims()[1]});
-        ordered_proj0.mutable_data<T>(ctx.GetPlace());
+
+        Tensor ordered_h0;
+        ordered_proj0->mutable_data<T>(ctx.GetPlace());
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
         math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
                                        *proj_weight, false, static_cast<T>(1.0),
-                                       &ordered_proj0, static_cast<T>(0.0));
-        math::matmul<DeviceContext, T>(device_ctx, ordered_proj0, false,
+                                       ordered_proj0, static_cast<T>(0.0));
+        if (share_cell_act) {
+          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+          ActCompute(cell_act, place, proj0_dev, proj0_dev);
+        }
+        math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
                                        *weight, false, static_cast<T>(1.0),
                                        &gate_t, static_cast<T>(1.0));
       }
@@ -164,6 +194,10 @@ class LSTMPKernel : public framework::OpKernel<T> {
       math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
                                      false, static_cast<T>(1.0), &proj_t,
                                      static_cast<T>(0.0));
+      if (share_cell_act) {
+        auto proj_t_dev = EigenMatrix<T>::From(proj_t);
+        ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
+      }
     }
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
@@ -180,9 +214,26 @@ class LSTMPKernel : public framework::OpKernel<T> {
 template <typename DeviceContext, typename T>
 class LSTMPGradKernel : public framework::OpKernel<T> {
  public:
+  template <typename Device, typename X, typename Y, typename DX, typename DY>
+  void ActGradCompute(const math::detail::ActivationType act_type,
+                      const Device& d, X x, Y y, DX dx, DY dy) const {
+    // x is dummy and won't be used even in Relu(use y instead)
+    if (act_type == math::detail::ActivationType::kIdentity)
+      dx.device(d) = dy;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluGradFunctor<T>()(d, x, y, dy, dx);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
     auto* bias = ctx.Input<Tensor>("Bias");
 
     auto* proj_out = ctx.Input<LoDTensor>("Projection");
@@ -190,14 +241,19 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
     auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
     auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+    auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
 
-    auto* hidden_g = ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
+    auto* projection_g =
+        ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
 
     auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
     auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* proj_weight_g =
+        ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
     auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     auto* h0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
     auto* c0 = ctx.Input<Tensor>("C0");
 
     auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
@@ -209,6 +265,10 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       weight_g->mutable_data<T>(ctx.GetPlace());
       zero(device_ctx, weight_g, static_cast<T>(0.0));
     }
+    if (proj_weight_g) {
+      proj_weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
+    }
 
     // ordered_h0/c0 is the reordered hidden/cell initialization.
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
@@ -224,7 +284,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     }
 
     auto in_dims = input->dims();
-    auto out_dims = hidden_g->dims();
+    auto out_dims = cell_out->dims();
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
     int frame_size = static_cast<int>(in_dims[1] / 4);
     PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
 
@@ -267,10 +328,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       to_batch(ctx, src, dst, false);
     };
 
-    LoDTensor batch_proj, batch_proj_g, batch_cell;
-    ToBatch(device_ctx, *proj_out, out_dims, batch_proj);
-    ToBatch(device_ctx, *hidden_g, out_dims, batch_proj_g);
-    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
+    LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
+    batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    ToBatch(device_ctx, *proj_out, proj_dims, batch_proj);        // T x P
+    ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g);  // T x P
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);         // T x D
 
     LoDTensor batch_cell_g, batch_gate_g;
     batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
@@ -286,6 +348,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         ctx.Attr<std::string>("cell_activation"));
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
+    auto share_cell_act = ctx.Attr<bool>("share_cell_act");
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
@@ -293,6 +357,19 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
 
+      Tensor cur_proj = batch_proj.Slice(bstart, bend);
+      Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+      if (share_cell_act) {
+        auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
+        auto proj_g_dev = EigenMatrix<T>::From(proj_g);
+        ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
+                       proj_g_dev);
+      }
+      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
+      math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
+                                     true, static_cast<T>(1.0), &out_g,
+                                     static_cast<T>(0.0));
+
       Tensor gate = batch_gate->Slice(bstart, bend);
       Tensor cell = batch_cell.Slice(bstart, bend);
       Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
@@ -300,7 +377,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       lstmp_value.state_value = cell.data<T>();
       lstmp_value.state_active_value = cell_pre_act.data<T>();
 
-      Tensor out_g = batch_proj_g.Slice(bstart, bend);
       Tensor gate_g = batch_gate_g.Slice(bstart, bend);
       Tensor cell_g = batch_cell_g.Slice(bstart, bend);
       lstmp_grad.state_grad = cell_g.data<T>();
@@ -337,19 +413,48 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
                                          false, static_cast<T>(1.0), weight_g,
                                          static_cast<T>(1.0));
         }
+        if (proj_weight_g) {
+          /* backward proj weigh */
+          Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+          math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
+                                         false, static_cast<T>(1.0),
+                                         proj_weight_g, static_cast<T>(1.0));
+        }
       } else {
         if (h0 && weight_g) {
           ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                              &ordered_h0, true);
-          math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
-                                         false, static_cast<T>(1.0), weight_g,
-                                         static_cast<T>(1.0));
+          if (weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
+                                           gate_g, false, static_cast<T>(1.0),
+                                           weight_g, static_cast<T>(1.0));
+          }
         }
-        if (h0 && h0_g) {
+        if (h0 && (h0_g || proj_weight_g)) {
           ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          Tensor proj0_g;
+          proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
+          proj0_g.mutable_data<T>(ctx.GetPlace());
           math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
-                                         true, static_cast<T>(1.0),
-                                         &ordered_h0_g, static_cast<T>(0.0));
+                                         true, static_cast<T>(1.0), &proj0_g,
+                                         static_cast<T>(0.0));
+          if (share_cell_act) {
+            auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+            auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
+            ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
+                           proj0_g_dev);
+          }
+          // Tensor proj0_g = proj_g.Slice(bstart, bend);
+          if (h0_g) {
+            math::matmul<DeviceContext, T>(
+                device_ctx, proj0_g, false, *proj_weight, true,
+                static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
+          }
+          if (proj_weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
+                                           proj0_g, false, static_cast<T>(1.0),
+                                           proj_weight_g, static_cast<T>(1.0));
+          }
         }
       }
     }
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index e35712ec06..81e06063fc 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -62,7 +62,8 @@ def lstmp(
         is_reverse=False,
         act_gate=None,
         act_cell=None,
-        act_cand=None):
+        act_cand=None,
+        share_cell_act=True):
     def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand):
         g = np.dot(r_pre, w_r)  # 1 x 4D
         g = g + x
@@ -85,6 +86,8 @@ def lstmp(
         h = g_o * act_cell(c)
         # projection
         r = np.dot(h, w_rh)
+        if share_cell_act:
+            r = act_cell(r)
         return r, c
 
     def _reverse(x, lod):
@@ -107,6 +110,8 @@ def lstmp(
         seq_len = offset[i + 1] - offset[i]
         x = input[offset[i]:offset[i + 1], :]
         r_pre = np.dot(h0[i], w_rh)  # 1 x P
+        if share_cell_act:
+            r_pre = act_cell(r_pre)
         c_pre = c0[i]  # 1 x D
         for j in range(seq_len):
             # compute one step
@@ -138,6 +143,7 @@ class TestLstmOp(OpTest):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
+        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = False
         self.use_peepholes = True
@@ -167,7 +173,7 @@ class TestLstmOp(OpTest):
         w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
         r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
                      ACTVATION[self.act_gate], ACTVATION[self.act_cell],
-                     ACTVATION[self.act_cand])
+                     ACTVATION[self.act_cand], self.share_cell_act)
 
         self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
 
@@ -192,28 +198,30 @@ class TestLstmOp(OpTest):
     def test_check_output(self):
         self.check_output(atol=1e-8)
 
-    """
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
-    """
+            ['Input', 'Weight', 'Bias'], ['Projection'],
+            max_relative_error=5e-3)
 
 
-"""
 class TestLstmOpHasInitial(TestLstmOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
+        self.P = 5
 
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
+        self.share_cell_act = True
         self.has_initial_state = True
         self.is_reverse = True
         self.use_peepholes = True
@@ -221,63 +229,74 @@ class TestLstmOpHasInitial(TestLstmOp):
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-            max_relative_error=5e-4)
+            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Projection'],
+            max_relative_error=5e-3)
 
     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Weight'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Bias'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('Weight'))
 
     def test_check_grad_ingore_input(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Weight', 'Bias'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Weight', 'Bias', 'C0'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
         N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-            max_relative_error=5e-4,
+            ['Input', 'Weight', 'Bias', 'H0'], ['Projection'],
+            max_relative_error=5e-3,
             no_grad_set=set('C0'))
-"""
 
 
 class TestLstmOpRerverse(TestLstmOp):
@@ -290,6 +309,7 @@ class TestLstmOpRerverse(TestLstmOp):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
+        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = True
         self.use_peepholes = True
@@ -305,6 +325,7 @@ class TestLstmOpNotUsePeepholes(TestLstmOp):
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
 
+        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = True
         self.use_peepholes = False

From 7a5b8ffacb2aa64981d6262790501b10257b6321 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Tue, 23 Jan 2018 09:51:15 -0800
Subject: [PATCH 3/8] Pass grad checking for projection weight

---
 paddle/operators/lstmp_op.cc                  |  4 +-
 paddle/operators/lstmp_op.h                   | 18 ++++----
 python/paddle/v2/fluid/tests/test_lstmp_op.py | 41 ++++++++++++-------
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
index 266612294c..932e76e913 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/operators/lstmp_op.cc
@@ -217,7 +217,7 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Long-Short Term Memory with Recurrent Projection (LSTMP) Operator.
 
-LATMP is stand LSTM appended by a recurrent projection layer to reduce the
+LSTMP is stand LSTM appended by a recurrent projection layer to reduce the
 number of parameters, espeacially when the output size is relative large. 
 The formula is as follows:
 
@@ -232,7 +232,7 @@ o_t = \sigma(W_{ox}x_{t} + W_{oh}r_{t-1} + W_{oc}c_t + b_o) \\
 
 h_t = o_t \odot act_h(c_t)
 
-r_t = act_h'(W_{rh}h_t)
+r_t = act_{h'}(W_{rh}h_t)
 $$
 
 where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
index 9467ccdb5a..0048f7e1c6 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -365,10 +365,18 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
                        proj_g_dev);
       }
+      /* hidden state backwarad */
       Tensor out_g = batch_hidden_g.Slice(bstart, bend);
       math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
                                      true, static_cast<T>(1.0), &out_g,
                                      static_cast<T>(0.0));
+      /* projection weight backward*/
+      if (proj_weight_g) {
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
+                                       false, static_cast<T>(1.0),
+                                       proj_weight_g, static_cast<T>(1.0));
+      }
 
       Tensor gate = batch_gate->Slice(bstart, bend);
       Tensor cell = batch_cell.Slice(bstart, bend);
@@ -407,19 +415,12 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
                                        static_cast<T>(1.0), &pre_proj_g,
                                        static_cast<T>(1.0));
         if (weight_g) {
-          /* backward weight */
+          /* weight backward*/
           auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
           math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
                                          false, static_cast<T>(1.0), weight_g,
                                          static_cast<T>(1.0));
         }
-        if (proj_weight_g) {
-          /* backward proj weigh */
-          Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-          math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
-                                         false, static_cast<T>(1.0),
-                                         proj_weight_g, static_cast<T>(1.0));
-        }
       } else {
         if (h0 && weight_g) {
           ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
@@ -444,7 +445,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
             ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
                            proj0_g_dev);
           }
-          // Tensor proj0_g = proj_g.Slice(bstart, bend);
           if (h0_g) {
             math::matmul<DeviceContext, T>(
                 device_ctx, proj0_g, false, *proj_weight, true,
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index 81e06063fc..a0f6955d77 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -207,8 +207,8 @@ class TestLstmOp(OpTest):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias'], ['Projection'],
-            max_relative_error=5e-3)
+            ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2)
 
 
 class TestLstmOpHasInitial(TestLstmOp):
@@ -235,8 +235,9 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Projection'],
-            max_relative_error=5e-3)
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
+            ['Projection'],
+            max_relative_error=1e-2)
 
     def test_check_grad_ingore_bias(self):
         N = len(self.lod[0]) - 1
@@ -246,8 +247,8 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight'], ['Projection'],
-            max_relative_error=5e-3,
+            ['Input', 'ProjWeight', 'Weight'], ['Projection'],
+            max_relative_error=1e-2,
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
@@ -258,10 +259,22 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Bias'], ['Projection'],
-            max_relative_error=5e-3,
+            ['Input', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
             no_grad_set=set('Weight'))
 
+    def test_check_grad_ingore_proj_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('ProjWeight'))
+
     def test_check_grad_ingore_input(self):
         N = len(self.lod[0]) - 1
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
@@ -270,8 +283,8 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Weight', 'Bias'], ['Projection'],
-            max_relative_error=5e-3,
+            ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
@@ -282,8 +295,8 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Projection'],
-            max_relative_error=5e-3,
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
+            max_relative_error=1e-2,
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
@@ -294,8 +307,8 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
         self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Projection'],
-            max_relative_error=5e-3,
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
+            max_relative_error=1e-2,
             no_grad_set=set('C0'))
 
 

From a3d1f86947dc46dfbff734cf0b5b529eaff4703e Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Tue, 23 Jan 2018 10:27:44 -0800
Subject: [PATCH 4/8] Add unit test for linear projection

---
 python/paddle/v2/fluid/tests/test_lstmp_op.py | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index a0f6955d77..8835cae504 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -192,7 +192,8 @@ class TestLstmOp(OpTest):
             'is_reverse': self.is_reverse,
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
-            'candidate_activation': self.act_cand
+            'candidate_activation': self.act_cand,
+            'share_cell_act': self.share_cell_act
         }
 
     def test_check_output(self):
@@ -340,9 +341,25 @@ class TestLstmOpNotUsePeepholes(TestLstmOp):
 
         self.share_cell_act = True
         self.has_initial_state = False
-        self.is_reverse = True
+        self.is_reverse = False
         self.use_peepholes = False
 
 
+class TestLstmOpNotShareCellAct(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+        self.P = 10
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.share_cell_act = False
+        self.has_initial_state = False
+        self.is_reverse = False
+        self.use_peepholes = True
+
+
 if __name__ == '__main__':
     unittest.main()

From db1f6a591ae9291de9877099e6801f101e679969 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Tue, 23 Jan 2018 19:29:12 -0800
Subject: [PATCH 5/8] Update doc in lstmp_op

---
 paddle/operators/lstmp_op.cc | 86 ++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
index 932e76e913..85be64f44c 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/operators/lstmp_op.cc
@@ -120,7 +120,7 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
   LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("Input",
-             "(LoDTensor) the first input is a LodTensor, which support "
+             "(LoDTensor) the input for sequence data, which supports "
              "variable-time length input sequence. The underlying tensor in "
              "this LoDTensor is a matrix with shape (T X 4D), where T is the "
              "total time steps in this mini-batch, D is the hidden size.");
@@ -132,21 +132,23 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("C0",
              "(Tensor, optional) the initial cell state is an optional "
              "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. `H0` and `C0` can be NULL but only at the same time")
+             "batch size. Only one of `H0` and `C0` can be NULL at the same "
+             "time.")
         .AsDispensable();
     AddInput("Weight",
              "(Tensor) the learnable hidden-hidden weights."
-             " - The shape is (P x 4D), where P is the recurrent projection "
-             "layer size and  D is the hidden size. "
+             " - The shape is (P x 4D), where P is the projection layer size "
+             "and  D is the hidden size."
              " - Weight = {W_cr, W_ir, W_fr, W_or}");
     AddInput("ProjWeight",
-             "(Tensor) the learnable weight `W_rh` of the projection layer."
+             "(Tensor) the learnable weight of the projection layer."
              " - The shape is (D x P), where P is the recurrent projection "
-             "layer size and  D is the hidden size.");
+             "layer size and  D is the hidden size."
+             " - ProjWeight = {W_rh}");
     AddInput("Bias",
-             "(Tensor) the learnable weights, which contains two parts: "
-             "input-hidden bias weight and peephole connections weight if "
-             "setting `use_peepholes` True. "
+             "(Tensor) the learnable biases, which contains two parts: "
+             "input-hidden biases and peephole connections weights if "
+             "setting `use_peepholes` to `True`. "
              "1. `use_peepholes = False` "
              " - The shape is (1 x 4D). "
              " - Bias = {b_c, b_i, b_f, b_o}."
@@ -155,27 +157,28 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
              " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
     AddOutput("Projection",
               "(LoDTensor) the projection of the hidden state of LSTMP "
-              "operator. The shape is (T x P), and lod is the same with the "
+              "operator. The shape is (T x P), and LoD is the same with the "
               "`Input`.");
     AddOutput("Cell",
               "(LoDTensor) the cell state of LSTMP operator. "
               "The shape is (T x D), and lod is the same with the `Input`.");
     AddOutput("BatchGate",
               "(LoDTensor) This LoDTensor contains input gate, forget gate "
-              "and output gate after the nonlinear computation. This "
-              "LoDTensor has the same shape as the reorganized input, which "
-              "is also be called batch input. The LoD size is 2. The first "
-              "LoD is the batch offsets and the second LoD contains the "
-              "indexes, which denote the position of reorganized sequence "
-              "in the raw input.")
+              "and output gate after the activations. This LoDTensor has the "
+              "same shape as the reorganized input, which is also be called "
+              "batch input. The LoD size is 2. The first-level LoD is the "
+              "batch offsets and the second contains the indices, which "
+              "denotes the position of reorganized sequence in the raw input.")
         .AsIntermediate();
     AddOutput("BatchCellPreAct",
-              "(LoDTensor) This LoDTensor is obtained in the forward and used "
-              "in the backward.")
+              "(LoDTensor) the pre-activation cell state reorganized in batch. "
+              "This LoDTensor is obtained in the forward and used in the "
+              "backward.")
         .AsIntermediate();
     AddOutput("BatchHidden",
-              "(LoDTensor) This LoDTensor is obtained in the forward and used "
-              "in the backward.")
+              "(LoDTensor) the hidden state reorganized in batch. "
+              "This LoDTensor is obtained in the forward and used in the "
+              "backward.")
         .AsIntermediate();
     AddOutput("OrderedP0",
               "(Tensor) the projection of the initial hidden state "
@@ -190,12 +193,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, defalut: False) "
                   "whether to compute reversed LSTMP.")
         .SetDefault(false);
-    AddAttr<bool>("share_cell_act",
-                  "(bool, defalut: True) "
-                  "whether to share activation with cell output. "
-                  "If false, the projection would be linear, else "
-                  "through an activation same with the cell output.")
-        .SetDefault(true);
     AddAttr<std::string>(
         "gate_activation",
         "(string, default: sigmoid)"
@@ -214,11 +211,21 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                          "`tanh` by default.")
         .SetDefault("tanh")
         .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<bool>("share_cell_act",
+                  "(bool, defalut: True) "
+                  "whether to share the activation of cell output with the "
+                  "projection layer. When set to `False`, the projection "
+                  "is simple linear, otherwise it will go through an "
+                  "activation function same as `cell_activation`.")
+        .SetDefault(true);
     AddComment(R"DOC(
-Long-Short Term Memory with Recurrent Projection (LSTMP) Operator.
+Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator.
 
-LSTMP is stand LSTM appended by a recurrent projection layer to reduce the
-number of parameters, espeacially when the output size is relative large. 
+LSTMP has a separate projection layer after the LSTM layer, projecting the 
+original hidden state to a lower-dimensional one, which is proposed to reduce 
+the number of total parameters and furthermore computational complexity for 
+the LSTM, espeacially for the case that the size of output units is relative 
+large (https://research.google.com/pubs/archive/43905.pdf). 
 The formula is as follows:
 
 $$
@@ -226,13 +233,15 @@ i_t = \sigma(W_{ix}x_{t} + W_{ih}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
 
 f_t = \sigma(W_{fx}x_{t} + W_{fh}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
 
-c_t = f_t \odot c_{t-1} + i_t \odot act_g(W_{cx}x_t + W_{ch}r_{t-1} + b_c) \\
+\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}r_{t-1} + b_c) \\
 
 o_t = \sigma(W_{ox}x_{t} + W_{oh}r_{t-1} + W_{oc}c_t + b_o) \\
 
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t}
+
 h_t = o_t \odot act_h(c_t)
 
-r_t = act_{h'}(W_{rh}h_t)
+r_t = \overline{act_h}(W_{rh}h_t)
 $$
 
 where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
@@ -240,20 +249,23 @@ of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
 are diagonal weight matrices for peephole connections. In our implementation,
 we use vectors to reprenset these diagonal weight matrices. The b terms
 denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
-is the non-line activations, such as logistic sigmoid function, and
+is the activation, such as logistic sigmoid function, and
 $i, f, o$ and $c$ are the input gate, forget gate, output gate,
 and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector $h$. $r$ denotes the recurrent projection 
-layer.
+the cell output activation vector $h$. Here $h$ is usually called the hidden 
+state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also 
+called the candidate hidden state, whose computation is based on the current 
+input and previous hidden state.
 
 The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
 are the cell input and cell output activation functions and `tanh` is usually
-used for them. If `share_cell_act` setted to `False`, $act_h'$ will be linear
-else will be same with $act_h$.
+used for them. $\overline{act_h}$ is the activation function for the projection 
+layer. When `share_cell_act` set to `False`, $\overline{act_h}$ is an
+identity activation, otherwise it will be same as $act_h$.
 
 Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
 operations on the input $x_{t}$ are NOT included in this operator.
-Users can choose to use fully-connect operator before LSTMP operator.
+Users can choose to use fully-connected operator before LSTMP operator.
 
 )DOC");
   }

From 76beff86a0f8e0d6856691b2968bafa52bf3a859 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Wed, 24 Jan 2018 01:34:54 -0800
Subject: [PATCH 6/8] Make the projection activation configurable

---
 paddle/operators/lstmp_op.cc                  | 76 +++++++++----------
 paddle/operators/lstmp_op.h                   | 14 ++--
 python/paddle/v2/fluid/tests/test_lstmp_op.py | 41 +++++-----
 3 files changed, 66 insertions(+), 65 deletions(-)

diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
index 85be64f44c..14469c708d 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/operators/lstmp_op.cc
@@ -23,27 +23,29 @@ class LSTMPOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTMP should not be null.");
+                   "Input(Input) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of LSTMP should not be null.");
+                   "Input(Weight) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
-                   "Input(ProjWeight) of LSTMP should not be null.");
+                   "Input(ProjWeight) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of LSTMP should not be null.");
+                   "Input(Bias) of LSTMP operator should not be null.");
 
     PADDLE_ENFORCE(ctx->HasOutput("Projection"),
-                   "Output(Projection) of LSTMP should not be null.");
+                   "Output(Projection) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Cell"),
-                   "Output(Cell) of LSTMP should not be null.");
+                   "Output(Cell) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
-                   "Output(BatchGate) of LSTMP should not be null.");
+                   "Output(BatchGate) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
-                   "Output(BatchGate) of LSTMP should not be null.");
+                   "Output(BatchCellPreAct) of LSTMP operator should not be "
+                   "null.");
     PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
-                   "Output(BatchHidden) of LSTMP should not be null.");
+                   "Output(BatchHidden) of LSTMP operator should not be null.");
 
     auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+                      "Input(X)'s rank of LSTMP operator must be 2.");
 
     int frame_size = in_dims[1] / 4;
     auto w_dims = ctx->GetInputDim("Weight");
@@ -68,8 +70,8 @@ class LSTMPOp : public framework::OperatorWithKernel {
 
     if (ctx->HasInput("H0")) {
       PADDLE_ENFORCE(ctx->HasInput("C0"),
-                     "Input(C0) and Input(H0) of LSTMP should not "
-                     "be null at the same time.");
+                     "Input(C0) of LSTMP operator should not be null after "
+                     "Input(H0) provided.");
       auto h_dims = ctx->GetInputDim("H0");
       auto c_dims = ctx->GetInputDim("C0");
       PADDLE_ENFORCE(h_dims == c_dims,
@@ -132,8 +134,7 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("C0",
              "(Tensor, optional) the initial cell state is an optional "
              "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. Only one of `H0` and `C0` can be NULL at the same "
-             "time.")
+             "batch size. `C0` should not be null if `H0` provided.")
         .AsDispensable();
     AddInput("Weight",
              "(Tensor) the learnable hidden-hidden weights."
@@ -211,13 +212,12 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
                          "`tanh` by default.")
         .SetDefault("tanh")
         .InEnum({"sigmoid", "tanh", "relu", "identity"});
-    AddAttr<bool>("share_cell_act",
-                  "(bool, defalut: True) "
-                  "whether to share the activation of cell output with the "
-                  "projection layer. When set to `False`, the projection "
-                  "is simple linear, otherwise it will go through an "
-                  "activation function same as `cell_activation`.")
-        .SetDefault(true);
+    AddAttr<std::string>("proj_activation",
+                         "(string, default: tanh)"
+                         "The activation for projection output, "
+                         "`tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
     AddComment(R"DOC(
 Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator.
 
@@ -226,20 +226,21 @@ original hidden state to a lower-dimensional one, which is proposed to reduce
 the number of total parameters and furthermore computational complexity for 
 the LSTM, espeacially for the case that the size of output units is relative 
 large (https://research.google.com/pubs/archive/43905.pdf). 
+
 The formula is as follows:
 
 $$
-i_t = \sigma(W_{ix}x_{t} + W_{ih}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
+i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
 
-f_t = \sigma(W_{fx}x_{t} + W_{fh}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
+f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
 
-\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}r_{t-1} + b_c) \\
+\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
 
-o_t = \sigma(W_{ox}x_{t} + W_{oh}r_{t-1} + W_{oc}c_t + b_o) \\
+o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
 
-c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t}
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
 
-h_t = o_t \odot act_h(c_t)
+h_t = o_t \odot act_h(c_t) \\
 
 r_t = \overline{act_h}(W_{rh}h_t)
 $$
@@ -259,9 +260,8 @@ input and previous hidden state.
 
 The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
 are the cell input and cell output activation functions and `tanh` is usually
-used for them. $\overline{act_h}$ is the activation function for the projection 
-layer. When `share_cell_act` set to `False`, $\overline{act_h}$ is an
-identity activation, otherwise it will be same as $act_h$.
+used for them. $\overline{act_h}$ is the activation function for the 
+projection output, usually using `identity` or same as $act_h$.
 
 Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
 operations on the input $x_{t}$ are NOT included in this operator.
@@ -277,22 +277,22 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTMP should not be null.");
+                   "Input(Input) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Projection"),
-                   "Input(Projection) of LSTMP should not be null.");
+                   "Input(Projection) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cell"),
-                   "Input(Cell) of LSTMP should not be null.");
+                   "Input(Cell) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Weight"),
-                   "Input(Weight) of LSTMP should not be null.");
+                   "Input(Weight) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
-                   "Input(ProjWeight) of LSTMP should not be null.");
+                   "Input(ProjWeight) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Bias"),
-                   "Input(Bias) of LSTMP should not be null.");
+                   "Input(Bias) of LSTMP operator should not be null.");
 
     PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
-                   "Input(BatchGate) of LSTMP should not be null.");
+                   "Input(BatchGate) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
-                   "Input(BatchGate) of LSTMP should not be null.");
+                   "Input(BatchGate) of LSTMP operator should not be null.");
 
     auto SetOutGradDim = [&ctx](const std::string& name) {
       auto g_name = framework::GradVarName(name);
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
index 0048f7e1c6..9dc37615f0 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -136,7 +136,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
         ctx.Attr<std::string>("cell_activation"));
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
-    auto share_cell_act = ctx.Attr<bool>("share_cell_act");
+    auto proj_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("proj_activation"));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     for (size_t n = 0; n < num_batch; n++) {
@@ -174,7 +175,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
         math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
                                        *proj_weight, false, static_cast<T>(1.0),
                                        ordered_proj0, static_cast<T>(0.0));
-        if (share_cell_act) {
+        if (proj_act != math::detail::ActivationType::kIdentity) {
           auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
           ActCompute(cell_act, place, proj0_dev, proj0_dev);
         }
@@ -194,7 +195,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
       math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
                                      false, static_cast<T>(1.0), &proj_t,
                                      static_cast<T>(0.0));
-      if (share_cell_act) {
+      if (proj_act != math::detail::ActivationType::kIdentity) {
         auto proj_t_dev = EigenMatrix<T>::From(proj_t);
         ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
       }
@@ -348,7 +349,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         ctx.Attr<std::string>("cell_activation"));
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
-    auto share_cell_act = ctx.Attr<bool>("share_cell_act");
+    auto proj_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("proj_activation"));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
     auto batch_starts = batch_gate->lod()[0];
@@ -359,7 +361,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
       Tensor cur_proj = batch_proj.Slice(bstart, bend);
       Tensor proj_g = batch_proj_g.Slice(bstart, bend);
-      if (share_cell_act) {
+      if (proj_act != math::detail::ActivationType::kIdentity) {
         auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
         auto proj_g_dev = EigenMatrix<T>::From(proj_g);
         ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
@@ -439,7 +441,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
           math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
                                          true, static_cast<T>(1.0), &proj0_g,
                                          static_cast<T>(0.0));
-          if (share_cell_act) {
+          if (proj_act != math::detail::ActivationType::kIdentity) {
             auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
             auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
             ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index 8835cae504..08fc32e117 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -41,7 +41,7 @@ def relu(x):
     return np.maximum(x, 0)
 
 
-ACTVATION = {
+ACTIVATION = {
     'identity': identity,
     'sigmoid': sigmoid,
     'tanh': tanh,
@@ -63,8 +63,9 @@ def lstmp(
         act_gate=None,
         act_cell=None,
         act_cand=None,
-        share_cell_act=True):
-    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand):
+        act_proj=None):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
+              act_proj):
         g = np.dot(r_pre, w_r)  # 1 x 4D
         g = g + x
         g = np.reshape(g, (1, g.size))
@@ -86,8 +87,7 @@ def lstmp(
         h = g_o * act_cell(c)
         # projection
         r = np.dot(h, w_rh)
-        if share_cell_act:
-            r = act_cell(r)
+        r = act_proj(r)
         return r, c
 
     def _reverse(x, lod):
@@ -110,13 +110,12 @@ def lstmp(
         seq_len = offset[i + 1] - offset[i]
         x = input[offset[i]:offset[i + 1], :]
         r_pre = np.dot(h0[i], w_rh)  # 1 x P
-        if share_cell_act:
-            r_pre = act_cell(r_pre)
+        r_pre = act_proj(r_pre)
         c_pre = c0[i]  # 1 x D
         for j in range(seq_len):
             # compute one step
             r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
-                                 act_cell, act_cand)
+                                 act_cell, act_cand, act_proj)
             projection.append(r_pre.flatten())
             cell.append(c_pre.flatten())
 
@@ -131,7 +130,7 @@ def lstmp(
     return projection, cell
 
 
-class TestLstmOp(OpTest):
+class TestLstmpOp(OpTest):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         # hidden size
@@ -142,8 +141,8 @@ class TestLstmOp(OpTest):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
+        self.act_proj = self.act_cell
 
-        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = False
         self.use_peepholes = True
@@ -172,8 +171,8 @@ class TestLstmOp(OpTest):
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
         r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
-                     ACTVATION[self.act_gate], ACTVATION[self.act_cell],
-                     ACTVATION[self.act_cand], self.share_cell_act)
+                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
 
         self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
 
@@ -193,7 +192,7 @@ class TestLstmOp(OpTest):
             'gate_activation': self.act_gate,
             'cell_activation': self.act_cell,
             'candidate_activation': self.act_cand,
-            'share_cell_act': self.share_cell_act
+            'proj_activation': self.act_proj
         }
 
     def test_check_output(self):
@@ -212,7 +211,7 @@ class TestLstmOp(OpTest):
             max_relative_error=1e-2)
 
 
-class TestLstmOpHasInitial(TestLstmOp):
+class TestLstmpOpHasInitial(TestLstmpOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
@@ -221,8 +220,8 @@ class TestLstmOpHasInitial(TestLstmOp):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
+        self.act_proj = self.act_cell
 
-        self.share_cell_act = True
         self.has_initial_state = True
         self.is_reverse = True
         self.use_peepholes = True
@@ -313,7 +312,7 @@ class TestLstmOpHasInitial(TestLstmOp):
             no_grad_set=set('C0'))
 
 
-class TestLstmOpRerverse(TestLstmOp):
+class TestLstmpOpRerverse(TestLstmpOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
@@ -322,14 +321,14 @@ class TestLstmOpRerverse(TestLstmOp):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
+        self.act_proj = self.act_cell
 
-        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = True
         self.use_peepholes = True
 
 
-class TestLstmOpNotUsePeepholes(TestLstmOp):
+class TestLstmpOpNotUsePeepholes(TestLstmpOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
@@ -338,14 +337,14 @@ class TestLstmOpNotUsePeepholes(TestLstmOp):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
+        self.act_proj = self.act_cell
 
-        self.share_cell_act = True
         self.has_initial_state = False
         self.is_reverse = False
         self.use_peepholes = False
 
 
-class TestLstmOpNotShareCellAct(TestLstmOp):
+class TestLstmpOpLinearProjection(TestLstmpOp):
     def set_argument(self):
         self.lod = [[0, 2, 5, 7]]
         self.D = 16
@@ -354,8 +353,8 @@ class TestLstmOpNotShareCellAct(TestLstmOp):
         self.act_gate = 'sigmoid'
         self.act_cell = 'tanh'
         self.act_cand = 'tanh'
+        self.act_proj = 'identity'
 
-        self.share_cell_act = False
         self.has_initial_state = False
         self.is_reverse = False
         self.use_peepholes = True

From 9ecc54a11b61e09c3c503b51049394e61f8e1fa3 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Wed, 24 Jan 2018 09:00:03 -0800
Subject: [PATCH 7/8] Remove redundant code in unit test

---
 paddle/operators/lstmp_op.cc                  |  2 +-
 paddle/operators/lstmp_op.h                   |  2 +-
 python/paddle/v2/fluid/tests/test_lstmp_op.py | 60 +++----------------
 3 files changed, 11 insertions(+), 53 deletions(-)

diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
index 14469c708d..c96b30ba35 100644
--- a/paddle/operators/lstmp_op.cc
+++ b/paddle/operators/lstmp_op.cc
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
index 9dc37615f0..ee82d5c10a 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index 08fc32e117..f137fc61b3 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -131,7 +131,10 @@ def lstmp(
 
 
 class TestLstmpOp(OpTest):
-    def set_argument(self):
+    def reset_argument(self):
+        pass
+
+    def setUp(self):
         self.lod = [[0, 2, 5, 7]]
         # hidden size
         self.D = 16
@@ -147,8 +150,7 @@ class TestLstmpOp(OpTest):
         self.is_reverse = False
         self.use_peepholes = True
 
-    def setUp(self):
-        self.set_argument()
+        self.reset_argument()
         self.op_type = 'lstmp'
 
         T = self.lod[0][-1]
@@ -212,19 +214,8 @@ class TestLstmpOp(OpTest):
 
 
 class TestLstmpOpHasInitial(TestLstmpOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-        self.P = 5
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-        self.act_proj = self.act_cell
-
+    def reset_argument(self):
         self.has_initial_state = True
-        self.is_reverse = True
-        self.use_peepholes = True
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
@@ -313,52 +304,19 @@ class TestLstmpOpHasInitial(TestLstmpOp):
 
 
 class TestLstmpOpRerverse(TestLstmpOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-        self.P = 10
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-        self.act_proj = self.act_cell
-
-        self.has_initial_state = False
+    def reset_argument(self):
         self.is_reverse = True
-        self.use_peepholes = True
 
 
 class TestLstmpOpNotUsePeepholes(TestLstmpOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-        self.P = 10
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-        self.act_proj = self.act_cell
-
-        self.has_initial_state = False
-        self.is_reverse = False
+    def reset_argument(self):
         self.use_peepholes = False
 
 
 class TestLstmpOpLinearProjection(TestLstmpOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-        self.P = 10
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
+    def reset_argument(self):
         self.act_proj = 'identity'
 
-        self.has_initial_state = False
-        self.is_reverse = False
-        self.use_peepholes = True
-
 
 if __name__ == '__main__':
     unittest.main()

From 5c6fc3f92ff05edbc77284a1ec34666eac34646e Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Wed, 24 Jan 2018 19:03:12 -0800
Subject: [PATCH 8/8] Make TestLstmpOp inherit from TestLstmOp

---
 python/paddle/v2/fluid/tests/test_lstm_op.py  |  6 +--
 python/paddle/v2/fluid/tests/test_lstmp_op.py | 52 +++----------------
 2 files changed, 11 insertions(+), 47 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py
index d9fa01e247..3e79f9d8e1 100644
--- a/python/paddle/v2/fluid/tests/test_lstm_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstm_op.py
@@ -42,7 +42,7 @@ def relu(x):
     return np.maximum(x, 0)
 
 
-ACTVATION = {
+ACTIVATION = {
     'identity': identity,
     'sigmoid': sigmoid,
     'tanh': tanh,
@@ -158,8 +158,8 @@ class TestLstmOp(OpTest):
         w_b = b[:, 0:4 * self.D]
         w_c = b[:, 4 * self.D:] if self.use_peepholes else None
         h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
-                    ACTVATION[self.act_gate], ACTVATION[self.act_cell],
-                    ACTVATION[self.act_cand])
+                    ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+                    ACTIVATION[self.act_cand])
 
         self.inputs = {'Input': (x, self.lod), 'Weight': w}
 
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
index f137fc61b3..92a954a9aa 100644
--- a/python/paddle/v2/fluid/tests/test_lstmp_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -13,39 +13,13 @@
 #limitations under the License.
 import unittest
 import numpy as np
-from op_test import OpTest
-
-SIGMOID_THRESHOLD_MIN = -40.0
-SIGMOID_THRESHOLD_MAX = 13.0
-EXP_MAX_INPUT = 40.0
-
-
-def identity(x):
-    return x
-
-
-def sigmoid(x):
-    y = np.copy(x)
-    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
-    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
-    return 1. / (1. + np.exp(-y))
-
-
-def tanh(x):
-    y = -2. * x
-    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
-    return (2. / (1. + np.exp(y))) - 1.
-
-
-def relu(x):
-    return np.maximum(x, 0)
-
+import test_lstm_op as LstmTest
 
 ACTIVATION = {
-    'identity': identity,
-    'sigmoid': sigmoid,
-    'tanh': tanh,
-    'relu': relu
+    'identity': LstmTest.identity,
+    'sigmoid': LstmTest.sigmoid,
+    'tanh': LstmTest.tanh,
+    'relu': LstmTest.relu
 }
 
 
@@ -55,7 +29,7 @@ def lstmp(
         lod,  # 1 x N
         h0=None,  # N x D
         c0=None,  # N x D
-        w_r=None,  # P x 5D
+        w_r=None,  # P x 4D
         w_rh=None,  # D x P
         w_b=None,  # 1 x 4D
         w_c=None,  # 1 x 3D
@@ -130,26 +104,16 @@ def lstmp(
     return projection, cell
 
 
-class TestLstmpOp(OpTest):
+class TestLstmpOp(LstmTest.TestLstmOp):
     def reset_argument(self):
         pass
 
     def setUp(self):
-        self.lod = [[0, 2, 5, 7]]
-        # hidden size
-        self.D = 16
+        self.set_argument()
         # projection size
         self.P = 10
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
         self.act_proj = self.act_cell
 
-        self.has_initial_state = False
-        self.is_reverse = False
-        self.use_peepholes = True
-
         self.reset_argument()
         self.op_type = 'lstmp'