Merge pull request #5429 from qingqing01/lstm_fix

Enable hidden/cell state initialization and enhance unit testing in LSTM operator.
7 years ago · 41d0533932
parent 9876bb0952 8bf37994bd
commit 41d0533932
9 changed files with 305 additions and 125 deletions
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@ -24,6 +24,11 @@ class LSTMOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");
+
    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                   "Output(Hidden) of LSTM should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
@ -59,11 +64,13 @@ class LSTMOp : public framework::OperatorWithKernel {
                      "The second dimension of Input(Weight) "
                      "should be 4 * %d.",
                      frame_size);
+
    auto b_dims = ctx->GetInputDim("Bias");
    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
    PADDLE_ENFORCE_EQ(b_dims[0], 1,
                      "The first dimension of Input(Bias) should be 1.");
-    if (ctx->Attrs().Get<bool>("usePeepholes")) {
+
+    if (ctx->Attrs().Get<bool>("use_peepholes")) {
      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
                        "The second dimension of Input(Bias) should be "
                        "7 * %d if enable peepholes connection",
@ -74,6 +81,7 @@ class LSTMOp : public framework::OperatorWithKernel {
                        "4 * %d if disable peepholes connection",
                        frame_size);
    }
+
    framework::DDim out_dims({in_dims[0], frame_size});
    ctx->SetOutputDim("Hidden", out_dims);
    ctx->SetOutputDim("Cell", out_dims);
@ -118,14 +126,13 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Bias",
             "(Tensor) the learnable weights, which contains two parts: "
             "input-hidden bias weight and peephole connections weight if "
-             "setting `usePeepholes` True. "
-             "1. `usePeepholes = False` "
+             "setting `use_peepholes` True. "
+             "1. `use_peepholes = False` "
             " - The shape is (1 x 4D). "
             " - Bias = {b_c, b_i, b_f, b_o}."
-             "2. `usePeepholes = True` "
+             "2. `use_peepholes = True` "
             " - The shape is (1 x 7D). "
-             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.")
-        .AsDispensable();
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
    AddOutput("Hidden",
              "(LoDTensor) the hidden state of LSTM operator. "
              "The shape is (T x D), and lod is the same with the `Input`.");
@ -145,29 +152,32 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) This LoDTensor is obtained in the forward and used "
              "in the backward.")
        .AsIntermediate();
-    AddAttr<bool>("usePeepholes",
-                  "(bool, default True) "
+    AddAttr<bool>("use_peepholes",
+                  "(bool, defalut: True) "
                  "whether to enable diagonal/peephole connections.")
        .SetDefault(true);
-    AddAttr<bool>("isReverse",
-                  "(bool, default False) "
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
                  "whether to compute reversed LSTM.")
        .SetDefault(false);
    AddAttr<std::string>(
-        "gateActivation",
-        "(string, default sigmoid)"
+        "gate_activation",
+        "(string, default: sigmoid)"
        "The activation for input gate, forget gate and output "
        "gate, `sigmoid` by default.")
-        .SetDefault("sigmoid");
-    AddAttr<std::string>("cellActivation",
-                         "(string, default tanh)"
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("cell_activation",
+                         "(string, default: tanh)"
                         "The activation for cell output, `tanh` by defalut.")
-        .SetDefault("tanh");
-    AddAttr<std::string>("candidateActivation",
-                         "(string, default tanh)"
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("candidate_activation",
+                         "(string, default: tanh)"
                         "The activation for candidate hidden state, "
                         "`tanh` by default.")
-        .SetDefault("tanh");
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
    AddComment(R"DOC(
 Long-Short Term Memory (LSTM) Operator.

@ -203,7 +213,7 @@ are the cell input and cell output activation functions and `tanh` is usually
 used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
 which is computed based on the current input and the previous hidden state.

-Set usePeepholes False to disable peephole connection 
+Set `use_peepholes` False to disable peephole connection 
 (http://www.bioinf.jku.at/publications/older/2604.pdf). The formula
 is omitted here.

@ -226,23 +236,27 @@ class LSTMGradOp : public framework::OperatorWithKernel {
                   "Input(Hidden) of LSTM should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Cell"),
                   "Input(Cell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");

    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
                   "Input(BatchGate) of LSTM should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
                   "Input(BatchGate) of LSTM should not be null.");

-    auto in_g_name = framework::GradVarName("Input");
-    if (ctx->HasOutput(in_g_name))
-      ctx->SetOutputDim(in_g_name, ctx->GetInputDim("Input"));
-
-    auto w_g_name = framework::GradVarName("Weight");
-    if (ctx->HasOutput(w_g_name))
-      ctx->SetOutputDim(w_g_name, ctx->GetInputDim("Weight"));
-
-    auto b_g_name = framework::GradVarName("Bias");
-    if (ctx->HasOutput(b_g_name))
-      ctx->SetOutputDim(b_g_name, ctx->GetInputDim("Bias"));
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name))
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("Weight");
+    SetOutGradDim("Bias");
+    SetOutGradDim("H0");
+    SetOutGradDim("C0");
  }

 protected:
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
@ -52,9 +52,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
    rValueIg = valueIg[i];
    rValueFg = valueFg[i];
    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
+    rCheckI = value.checkIg ? value.checkIg[i] : 0;
+    rCheckF = value.checkFg ? value.checkFg[i] : 0;
+    rCheckO = value.checkOg ? value.checkOg[i] : 0;

    if (value.prevStateValue) {
      rPrevState = value.prevStateValue[i];
@ -114,9 +114,9 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
    rValueIg = valueIg[i];
    rValueFg = valueFg[i];
    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
+    rCheckI = value.checkIg ? value.checkIg[i] : 0;
+    rCheckF = value.checkFg ? value.checkFg[i] : 0;
+    rCheckO = value.checkOg ? value.checkOg[i] : 0;
    rState = value.stateValue[i];
    rStateAtv = value.stateActiveValue[i];
    rOutputGrad = grad.outputGrad[i];
@ -155,9 +155,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
  __m256 rValueIg;
  __m256 rValueFg;
  __m256 rValueOg;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
+  __m256 rCheckI = _mm256_set1_ps(0.0f);
+  __m256 rCheckF = _mm256_set1_ps(0.0f);
+  __m256 rCheckO = _mm256_set1_ps(0.0f);
  __m256 rState;
  __m256 rPrevState = _mm256_set1_ps(0.0f);
  __m256 rStateAtv;
@ -173,9 +173,11 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
    rValueIg = valueIg[i];
    rValueFg = valueFg[i];
    rValueOg = valueOg[i];
-    rCheckI = ((__m256 *)value.checkIg)[i];
-    rCheckF = ((__m256 *)value.checkFg)[i];
-    rCheckO = ((__m256 *)value.checkOg)[i];
+    if (value.checkIg) {
+      rCheckI = ((__m256 *)value.checkIg)[i];
+      rCheckF = ((__m256 *)value.checkFg)[i];
+      rCheckO = ((__m256 *)value.checkOg)[i];
+    }

    if (value.prevStateValue) {
      rPrevState = ((__m256 *)value.prevStateValue)[i];
@ -216,9 +218,9 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
  __m256 rState;
  __m256 rStateAtv;
  __m256 rOutputGrad;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
+  __m256 rCheckI = _mm256_set1_ps(0.0f);
+  __m256 rCheckF = _mm256_set1_ps(0.0f);
+  __m256 rCheckO = _mm256_set1_ps(0.0f);
  __m256 rCheckIGrad;
  __m256 rCheckFGrad;
  __m256 rCheckOGrad;
@ -237,9 +239,11 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
    rValueIg = valueIg[i];
    rValueFg = valueFg[i];
    rValueOg = valueOg[i];
-    rCheckI = ((__m256 *)value.checkIg)[i];
-    rCheckF = ((__m256 *)value.checkFg)[i];
-    rCheckO = ((__m256 *)value.checkOg)[i];
+    if (value.checkIg) {
+      rCheckI = ((__m256 *)value.checkIg)[i];
+      rCheckF = ((__m256 *)value.checkFg)[i];
+      rCheckO = ((__m256 *)value.checkOg)[i];
+    }
    rState = ((__m256 *)value.stateValue)[i];
    rStateAtv = ((__m256 *)value.stateActiveValue)[i];
    rOutputGrad = ((__m256 *)grad.outputGrad)[i];
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@ -55,9 +55,10 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
  T rValueIg;
  T rValueFg;
  T rValueOg;
-  T rCheckI = value.checkIg[frameIdx];
-  T rCheckF = value.checkFg[frameIdx];
-  T rCheckO = value.checkOg[frameIdx];
+
+  T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0;
+  T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0;
+  T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0;

  rValueIn = value.gateValue[frameIdx];
  rValueIg = value.gateValue[frameIdx + frameSize];
@ -121,9 +122,10 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
  T rStateGrad;
  T rStateAtv;
  T rOutputGrad;
-  T rCheckI = value.checkIg[frameIdx];
-  T rCheckF = value.checkFg[frameIdx];
-  T rCheckO = value.checkOg[frameIdx];
+  T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0;
+  T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0;
+  T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0;
+
  T rCheckIGrad;
  T rCheckFGrad;
  T rCheckOGrad;
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
@ -22,8 +22,8 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::LoDTensor& src, const size_t* index,
-                  framework::LoDTensor& dst, bool is_src_index) {
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index) {
    auto src_dims = src.dims();
    auto dst_dims = dst.dims();
    PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
@ -41,8 +41,8 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::LoDTensor& src, const size_t* index,
-                  framework::LoDTensor& dst, bool is_src_index) {
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index) {
    auto src_dims = src.dims();
    auto dst_dims = dst.dims();
    PADDLE_ENFORCE_EQ(src_dims.size(), 2,
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
@ -30,8 +30,8 @@ class CopyMatrixRowsFunctor {
  // copy the input src to the indexed rows of output dst.
  // The indexed rows are based on the input index.
  void operator()(const platform::DeviceContext& context,
-                  const framework::LoDTensor& src, const size_t* index,
-                  framework::LoDTensor& dst, bool is_src_index);
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index);
 };

 template <typename Place, typename T>
@ -57,7 +57,7 @@ class LoDTensor2BatchFunctor {
                  bool is_reverse = false) const {
    if (!is_cal_batch_lod) {
      auto lods = batch.lod();
-      PADDLE_ENFORCE_EQ(lods.size(), 2UL);
+      PADDLE_ENFORCE_GT(lods.size(), 2UL);
      PADDLE_ENFORCE_EQ(lods[1].size(),
                        static_cast<size_t>(lod_tensor.dims()[0]));
      CopyMatrixRowsFunctor<Place, T> to_batch;
@ -66,8 +66,8 @@ class LoDTensor2BatchFunctor {
    }

    auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
    auto lod = lods[0];
+    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");

    std::vector<SeqInfo> seq_info;
    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
@ -78,8 +78,7 @@ class LoDTensor2BatchFunctor {
    std::sort(seq_info.begin(), seq_info.end(),
              [](SeqInfo a, SeqInfo b) { return a.length > b.length; });

-    // calculate the start position of each batch
-    // (numBatch equal the maxLength of sequences)
+    // Calculate the start position of each batch.
    // example:  sequences = {s0, s1, s2}
    //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
    //           num_batch = 5,
@ -95,19 +94,25 @@ class LoDTensor2BatchFunctor {
    //                                6, 2, 11,
    //                                7, 3,
    //                                8}
-    // The batch number represents batch size after rearranging the
+    //           seq_order = {1, 0, 2}, the sort order.
+    //               where 1 is the second sequence,
+    //                     0 is the first sequence,
+    //                     2 is the third sequence.
+    // The num_batch represents batch size after rearranging the
    // input LodTensor. It is also the maximum length of input sequence.

    paddle::framework::LoD batch_lods;
    batch_lods.emplace_back(std::vector<size_t>{0});
    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});

    // batch_lods[0] is the start positions for batch LoDTensor
    int num_batch = seq_info[0].length;
    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
    // batch_lods[1] is the raw index in the input LoDTensor
-    auto dims = lod_tensor.dims();
-    batch_lods[1].resize(static_cast<size_t>(dims[0]));
+    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+    // batch_lods[2] is the sort order for the input LoDTensor.
+    batch_lods[2].resize(seq_info.size());

    size_t* batch_starts = batch_lods[0].data();
    size_t* seq2batch_idx = batch_lods[1].data();
@ -127,6 +132,10 @@ class LoDTensor2BatchFunctor {
      }
      batch_starts[n + 1] = static_cast<size_t>(batch_id);
    }
+    size_t* seq_order = batch_lods[2].data();
+    for (size_t i = 0; i < seq_info.size(); ++i) {
+      seq_order[i] = seq_info[i].seq_idx;
+    }
    batch.set_lod(batch_lods);

    CopyMatrixRowsFunctor<Place, T> to_batch;
@ -141,8 +150,7 @@ class Batch2LoDTensorFunctor {
                  const framework::LoDTensor& batch,
                  framework::LoDTensor& lod_tensor) const {
    auto in_lod = batch.lod();
-    PADDLE_ENFORCE_EQ(in_lod.size(), 2UL,
-                      "The LoD size of input `batch` should be 2.");
+    PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
    PADDLE_ENFORCE_EQ(in_lod[1].size(),
                      static_cast<size_t>(lod_tensor.dims()[0]));
    CopyMatrixRowsFunctor<Place, T> to_seq;
--- a/python/paddle/v2/framework/tests/test_lstm_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_op.py
@ -117,8 +117,9 @@ class TestLstmOp(OpTest):
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'

-        self.has_initial_state = True
+        self.has_initial_state = False
        self.is_reverse = False
+        self.use_peepholes = True

    def setUp(self):
        self.set_argument()
@ -128,18 +129,28 @@ class TestLstmOp(OpTest):
        N = len(self.lod[0]) - 1

        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
-        h0 = np.zeros((N, self.D)).astype('float64')
-        c0 = np.zeros((N, self.D)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            c0 = np.random.normal(size=(N, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((N, self.D)).astype('float64')
+            c0 = np.zeros((N, self.D)).astype('float64')
        w = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
-        b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')

        w_b = b[:, 0:4 * self.D]
-        w_c = b[:, 4 * self.D:]
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
        h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
                    ACTVATION[self.act_gate], ACTVATION[self.act_cell],
                    ACTVATION[self.act_cand])

-        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'Bias': b}
+        self.inputs = {'Input': (x, self.lod), 'Weight': w}
+
+        self.inputs['Bias'] = b
+
        if self.has_initial_state:
            self.inputs['H0'] = h0
            self.inputs['C0'] = c0
@ -149,17 +160,16 @@ class TestLstmOp(OpTest):
            'Cell': (c, self.lod),
        }
        self.attrs = {
-            'usePeepholes': True,
-            'isReverse': self.is_reverse,
-            'gateActivation': self.act_gate,
-            'cellActivation': self.act_cell,
-            'candidateActivation': self.act_cand
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
        }

    def test_check_output(self):
        self.check_output(atol=1e-8)

-    #TODO(qingqing) add more unit testing case
    def test_check_grad(self):
        # TODO(qingqing) remove folowing lines after the check_grad is refined.
        N = len(self.lod[0]) - 1
@ -170,7 +180,7 @@ class TestLstmOp(OpTest):
            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)


-class TestLstmOpHasNoInitial(TestLstmOp):
+class TestLstmOpHasInitial(TestLstmOp):
    def set_argument(self):
        self.lod = [[0, 2, 5, 7]]
        self.D = 16
@ -179,8 +189,69 @@ class TestLstmOpHasNoInitial(TestLstmOp):
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'

-        self.has_initial_state = False
+        self.has_initial_state = True
        self.is_reverse = True
+        self.use_peepholes = True
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove folowing lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+            max_relative_error=5e-4)
+
+    def test_check_grad_ingore_bias(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Bias'))
+
+    def test_check_grad_ingore_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Weight'))
+
+    def test_check_grad_ingore_input(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Weight', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Input'))
+
+    def test_check_grad_ingore_h0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('H0'))
+
+    def test_check_grad_ingore_c0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('C0'))


 class TestLstmOpRerverse(TestLstmOp):
@ -192,8 +263,23 @@ class TestLstmOpRerverse(TestLstmOp):
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'

-        self.has_initial_state = True
+        self.has_initial_state = False
+        self.is_reverse = True
+        self.use_peepholes = True
+
+
+class TestLstmOpNotUsePeepholes(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
        self.is_reverse = True
+        self.use_peepholes = False


 if __name__ == '__main__':
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@ -102,7 +102,7 @@ class Momentum(Optimizer):

    ..  math::

-        v_{t} &= k * v_{t-1} - \\gamma_t / (g_{t} + \\lambda w_{t-1}) \\\\
+        v_{t} &= k * v_{t-1} - \\gamma_t (g_{t} + \\lambda w_{t-1}) \\\\
        w_{t} &= w_{t-1} + v_{t} \\\\

    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,