@@ -169,7 +169,7 @@ __all__ = [
     'log_loss',
     'add_position_encoding',
     'bilinear_tensor_product',
-    'cudnn_lstm',
+    'lstm',
 ]
@@ -467,39 +467,53 @@ def dynamic_lstm(input,
     return hidden, cell


-def cudnn_lstm(input,
-               init_h,
-               init_c,
-               batch_size,
-               max_len,
-               dropout_prob,
-               input_size,
-               hidden_size,
-               num_layers,
-               is_bidirec=False,
-               dtype='float32',
-               is_test=False,
-               name=None,
-               default_initializer=None,
-               fix_seed=False,
-               seed=0):
+def lstm(input,
+         init_h,
+         init_c,
+         max_len,
+         dropout_prob,
+         input_size,
+         hidden_size,
+         num_layers,
+         is_bidirec=False,
+         dtype='float32',
+         is_test=False,
+         name=None,
+         default_initializer=None,
+         seed=-1):
     """
-    CUDNN LSTM implementation
+    If the device is a GPU, this op will use the cuDNN LSTM implementation.

     A four-gate Long Short-Term Memory network with no peephole connections.
     In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
-    it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi)
-    ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf)
-    ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo)
-    c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
-    ct = ft * ct-1 + it * c't
-    ht = ot * tanh(ct)
+    $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
+
+    $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+
+    $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+
+    $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+
+    $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+
+    $$ h_t = o_t \\odot tanh(c_t) $$
+
+    - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+      of weights from the input gate to the input)
+    - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors).
+    - sigmoid is the logistic sigmoid function.
+    - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+      and cell activation vectors, respectively, all of which have the same size as
+      the cell output activation vector $h$.
+    - The $\odot$ is the element-wise product of the vectors.
+    - `tanh` is the activation function.
+    - $\tilde{c_t}$ is also called the candidate hidden state,
+      which is computed based on the current input and the previous hidden state.

     Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
     X represensts a matrix multiplication
-    and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively.

     Args:
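To make the six docstring equations concrete, here is a minimal NumPy sketch of a single forward step. It illustrates the math only, not the cuDNN kernel: the stacked-gate layout and the names W, R, bW, bR follow the docstring's notation, and the i/f/o/c gate ordering is an assumption.

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x_t, h_prev, c_prev, W, R, bW, bR):
        # W: input weights for the i, f, o, c gates stacked as (4*H, D);
        # R: recurrent weights stacked as (4*H, H); bW, bR: biases of shape (4*H,).
        gates = W @ x_t + R @ h_prev + bW + bR
        i, f, o, c_hat = np.split(gates, 4)
        i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)  # i_t, f_t, o_t
        c_hat = np.tanh(c_hat)                        # \tilde{c_t}
        c_t = f * c_prev + i * c_hat                  # c_t = f_t . c_{t-1} + i_t . \tilde{c_t}
        h_t = o * np.tanh(c_t)                        # h_t = o_t . tanh(c_t)
        return h_t, c_t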
@@ -510,7 +524,6 @@ def cudnn_lstm(input,
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        batch_size (int): total distance numer of the batch
         max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
         dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
                        There is NO dropout work on rnn output of the last RNN layers
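As a quick check on the init_h / init_c shapes described in this hunk, a plain-Python sketch with illustrative values; only the shape arithmetic is taken from the docstring.

    num_layers, batch_size, hidden_size = 2, 32, 256
    is_bidirec = True

    # A bidirectional LSTM runs a forward and a backward pass per layer,
    # so the leading dimension of init_h / init_c doubles.
    num_directions = 2 if is_bidirec else 1
    init_state_shape = [num_layers * num_directions, batch_size, hidden_size]
    assert init_state_shape == [4, 32, 256]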
@@ -524,9 +537,7 @@ def cudnn_lstm(input,
                        will be named automatically.
         default_initializer(Initialize|None): Where use initializer to initialize the Weight
                        If set None, defaule initializer will be used
-        fix_seed(bool): If it's True, fix seed will used for dropout in LSTM
-        seed(int): If fix_seed is True, dropout seed in LSTM will use this seed
+        seed(int): Seed for dropout in the LSTM. If it is -1, dropout will use a random seed.

     Returns:
@@ -553,7 +564,7 @@ def cudnn_lstm(input,
            init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
            init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)

-           rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \
+           rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \
                    max_len, dropout_prob, input_size, hidden_size, \
                    num_layers)
     """
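For readers trying the renamed API, here is a slightly fuller version of the docstring example; it is a sketch assuming the fluid static-graph API of this era, and the vocabulary and embedding sizes are illustrative rather than taken from the patch.

    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    vocab_size, emb_dim = 10000, 256      # illustrative sizes
    batch_size, max_len = 32, 100
    hidden_size, num_layers = 150, 1
    dropout_prob = 0.2

    # Token ids padded to max_len steps; the embedding lookup is the LSTM input.
    data = fluid.layers.data(name='x', shape=[max_len, 1], dtype='int64')
    emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim])

    # Zero initial hidden and cell states, one slice per layer.
    init_h = layers.fill_constant([num_layers, batch_size, hidden_size],
                                  'float32', 0.0)
    init_c = layers.fill_constant([num_layers, batch_size, hidden_size],
                                  'float32', 0.0)

    rnn_out, last_h, last_c = layers.lstm(emb, init_h, init_c, max_len,
                                          dropout_prob, emb_dim, hidden_size,
                                          num_layers)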
@@ -610,12 +621,10 @@ def cudnn_lstm(input,
             'max_len': max_len,
             'is_bidirec': is_bidirec,
             'input_size': input_size,
-            'batch_size': batch_size,
             'hidden_size': hidden_size,
             'num_layers': num_layers,
             'is_test': is_test,
             'dropout_prob': dropout_prob,
-            'fix_seed': fix_seed,
             'seed': seed,
         })

     return out, last_h, last_c
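The attrs change mirrors the signature change: the separate fix_seed flag is gone, and the single seed attribute carries both behaviours (per the updated docstring, -1 means "use a random seed"). A minimal sketch of that convention, using a hypothetical resolve_seed helper that is not part of the patch:

    import random

    def resolve_seed(seed):
        # seed == -1 requests a fresh random seed; any other value is used
        # as-is (what the old fix_seed=True / seed pair used to express).
        return random.randint(0, 2**31 - 1) if seed == -1 else seed

    assert resolve_seed(42) == 42   # fixed seed -> reproducible dropout
    resolve_seed(-1)                # a different random seed on each call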