From 734e87e55b00418aed0fac5a879b2704d62cf3ab Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Fri, 15 Dec 2017 20:08:55 +0800
Subject: [PATCH 1/7] Add python wrapper for lstm unit op.

---
 doc/api/v2/fluid/layers.rst                 |  11 +-
 python/paddle/v2/fluid/layers/nn.py         | 112 +++++++++++++++++++-
 python/paddle/v2/fluid/tests/test_layers.py |  17 +++
 3 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 89e5fec13b..0ab36402fa 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -188,12 +188,6 @@ beam_search_decode
    :noindex:
 
 
-lstm
----------
-.. autofunction:: paddle.v2.fluid.layers.lstm
-    :noindex:
-
-
 lod_rank_table
 ---------
 .. autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:
@@ -300,3 +294,8 @@ conv2d_transpose
 .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
     :noindex:
 
+
+lstm_unit
+---------
+.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index bad7dbd84e..84e62d988c 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,12 +5,13 @@ All layers just related to the neural network.
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from tensor import concat
 
 __all__ = [
     'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
     'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost',
     'accuracy', 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool',
-    'pool2d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose'
+    'pool2d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'lstm_unit'
 ]
 
 
@@ -392,7 +393,7 @@ def chunk_eval(input,
                excluded_chunk_types=None,
                **kwargs):
     """
-    This function computes and outputs the precision, recall and 
+    This function computes and outputs the precision, recall and
     F1-score of chunk detection.
     """
     helper = LayerHelper("chunk_eval", **kwargs)
@@ -789,3 +790,110 @@ def conv2d_transpose(input,
                      attrs=op_attr)
 
     return out
+
+
+def lstm_unit(x_t,
+              hidden_t_prev,
+              cell_t_prev,
+              forget_bias=0.0,
+              main_program=None,
+              startup_program=None):
+    """Lstm unit layer. The equation of a lstm step is:
+
+        .. math::
+
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
+
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
+
+            c_t & = f_t c_{t-1} + i_t \tanh(W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
+
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
+
+            h_t & = o_t \tanh(c_t)
+
+    The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The implementation separates the linear transformation
+    from the non-linear transformation. Here, we take :math:`i_t` as an
+    example. The linear transformation is applied by calling a `fc` layer and
+    the equation is:
+
+        .. math::
+
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i
+
+    The non-linear transformation is applied by calling `lstm_unit_op` and the
+    equation is:
+
+        .. math::
+
+            i_t = \sigma(L_{i_t})
+
+    This layer has two outputs including :math:`c_t` and :math:`h_t`.
+
+    Args:
+        x_t (Variable): The input value of current step.
+        hidden_t_prev (Variable): The hidden value of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit.
+        forget_bias (float): The forget bias of lstm unit.
+        main_program (Program): The main program.
+        startup_program (Program): the startup program.
+
+    Returns:
+        tuple: The cell value and hidden value of lstm unit.
+
+    Raises:
+        ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**
+            are not 2, or the 1st dimensions of **x_t**, **hidden_t_prev**
+            and **cell_t_prev** are not the same.
+
+    Examples:
+
+        .. code-block:: python
+
+            x_t = fluid.layers.fc(input=x_t_data, size=10)
+            prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+            prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
+            cell_value, hidden_value = fluid.layers.lstm_unit(x_t=x_t,
+                                                   hidden_t_prev=prev_hidden,
+                                                   cell_t_prev=prev_cell)
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+
+    if len(x_t.shape) != 2:
+        raise ValueError("Rank of x_t must be 2.")
+
+    if len(hidden_t_prev.shape) != 2:
+        raise ValueError("Rank of hidden_t_prev must be 2.")
+
+    if len(cell_t_prev.shape) != 2:
+        raise ValueError("Rank of cell_t_prev must be 2.")
+
+    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
+            0] != cell_t_prev.shape[0]:
+        raise ValueError("The 1st dimension of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    size = cell_t_prev.shape[1]
+    concat_out = concat(
+        input=[x_t, hidden_t_prev],
+        axis=1,
+        main_program=main_program,
+        startup_program=startup_program)
+    fc_out = fc(input=concat_out,
+                size=4 * size,
+                main_program=main_program,
+                startup_program=startup_program)
+    dtype = x_t.dtype
+    c = helper.create_tmp_variable(dtype)
+    h = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm_unit',
+        inputs={"X": fc_out,
+                "C_prev": cell_t_prev},
+        outputs={"C": c,
+                 "H": h},
+        attrs={"forget_bias": forget_bias})
+
+    return c, h
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 9b88080158..468bd41285 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,6 +161,23 @@ class TestBook(unittest.TestCase):
                 x=dat, label=lbl))
             print(str(program))
 
+    def test_lstm_unit(self):
+        program = Program()
+        with program_guard(program):
+            x_t_data = layers.data(
+                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t = layers.fc(input=x_t_data, size=10)
+            prev_hidden_data = layers.data(
+                name='prev_hidden_data', shape=[10, 20], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+            prev_cell_data = layers.data(
+                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell = layers.fc(input=prev_cell_data, size=30)
+            self.assertIsNotNone(
+                layers.lstm_unit(
+                    x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+            print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()

From a398e25d6ac786e14aa18be79438b8d2d1b191d0 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 18 Dec 2017 20:09:36 +0800
Subject: [PATCH 2/7] Expose param_attr and bias_attr.
---
 paddle/operators/lstm_unit_op.cc    | 5 ++++-
 python/paddle/v2/fluid/layers/nn.py | 9 +++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index 18b9cdf2a3..b6eb33bafe 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -51,7 +51,10 @@ class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
   LstmUnitOpMaker(framework::OpProto* proto,
                   framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "FC input before the non-linear activation.");
+    AddInput("X",
+             "Lstm unit only applies non-linear activations, please make sure "
+             "that the linear transformation has already been applied to `X`. "
+             "A linear transformation can be applied by adding a `fc` layer.");
     AddInput(
         "C_prev",
         "The cell state tensor of last time-step in the Lstm Unit operator.");
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 84e62d988c..1c101c62c2 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,6 +5,7 @@ All layers just related to the neural network.
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from ..param_attr import ParamAttr
 from tensor import concat
 
 __all__ = [
@@ -796,6 +797,8 @@ def lstm_unit(x_t,
               hidden_t_prev,
               cell_t_prev,
               forget_bias=0.0,
+              param_attr=None,
+              bias_attr=ParamAttr(),
               main_program=None,
               startup_program=None):
     """Lstm unit layer. The equation of a lstm step is:
@@ -836,6 +839,10 @@ def lstm_unit(x_t,
         hidden_t_prev (Variable): The hidden value of lstm unit.
         cell_t_prev (Variable): The cell value of lstm unit.
         forget_bias (float): The forget bias of lstm unit.
+        param_attr (ParamAttr): The attributes of parameter weights, used to set
+            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights, used to set
+            initializer, name etc.
         main_program (Program): The main program.
         startup_program (Program): the startup program.
@@ -882,6 +889,8 @@ def lstm_unit(x_t,
                 startup_program=startup_program)
     fc_out = fc(input=concat_out,
                 size=4 * size,
+                param_attr=param_attr,
+                bias_attr=bias_attr,
                 main_program=main_program,
                 startup_program=startup_program)
     dtype = x_t.dtype

From 58d6946c874bbe539ace4fde05e7fb4693f30ca1 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 11:03:20 +0800
Subject: [PATCH 3/7] Set the act to 'linear'.

---
 python/paddle/v2/fluid/layers/nn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 1c101c62c2..ab443826bd 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -891,6 +891,7 @@ def lstm_unit(x_t,
                 size=4 * size,
                 param_attr=param_attr,
                 bias_attr=bias_attr,
+                act='linear',
                 main_program=main_program,
                 startup_program=startup_program)
     dtype = x_t.dtype

From d993a4f58b7e2be4a76fda406e964229edff2dcb Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 11:19:24 +0800
Subject: [PATCH 4/7] Change default value for bias_attr.
---
 python/paddle/v2/fluid/layers/nn.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 9728adba73..31a0a312db 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -866,7 +866,7 @@ def lstm_unit(x_t,
               cell_t_prev,
               forget_bias=0.0,
               param_attr=None,
-              bias_attr=ParamAttr(),
+              bias_attr=None,
               main_program=None,
               startup_program=None):
     """Lstm unit layer. The equation of a lstm step is:
@@ -909,8 +909,8 @@ def lstm_unit(x_t,
         forget_bias (float): The forget bias of lstm unit.
         param_attr (ParamAttr): The attributes of parameter weights, used to set
             initializer, name etc.
-        bias_attr (ParamAttr): The attributes of bias weights, used to set
-            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights; if not False,
+            bias weights will be created and set to the default value.
         main_program (Program): The main program.
         startup_program (Program): the startup program.
@@ -949,6 +949,9 @@ def lstm_unit(x_t,
         raise ValueError("The 1st dimension of x_t, hidden_t_prev and "
                          "cell_t_prev must be the same.")
 
+    if bias_attr is None:
+        bias_attr = ParamAttr()
+
     size = cell_t_prev.shape[1]
     concat_out = concat(
         input=[x_t, hidden_t_prev],

From 9ee9fefd2de46f2383309f489033fc6d94cd8628 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 11:27:35 +0800
Subject: [PATCH 5/7] Change the return order to h, c.

---
 python/paddle/v2/fluid/layers/nn.py         | 8 ++++----
 python/paddle/v2/fluid/tests/test_layers.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 31a0a312db..dd6bb54599 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -900,7 +900,7 @@ def lstm_unit(x_t,
 
             i_t = \sigma(L_{i_t})
 
-    This layer has two outputs including :math:`c_t` and :math:`h_t`.
+    This layer has two outputs including :math:`h_t` and :math:`c_t`.
 
     Args:
         x_t (Variable): The input value of current step.
@@ -915,7 +915,7 @@ def lstm_unit(x_t,
         startup_program (Program): the startup program.
 
     Returns:
-        tuple: The cell value and hidden value of lstm unit.
+        tuple: The hidden value and cell value of lstm unit.
 
     Raises:
         ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**
             are not 2, or the 1st dimensions of **x_t**, **hidden_t_prev**
             and **cell_t_prev** are not the same.
 
     Examples:
 
         .. code-block:: python
 
             x_t = fluid.layers.fc(input=x_t_data, size=10)
             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
-            cell_value, hidden_value = fluid.layers.lstm_unit(x_t=x_t,
+            hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
                                                    hidden_t_prev=prev_hidden,
                                                    cell_t_prev=prev_cell)
     """
     helper = LayerHelper('lstm_unit', **locals())
@@ -977,4 +977,4 @@ def lstm_unit(x_t,
                  "H": h},
         attrs={"forget_bias": forget_bias})
 
-    return c, h
+    return h, c
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 7b56ae464c..d4a95bf6fc 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,7 +161,7 @@ class TestBook(unittest.TestCase):
                 x=dat, label=lbl))
             print(str(program))
 
-    def test_seq_expand(self):
+    def test_sequence_expand(self):
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[10], dtype='float32')

From cb23c637c1cd86ad6844ee0dab5ae891635b6e17 Mon Sep 17 00:00:00 2001
From: kavyasrinet
Date: Mon, 18 Dec 2017 20:54:25 -0800
Subject: [PATCH 6/7] Polishing executor design doc (#6721)

* Polish executor design doc
* Adding few details

---
 doc/design/executor.md | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/doc/design/executor.md b/doc/design/executor.md
index aa738ab598..2d4b371cc5 100644
--- a/doc/design/executor.md
+++ b/doc/design/executor.md
@@ -1,27 +1,29 @@
 # Executor Design Doc
 
 ## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
 [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is
 conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
 
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used; the executor explicitly executes the stored precompiled code.
 
 ## Overview
 
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
 ## Executor
 
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have the stack frame pop process.
 
+### The interface
 ```c++
 Executor(places);
 ```
 
+An executor does not own any computing resources; a user can only construct an executor using the specified places.
 
+### Running an Executor
 ```
 void Run(ProgramDesc, Scope, block_id, create_local_scope);
 ```
 
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.

From 9573256f9d802dfe1daf9f6887044931ff03f636 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 13:24:12 +0800
Subject: [PATCH 7/7] Remove main_program and startup_program.

---
 python/paddle/v2/fluid/layers/nn.py | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 1d03f357eb..2c38c23224 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -764,7 +764,7 @@ def conv2d_transpose(input,
     return out
 
 
-def sequence_expand(x, y, main_program=None, startup_program=None):
+def sequence_expand(x, y):
     """Sequence Expand Layer. This layer will expand the input variable **x**
     according to LoD information of **y**. And the following examples will
     explain how sequence_expand works:
@@ -808,8 +808,6 @@ def sequence_expand(x, y, main_program=None, startup_program=None):
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
         y (Variable): The input variable which is a LoDTensor.
-        main_program (Program): The main program.
-        startup_program (Program): The startup program.
 
     Returns:
         Variable: The expanded variable which is a LoDTensor.
@@ -836,9 +834,7 @@ def lstm_unit(x_t,
               cell_t_prev,
               forget_bias=0.0,
               param_attr=None,
-              bias_attr=None,
-              main_program=None,
-              startup_program=None):
+              bias_attr=None):
     """Lstm unit layer. The equation of a lstm step is:
@@ -881,8 +877,6 @@ def lstm_unit(x_t,
             initializer, name etc.
         bias_attr (ParamAttr): The attributes of bias weights; if not False,
             bias weights will be created and set to the default value.
-        main_program (Program): The main program.
-        startup_program (Program): the startup program.
 
     Returns:
         tuple: The hidden value and cell value of lstm unit.
@@ -923,18 +917,11 @@ def lstm_unit(x_t,
         bias_attr = ParamAttr()
 
     size = cell_t_prev.shape[1]
-    concat_out = concat(
-        input=[x_t, hidden_t_prev],
-        axis=1,
-        main_program=main_program,
-        startup_program=startup_program)
+    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
     fc_out = fc(input=concat_out,
                 size=4 * size,
                 param_attr=param_attr,
-                bias_attr=bias_attr,
-                act='linear',
-                main_program=main_program,
-                startup_program=startup_program)
+                bias_attr=bias_attr)
     dtype = x_t.dtype
     c = helper.create_tmp_variable(dtype)
     h = helper.create_tmp_variable(dtype)
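
Below is a usage sketch of the layer as it stands at the end of this series (after patch 5 the outputs come back in `(hidden, cell)` order; after patch 7 the `main_program`/`startup_program` arguments are gone). It mirrors the new `test_lstm_unit` test; the data names and sizes are illustrative assumptions, and unlike that test it gives `hidden_t_prev` and `cell_t_prev` the same width, since the layer takes the hidden size from `cell_t_prev.shape[1]` and only validates ranks and batch dimensions:

```python
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard

program = Program()
with program_guard(program):
    # All three inputs must be rank-2 variables sharing the same 1st (batch)
    # dimension; the fc layers here exist only to produce rank-2 variables
    # of the right widths for this standalone sketch.
    x_t_data = layers.data(name='x_t_data', shape=[10, 10], dtype='float32')
    x_t = layers.fc(input=x_t_data, size=10)
    prev_hidden_data = layers.data(
        name='prev_hidden_data', shape=[10, 30], dtype='float32')
    prev_hidden = layers.fc(input=prev_hidden_data, size=30)
    prev_cell_data = layers.data(
        name='prev_cell_data', shape=[10, 30], dtype='float32')
    prev_cell = layers.fc(input=prev_cell_data, size=30)

    # One LSTM step: internally concats x_t with hidden_t_prev, applies a
    # linear fc of width 4 * hidden_size, then the lstm_unit op.
    hidden_value, cell_value = layers.lstm_unit(
        x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell)
```

In a real recurrent loop, `hidden_t_prev` and `cell_t_prev` would be the outputs of the previous time step rather than fresh `fc` projections.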