From 35bed7ce1cb48bd8e35e0a76fbcf7359eabc37ef Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Wed, 21 Mar 2018 19:10:52 +0800 Subject: [PATCH 01/62] Add contents for manully build documentation(cn version) --- doc/v2/dev/write_docs_cn.rst | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index a055bb04c0..674efabcef 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -64,9 +64,31 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D 不使用PaddlePaddle.org工具 -------------------------- -使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 +使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。该方法与 `从源码编译PaddlePaddle `_ 相似,通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行,在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档,具体步骤如下: -[TBD] +.. code-block:: bash + + mkdir paddle + cd paddle + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + + # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像 + docker build -t paddle:dev . + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev /bin/bash + + # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档 + bash -x /paddle/paddle/scripts/docker/build.sh + +注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。 + +编译完成后,进入 ``paddle/build/doc/v2`` 目录,该目录下生成了 ``cn/html/`` 、 ``en/html`` 以及 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令: + +.. code-block:: bash + + python -m SimpleHTTPServer 8088 + +在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面。 如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即 @@ -75,6 +97,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D mkdir paddle cd paddle git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle mkdir -p build cd build cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON @@ -96,7 +119,9 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D python -m SimpleHTTPServer 8088 -在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 +在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面。 + +下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 .. image:: src/doc_en.png :align: center From 154a1db04916efe74baa37e06128a43787fe6716 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Thu, 22 Mar 2018 11:21:31 +0800 Subject: [PATCH 02/62] Adjust some commands --- doc/v2/dev/write_docs_cn.rst | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index 674efabcef..8514e635ff 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -18,9 +18,6 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D .. code-block:: bash - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - # Clone the content repositories git clone https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/book.git @@ -38,9 +35,6 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D .. code-block:: bash - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - # Clone the content repositories and PaddlePaddle.org git clone https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/book.git @@ -68,14 +62,12 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D .. 
code-block:: bash
 
-    mkdir paddle
-    cd paddle
     git clone https://github.com/PaddlePaddle/Paddle.git
     cd Paddle
 
     # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
     docker build -t paddle:dev .
-    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev /bin/bash
+    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
 
     # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
     bash -x /paddle/paddle/scripts/docker/build.sh
@@ -94,8 +86,6 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
 
 .. code-block:: bash
 
-    mkdir paddle
-    cd paddle
     git clone https://github.com/PaddlePaddle/Paddle.git
     cd Paddle
     mkdir -p build

From 86626a74780176a991064ce6ea7ac5d4bd683775 Mon Sep 17 00:00:00 2001
From: weixing02 <564445201@qq.com>
Date: Wed, 28 Mar 2018 15:22:43 +0800
Subject: [PATCH 03/62] Add English version

---
 doc/v2/dev/write_docs_cn.rst |  6 ++++++
 doc/v2/dev/write_docs_en.rst | 24 ++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
index f18dd86b51..83d065d3bb 100644
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -19,6 +19,9 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
 
 .. code-block:: bash
 
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
     # Clone the content repositories
     git clone https://github.com/PaddlePaddle/Paddle.git
     git clone https://github.com/PaddlePaddle/book.git
@@ -36,6 +39,9 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
 
 .. code-block:: bash
 
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
     # Clone the content repositories and PaddlePaddle.org
     git clone https://github.com/PaddlePaddle/Paddle.git
     git clone https://github.com/PaddlePaddle/book.git
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
index 15ff0d34ad..8bc43be6de 100644
--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -68,9 +68,29 @@ Please `click here `_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
+To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website `_ on how to install Docker. This method is quite similar to `Build From Sources `_, by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
 
-[TBD]
+.. code-block:: bash
+
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    cd Paddle
+
+    # Construct a docker image from source code
+    docker build -t paddle:dev .
+    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+    # Use build.sh to build PaddlePaddle documentation
+    bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands map the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, you could enter the ``paddle/build/doc/v2`` directory, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+..
code-block:: bash + + python -m SimpleHTTPServer 8088 + +Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation. From e1290c4fd7facfa9abfbb6e710ab3fa5f4ed3d10 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 28 Mar 2018 23:09:32 +0800 Subject: [PATCH 04/62] Make Average Model support for 'moving mean' and 'moving variance' of batch_normal op --- python/paddle/fluid/optimizer.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 180575c35d..d21320f705 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -850,23 +850,39 @@ class ModelAverage(Optimizer): self.min_average_window = min_average_window self.max_average_window = max_average_window self.params_grads = params_grads + + # append 'moving mean' and 'moving variance' to self.params_grads + pattern = re.compile(r"batch_norm_\d+\.w_[1,2]") + for param in framework.default_main_program().global_block( + ).all_parameters(): + if pattern.match(param.name) is not None: + self.params_grads.append((param, None)) + # create a tmp gradient variable to backup parameter value + # for parameter whose grad is None + for i, param_grad in enumerate(self.params_grads): + param, grad = param_grad + if grad is None: + grad = param.block.create_var( + name=unique_name.generate(".".join([param.name, 'tmp'])), + dtype=param.dtype, + persistable=False, + stop_gradient=stop_gradient) + self.params_grads[i] = (param, grad) + for param, grad in self.params_grads: - if grad is not None: - self._append_average_accumulate_op(param) + self._append_average_accumulate_op(param) self.apply_program = Program() block = self.apply_program.global_block() with program_guard(main_program=self.apply_program): for param_grad in self.params_grads: - if param_grad[1] is not None: - self._add_average_apply_op(block, param_grad) + self._add_average_apply_op(block, param_grad) self.restore_program = Program() block = self.restore_program.global_block() with program_guard(main_program=self.restore_program): for param_grad in self.params_grads: - if param_grad[1] is not None: - self._add_average_restore_op(block, param_grad) + self._add_average_restore_op(block, param_grad) def _add_average_apply_op(self, block, param_grad): param = block.clone_variable(param_grad[0]) From 62373edb0c6ad7b55ca7af5b632ecd415e9d51bb Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Fri, 30 Mar 2018 19:27:08 +0800 Subject: [PATCH 05/62] Adjust --- doc/v2/dev/write_docs_cn.rst | 7 ++----- doc/v2/dev/write_docs_en.rst | 11 ++++------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index 83d065d3bb..0795b2d146 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -100,13 +100,10 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D cmake .. 
-DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON # 如果只需要构建使用文档,则执行以下命令 - make -j $processors gen_proto_py - make -j $processors paddle_docs paddle_docs_cn + make -j $processors paddle_docs # 如果只需要构建API,则执行以下命令 - make -j $processors gen_proto_py framework_py_proto - make -j $processors copy_paddle_pybind - make -j $processors paddle_api_docs + make -j $processors paddle_apis 其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst index 8bc43be6de..f03daa300f 100644 --- a/doc/v2/dev/write_docs_en.rst +++ b/doc/v2/dev/write_docs_en.rst @@ -96,21 +96,18 @@ If you do not wish to use Docker, you can also use the following commands to dir .. code-block:: bash - mkdir paddle - cd paddle + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle mkdir -p build cd build cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON # If you only need to build documents, use the following commands - make -j $processors gen_proto_py - make -j $processors paddle_docs paddle_docs_cn + make -j $processors paddle_docs # If you only need to build APIs, use the following commands - make -j $processors gen_proto_py framework_py_proto - make -j $processors copy_paddle_pybind - make -j $processors paddle_api_docs + make -j $processors paddle_apis $processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine. From abc630ecf9e01f7c09b8833ad25fa60cb9cbc6c8 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Fri, 30 Mar 2018 21:12:29 +0800 Subject: [PATCH 06/62] Adjust descriptions for building fluid docs and api --- doc/v2/dev/write_docs_cn.rst | 4 ++-- doc/v2/dev/write_docs_en.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index 0795b2d146..887d92942e 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -107,13 +107,13 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D 其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 -编译完成后,进入 ``doc/v2`` 目录,如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会生成 ``api/en/html`` 目录,分别进入这些目录下,执行以下命令: +编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令: .. code-block:: bash python -m SimpleHTTPServer 8088 -在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 +在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面,下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 .. image:: src/doc_en.png :align: center diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst index f03daa300f..435bbdb60f 100644 --- a/doc/v2/dev/write_docs_en.rst +++ b/doc/v2/dev/write_docs_en.rst @@ -111,13 +111,13 @@ If you do not wish to use Docker, you can also use the following commands to dir $processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine. -After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs,it will generate``api/en/html`` subdirectory. 
Please enter these directories respectively and execute the following commands:
+After the compilation is complete, there should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
 
 .. code-block:: bash
 
     python -m SimpleHTTPServer 8088
 
-Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
+Use a web browser and navigate to http://localhost:8088, where you can see the compiled ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. The following figure is an example of the built ``v2`` 's English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
 
 .. image:: src/doc_en.png
    :align: center

From 9708b21f191b3ff606651dfaeb7cf65dfd250881 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Mon, 2 Apr 2018 10:51:31 +0800
Subject: [PATCH 07/62] Refine average model option

1. Add attr 'average' into ParamAttr.
2. Make 'params_grads' optional for AverageModel.
3. Add option 'average_mean' and 'average_variance' for batch_normal.

---
 python/paddle/fluid/framework.py  |  4 +++-
 python/paddle/fluid/layers/nn.py  | 12 +++++++++---
 python/paddle/fluid/optimizer.py  | 28 ++++++++++++----------------
 python/paddle/fluid/param_attr.py |  9 ++++++---
 4 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 3e78788f47..92c299a4b6 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1137,6 +1137,8 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.average = kwargs.get('average', True)
+
     def __str__(self):
         return self.to_string(True)
 
@@ -1157,7 +1159,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name, str(getattr(self, attr_name)))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0332556f62..3265ff733b 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1486,7 +1486,9 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               average_mean=True,
+               average_variance=True):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1517,7 +1519,10 @@ def batch_norm(input, mean = helper.create_parameter( attr=ParamAttr( - name=moving_mean_name, initializer=Constant(0.0), trainable=False), + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + average=average_variance), shape=param_shape, dtype=input.dtype) mean.stop_gradient = True @@ -1526,7 +1531,8 @@ def batch_norm(input, attr=ParamAttr( name=moving_variance_name, initializer=Constant(1.0), - trainable=False), + trainable=False, + average=average_mean), shape=param_shape, dtype=input.dtype) variance.stop_gradient = True diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index d21320f705..560257a356 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import re from collections import defaultdict from paddle.fluid.framework import Program import framework @@ -818,8 +818,8 @@ class ModelAverage(Optimizer): min_average_window, max_average_window and current update times. Args: - params_grads: A list of parameter-grad variable pairs. average_window_rate: The rate of average window. + params_grads: A list of parameter-grad variable pairs. min_average_window: The minimum size of average window. max_average_window: The maximum size of average window. @@ -840,8 +840,8 @@ class ModelAverage(Optimizer): """ def __init__(self, - params_grads, - average_window_rate, + average_window_rate=0.15, + params_grads=None, min_average_window=10000, max_average_window=10000, **kwargs): @@ -849,25 +849,21 @@ class ModelAverage(Optimizer): self.average_window = average_window_rate self.min_average_window = min_average_window self.max_average_window = max_average_window - self.params_grads = params_grads - # append 'moving mean' and 'moving variance' to self.params_grads - pattern = re.compile(r"batch_norm_\d+\.w_[1,2]") + self.params_grads = [] if params_grads is None else params_grads + params = {} + for param, grad in self.params_grads: + params[param.name] = (param, grad) for param in framework.default_main_program().global_block( ).all_parameters(): - if pattern.match(param.name) is not None: - self.params_grads.append((param, None)) - # create a tmp gradient variable to backup parameter value - # for parameter whose grad is None - for i, param_grad in enumerate(self.params_grads): - param, grad = param_grad - if grad is None: + if param.name not in params and param.average: grad = param.block.create_var( name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, persistable=False, - stop_gradient=stop_gradient) - self.params_grads[i] = (param, grad) + stop_gradient=True) + params[param.name] = (param, grad) + self.params_grads = params.values() for param, grad in self.params_grads: self._append_average_accumulate_op(param) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 255cd21043..74b968f8ee 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -28,13 +28,15 @@ class ParamAttr(object): learning_rate=1.0, regularizer=None, trainable=True, - gradient_clip=None): + gradient_clip=None, + average=True): self.name = name self.initializer = initializer self.learning_rate = learning_rate self.regularizer = regularizer self.trainable = trainable self.gradient_clip = gradient_clip + self.average = average def 
set_default_initializer(self, initializer): if initializer is None: @@ -80,7 +82,8 @@ class ParamAttr(object): }, 'regularizer': self.regularizer, 'trainable': self.trainable, - 'gradient_clip_attr': self.gradient_clip + 'gradient_clip_attr': self.gradient_clip, + 'average': self.average } if with_initializer: kwargs['initializer'] = self.initializer @@ -90,7 +93,7 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): """ Used for weight normalization. Any field in ParamAttr can also be set here. - Besides, an extra field dim can be set to indicate the dimension except + Besides, an extra field dim can be set to indicate the dimension except which to normalize. """ # List to record the parameters reparameterized by weight normalization. From af242901232464d8a59d26cba9084ffe22562fdf Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 4 Apr 2018 15:05:46 +0800 Subject: [PATCH 08/62] Add 'buffer_size' api for open_files op --- paddle/fluid/operators/reader/open_files_op.cc | 15 ++++++++++----- python/paddle/fluid/layers/io.py | 12 ++++++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index eacedeea88..db4e619e7b 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -38,8 +38,9 @@ class MultipleReader : public framework::ReaderBase { }; MultipleReader(const std::vector& file_names, - const std::vector& dims, size_t thread_num) - : file_names_(file_names), dims_(dims) { + const std::vector& dims, size_t thread_num, + size_t buffer_size) + : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) { prefetchers_.resize(thread_num); StartNewScheduler(); } @@ -60,6 +61,7 @@ class MultipleReader : public framework::ReaderBase { std::vector dims_; std::thread scheduler_; std::vector prefetchers_; + size_t buffer_size_; framework::Channel* waiting_file_idx_; framework::Channel* available_thread_idx_; framework::Channel>* buffer_; @@ -92,7 +94,7 @@ void MultipleReader::StartNewScheduler() { waiting_file_idx_ = framework::MakeChannel(file_names_.size()); available_thread_idx_ = framework::MakeChannel(thread_num); buffer_ = - framework::MakeChannel>(thread_num); + framework::MakeChannel>(buffer_size_); for (size_t i = 0; i < file_names_.size(); ++i) { waiting_file_idx_->Send(&i); @@ -197,11 +199,13 @@ class OpenFilesOp : public framework::OperatorBase { const auto& file_names = Attr>("file_names"); PADDLE_ENFORCE(!file_names.empty(), "No file to be read!"); const size_t thread_num = Attr("thread_num"); + const size_t buffer_size = Attr("buffer_size"); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new MultipleReader( - file_names, RestoreShapes(shape_concat, ranks), thread_num)); + out->Reset(new MultipleReader(file_names, + RestoreShapes(shape_concat, ranks), + thread_num, buffer_size)); } }; @@ -212,6 +216,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase { AddAttr>("file_names", "Files to be read."); AddAttr("thread_num", "The maximal concurrent prefetch thread number.") .GreaterThan(0); + AddAttr("buffer_size", "The size of prefetch buffer.").GreaterThan(0); AddComment(R"DOC( OpenFiles Operator diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index bd7e9c30fe..da5b4853d3 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -287,7 +287,14 @@ def open_recordio_file(filename, shapes, lod_levels, 
dtypes): startup_var) -def open_files(filenames, thread_num, shapes, lod_levels, dtypes): +def open_files(filenames, + shapes, + lod_levels, + dtypes, + thread_num, + buffer_size=None): + if buffer_size is None: + buffer_size = thread_num dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] ranks = [] @@ -308,7 +315,8 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes): 'lod_levels': lod_levels, 'ranks': ranks, 'file_names': filenames, - 'thread_num': thread_num + 'thread_num': thread_num, + 'buffer_size': buffer_size }) startup_var.desc.set_dtypes(dtypes) From 6dcfd97a9285161efa767516d466a084b6a45bed Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 4 Apr 2018 15:35:28 +0800 Subject: [PATCH 09/62] add docstring --- python/paddle/fluid/layers/io.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index da5b4853d3..97ac01b775 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -293,8 +293,40 @@ def open_files(filenames, dtypes, thread_num, buffer_size=None): + """ + Open files + + This layer takes a list of files to read from and returns a Reader Variable. Via the Reader Variable, we can get data from given files. + + Args: + filenames(list): The list of file names. + shapes(list): List of tuples which declaring data shapes. + lod_levels(list): List of ints which declaring data lod_level. + dtypes(list): List of strs which declaring data type. + thread_num(int): The maximal concurrent prefetch thread number. + buffer_size(int): The size of prefetch buffer. + + Returns: + Variable: A Reader Variable via which we can get file data. + + Examples: + .. code-block:: python + + reader = fluid.layers.open_files(filenames=['./data1.recordio', + './data2.recordio'], + shapes=[(3,224,224), (1)], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=2, + buffer_size=2) + + # Via the reader, we can use 'read_file' layer to get data: + image, label = fluid.layers.read_file(reader) + """ if buffer_size is None: buffer_size = thread_num + if isinstance(filenames, basestring): + filenames = [filenames] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] ranks = [] From 2e40660e7a81962a56d89bdd1e2a86d9f78cab35 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 4 Apr 2018 18:13:45 +0800 Subject: [PATCH 10/62] Fix some issues. 
--- python/paddle/fluid/framework.py | 4 ++-- python/paddle/fluid/layers/nn.py | 20 +++++++++++--------- python/paddle/fluid/optimizer.py | 4 ++-- python/paddle/fluid/param_attr.py | 6 +++--- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 370a477932..6120d66c12 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1155,7 +1155,7 @@ class Parameter(Variable): self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) - self.average = kwargs.get('average', True) + self.do_model_average = kwargs.get('do_model_average', None) def __str__(self): return self.to_string(True) @@ -1177,7 +1177,7 @@ class Parameter(Variable): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "gradient_clip_attr", "average") + "gradient_clip_attr", "do_model_average") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, str(getattr(self, attr_name))) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e5ae10636d..37ce738275 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1489,8 +1489,7 @@ def batch_norm(input, name=None, moving_mean_name=None, moving_variance_name=None, - average_mean=True, - average_variance=True): + do_model_average_for_mean_and_var=False): """ This function helps create an operator to implement the BatchNorm layer using the configurations from the input parameters. @@ -1519,12 +1518,15 @@ def batch_norm(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + if do_model_average_for_mean_and_var: + do_model_average_for_mean_and_var = None + mean = helper.create_parameter( attr=ParamAttr( name=moving_mean_name, initializer=Constant(0.0), trainable=False, - average=average_variance), + do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=input.dtype) mean.stop_gradient = True @@ -1534,7 +1536,7 @@ def batch_norm(input, name=moving_variance_name, initializer=Constant(1.0), trainable=False, - average=average_mean), + do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=input.dtype) variance.stop_gradient = True @@ -3352,14 +3354,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): Here are some examples to explain it. 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [6, 8], the reshape operator will transform x into a 2-D tensor with + is [6, 8], the reshape operator will transform x into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape specified is [2, 3, -1, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this - case, one dimension of the target shape is set to -1, the value of this - dimension is inferred from the total element number of x and remaining + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining dimensions. 3. 
Given a 3-D tensor x with a shape [2, 4, 6], and the target shape @@ -3593,7 +3595,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): def pad(x, paddings, pad_value=0., name=None): """ Pads a tensor with a constant value given by :attr:`pad_value`, and the - padded width is specified by :attr:`paddings`. + padded width is specified by :attr:`paddings`. Specifically, the number of values padded before the contents of :attr:`x` in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number @@ -3621,7 +3623,7 @@ def pad(x, paddings, pad_value=0., name=None): x (Variable): The input tensor variable. paddings (list): A list of integers. Its elements specify the padded width before and after for each dimension in turn. - The length of :attr:paddings must be + The length of :attr:paddings must be :math:`rank(x) \\times 2`. pad_value (float): The constant value used to pad. name(str|None): A name for this layer(optional). If set None, the layer diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 560257a356..1917b7d044 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -840,7 +840,7 @@ class ModelAverage(Optimizer): """ def __init__(self, - average_window_rate=0.15, + average_window_rate, params_grads=None, min_average_window=10000, max_average_window=10000, @@ -856,7 +856,7 @@ class ModelAverage(Optimizer): params[param.name] = (param, grad) for param in framework.default_main_program().global_block( ).all_parameters(): - if param.name not in params and param.average: + if param.name not in params and param.do_model_average != False: grad = param.block.create_var( name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 74b968f8ee..1c6970441b 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -29,14 +29,14 @@ class ParamAttr(object): regularizer=None, trainable=True, gradient_clip=None, - average=True): + do_model_average=None): self.name = name self.initializer = initializer self.learning_rate = learning_rate self.regularizer = regularizer self.trainable = trainable self.gradient_clip = gradient_clip - self.average = average + self.model_average = do_model_average def set_default_initializer(self, initializer): if initializer is None: @@ -83,7 +83,7 @@ class ParamAttr(object): 'regularizer': self.regularizer, 'trainable': self.trainable, 'gradient_clip_attr': self.gradient_clip, - 'average': self.average + 'model_average': self.model_average } if with_initializer: kwargs['initializer'] = self.initializer From 442c150333ce169b9e1221c0f2e61af8cfdc1e2b Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 4 Apr 2018 20:35:59 +0800 Subject: [PATCH 11/62] a draft of ThreadedReader --- .../reader/create_threaded_reader_op.cc | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 paddle/fluid/operators/reader/create_threaded_reader_op.cc diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc new file mode 100644 index 0000000000..a4aebafa8b --- /dev/null +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class ThreadedReader : public framework::DecoratedReader { + public: + ThreadedReader(ReaderBase* reader, bool unsafe_mode) + : DecoratedReader(reader), unsafe_mode_(unsafe_mode) {} + + void ReadNext(std::vector* out) override { + std::lock_guard lock(mutex_); + if (!unsafe_mode) { + if (!reader_->HasNext()) { + PADDLE_THROW("There is no next data!"); + } + reader_->ReadNext(out); + } else { + auto& thread_buffer = thread_buffers_[std::this_thread::get_id()]; + if (thread_buffer.empty()) { + PADDLE_THROW( + "thread_buffer is empty! HasNext() must be invoked before " + "ReadNext() in the same thread."); + } + *out = thread_buffer; + thread_buffer.clear(); + } + } + + bool HasNext() const override { + if (!unsafe_mode_) { + PADDLE_THROW( + "ThreadedReader::HasNext() is disabled when 'unsafe_mode' is false."); + } + std::thread::id thread_id = std::this_thread::get_id(); + std::lock_guard lock(mutex_); + auto& thread_buffer = thread_buffers_[thread_id]; + if (thread_buffer.empty() && reader_->HasNext()) { + reader_->ReadNext(&thread_buffer); + } + return !threda_buffer.empty(); + } + + void ReInit() override; + + ~ThreadedReader() { + for (auto& p : thread_buffers_) { + if (!p.second.empty()) { + PADDLE_THROW( + "Find an unused data batch in ThreadedReader! Maybe one thread " + "invokes 'HasNext()' without subsequent 'ReadNext()'."); + } + } + } + + private: + mutable std::mutex mutex_; + mutable std::unordered_map> + thread_buffers_; +}; + +class CreateThreadedReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); + if (out->Get() != nullptr) { + return; + } + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + bool unsafe_mode = Attr("unsafe_mode"); + out->Reset(new ThreadedReader(underlying_reader.Get(), unsafe_mode)); + } +}; + +class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { + public: + CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : DecoratedReaderMakerBase(op_proto, op_checker) { + AddAttr("unsafe_mode", + "When 'unsafe_mode' is false, invoking 'HasNext()' or " + "'ReInit()' is not allowed to avoid unexpected bugs in " + "multi-thread environment.") + .SetDefault(false); + AddComment(R"DOC( + CreateThreadedReader Operator + + This operator creates a threaded reader. A threaded reader's + 'ReadNext()' can be invoked by several threads at the same + time. + When the attribute 'unsafe_mode' is false, the threaded reader's + 'HasNext()' and 'ReInit()' will be disabled to avoid unexpected + bugs in multi-thread environment. If you really need them, you + can enable them by setting 'unsafe_mode' true. 
In this case, + 'HasNext()' returning true only guarantees the safety of + invoking 'ReadNext()' in the same thread. Each thread must + invoke 'HasNext()' and 'ReadNext()' in pair. + )DOC") + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle From 01c6618de904e1d49660486cd65f8810cc9665a3 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Sun, 8 Apr 2018 09:38:26 +0800 Subject: [PATCH 12/62] first wip commit --- .../fluid/framework/details/send_op_handle.cc | 78 +++++++++++++++++++ .../fluid/framework/details/send_op_handle.h | 50 ++++++++++++ paddle/fluid/operators/detail/grpc_client.cc | 3 +- 3 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/framework/details/send_op_handle.cc create mode 100644 paddle/fluid/framework/details/send_op_handle.h diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc new file mode 100644 index 0000000000..bd2a0a9c29 --- /dev/null +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/send_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +SendOpHandle::SendOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap &ctxs) + : local_scopes_(local_scopes), places_(places) {} + +void SendOpHandle::RunImpl() { + if (inputs_.size() == 1) { + return; // No need to all reduce when GPU count = 1; + } else { + // Wait input done + for (auto *in : inputs_) { + auto &p = static_cast(in)->place_; + in->generated_op_->Wait(dev_ctxes_[p]); + } + + auto &var_name = static_cast(this->inputs_[0])->name_; + int dtype = -1; + size_t numel = 0; + + std::vector> all_reduce_calls; + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto *s = local_scopes_[i]; + int dev_id = boost::get(p).device; + + auto &lod_tensor = s->FindVar(var_name)->Get(); + void *buffer = const_cast(lod_tensor.data()); + + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } + + auto &nccl_ctx = nccl_ctxs_.at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); + }); + } + + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } +} + +std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h new file mode 100644 index 0000000000..515f1a10a8 --- /dev/null +++ 
b/paddle/fluid/framework/details/send_op_handle.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +struct SendOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + const platform::NCCLContextMap &nccl_ctxs_; + + SendOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap &ctxs); + + std::string Name() const override; + + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. + bool IsMultiDeviceTransfer() override { return true; }; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index ef987d07f0..3cf286575e 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, } void ProcGetResponse(const VarHandle& var_h, - // const sendrecv::VariableMessage& ret_msg) { const ::grpc::ByteBuffer& ret_msg) { - framework::Variable* outvar = NULL; + framework::Variable* outvar = nullptr; DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); } From 8fed780f14bf24954300ba37cebd2338ee7d199c Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sun, 8 Apr 2018 11:26:06 +0800 Subject: [PATCH 13/62] Complete threaded reader --- paddle/fluid/operators/reader/CMakeLists.txt | 1 + .../operators/reader/create_threaded_reader_op.cc | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 6fa0195b9a..845528860f 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) +reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) # Export local libraries to parent set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index a4aebafa8b..489866ca80 100644 --- 
a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -57,7 +57,15 @@ class ThreadedReader : public framework::DecoratedReader { return !threda_buffer.empty(); } - void ReInit() override; + void ReInit() override { + if (!unsafe_mode_) { + PADDLE_THROW( + "ThreadedReader::ReInit() is disabled when 'unsafe_mode' is false."); + } + VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in " + "multi-thread environment."; + reader_->ReInit(); + } ~ThreadedReader() { for (auto& p : thread_buffers_) { @@ -123,3 +131,8 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { } // namespace reader } // namespace operators } // namespace paddle + +namespace reader = paddle::operators::reader; +REGISTER_FILE_READER_OPERATOR(create_threaded_reader, + reader::CreateThreadedReaderOp, + reader::CreateThreadedReaderOpMaker); From 03ff0e58fe433496330801627e0ae2f15e21df20 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Sun, 8 Apr 2018 04:25:56 +0000 Subject: [PATCH 14/62] fix compile errors --- paddle/fluid/operators/reader/create_threaded_reader_op.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 489866ca80..565cbe4d9f 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -26,7 +26,7 @@ class ThreadedReader : public framework::DecoratedReader { void ReadNext(std::vector* out) override { std::lock_guard lock(mutex_); - if (!unsafe_mode) { + if (!unsafe_mode_) { if (!reader_->HasNext()) { PADDLE_THROW("There is no next data!"); } @@ -54,7 +54,7 @@ class ThreadedReader : public framework::DecoratedReader { if (thread_buffer.empty() && reader_->HasNext()) { reader_->ReadNext(&thread_buffer); } - return !threda_buffer.empty(); + return !thread_buffer.empty(); } void ReInit() override { @@ -78,6 +78,7 @@ class ThreadedReader : public framework::DecoratedReader { } private: + bool unsafe_mode_; mutable std::mutex mutex_; mutable std::unordered_map> thread_buffers_; @@ -124,7 +125,7 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { 'HasNext()' returning true only guarantees the safety of invoking 'ReadNext()' in the same thread. Each thread must invoke 'HasNext()' and 'ReadNext()' in pair. - )DOC") + )DOC"); } }; From 49ab52d64d8aced5da6d4eedd34773baebae5546 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sun, 8 Apr 2018 13:01:26 +0800 Subject: [PATCH 15/62] Modify MultipleReader 1. Removes MultipleReader's multi-thread support, for we have got ThreadedReader. 2. Rename MultipleReader to MultiFileReader --- .../reader/create_threaded_reader_op.cc | 2 +- .../fluid/operators/reader/open_files_op.cc | 66 +++++++------------ 2 files changed, 24 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 565cbe4d9f..854381e0ee 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -124,7 +124,7 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { can enable them by setting 'unsafe_mode' true. In this case, 'HasNext()' returning true only guarantees the safety of invoking 'ReadNext()' in the same thread. 
Each thread must - invoke 'HasNext()' and 'ReadNext()' in pair. + invoke 'HasNext()' and 'ReadNext()' in pairs. )DOC"); } }; diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index db4e619e7b..45db94e780 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -19,27 +19,11 @@ namespace paddle { namespace operators { namespace reader { -class MultipleReader : public framework::ReaderBase { +class MultiFileReader : public framework::ReaderBase { public: - class ThreadBufferMap { - public: - std::vector& operator[]( - const std::thread::id& thread_id) { - std::lock_guard lock(mutex_); - return buffer_[thread_id]; - } - - void Clear() { buffer_.clear(); } - - private: - std::mutex mutex_; - std::unordered_map> - buffer_; - }; - - MultipleReader(const std::vector& file_names, - const std::vector& dims, size_t thread_num, - size_t buffer_size) + MultiFileReader(const std::vector& file_names, + const std::vector& dims, size_t thread_num, + size_t buffer_size) : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) { prefetchers_.resize(thread_num); StartNewScheduler(); @@ -49,7 +33,7 @@ class MultipleReader : public framework::ReaderBase { bool HasNext() const override; void ReInit() override; - ~MultipleReader() { EndScheduler(); } + ~MultiFileReader() { EndScheduler(); } private: void StartNewScheduler(); @@ -65,31 +49,27 @@ class MultipleReader : public framework::ReaderBase { framework::Channel* waiting_file_idx_; framework::Channel* available_thread_idx_; framework::Channel>* buffer_; - mutable ThreadBufferMap thread_buffer_map_; }; -void MultipleReader::ReadNext(std::vector* out) { +void MultiFileReader::ReadNext(std::vector* out) { if (!HasNext()) { PADDLE_THROW("There is no next data!"); } - auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()]; - *out = thread_local_buffer; - thread_local_buffer.clear(); + buffer_->Receive(out); } -bool MultipleReader::HasNext() const { - auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()]; - return thread_local_buffer.empty() ? 
buffer_->Receive(&thread_local_buffer) - : true; +bool MultiFileReader::HasNext() const { + while (!buffer_->IsClosed() && !buffer_->CanReceive()) { + } + return buffer_->CanReceive(); } -void MultipleReader::ReInit() { +void MultiFileReader::ReInit() { EndScheduler(); - thread_buffer_map_.Clear(); StartNewScheduler(); } -void MultipleReader::StartNewScheduler() { +void MultiFileReader::StartNewScheduler() { size_t thread_num = prefetchers_.size(); waiting_file_idx_ = framework::MakeChannel(file_names_.size()); available_thread_idx_ = framework::MakeChannel(thread_num); @@ -107,7 +87,7 @@ void MultipleReader::StartNewScheduler() { scheduler_ = std::thread([this] { ScheduleThreadFunc(); }); } -void MultipleReader::EndScheduler() { +void MultiFileReader::EndScheduler() { available_thread_idx_->Close(); buffer_->Close(); waiting_file_idx_->Close(); @@ -119,8 +99,8 @@ void MultipleReader::EndScheduler() { delete waiting_file_idx_; } -void MultipleReader::ScheduleThreadFunc() { - VLOG(5) << "MultipleReader schedule thread starts."; +void MultiFileReader::ScheduleThreadFunc() { + VLOG(5) << "MultiFileReader schedule thread starts."; size_t completed_thread_num = 0; size_t thread_idx; while (available_thread_idx_->Receive(&thread_idx)) { @@ -152,11 +132,11 @@ void MultipleReader::ScheduleThreadFunc() { p.join(); } } - VLOG(5) << "MultipleReader schedule thread terminates."; + VLOG(5) << "MultiFileReader schedule thread terminates."; } -void MultipleReader::PrefetchThreadFunc(std::string file_name, - size_t thread_idx) { +void MultiFileReader::PrefetchThreadFunc(std::string file_name, + size_t thread_idx) { VLOG(5) << "The prefetch thread of file '" << file_name << "' starts."; std::unique_ptr reader = CreateReaderByFileName(file_name, dims_); @@ -203,9 +183,9 @@ class OpenFilesOp : public framework::OperatorBase { auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new MultipleReader(file_names, - RestoreShapes(shape_concat, ranks), - thread_num, buffer_size)); + out->Reset(new MultiFileReader(file_names, + RestoreShapes(shape_concat, ranks), + thread_num, buffer_size)); } }; @@ -221,7 +201,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase { AddComment(R"DOC( OpenFiles Operator - An OpenFilesOp creates a MultipleReader, which is able to + An OpenFilesOp creates a MultiFileReader, which is able to read data multi-threaded from multiple files. )DOC"); } From fca9e8847d5017601251ee8813e7af513b2603ed Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sun, 8 Apr 2018 16:10:14 +0800 Subject: [PATCH 16/62] Update Readers Python API 1. Combine 'open_files', 'multi_pass_reader' and 'threaded_reader' together to make the new 'open_files' interface. 2. Add some docstring. 3. Simplify interface names of 'create_XXX_reader', e.g, rename 'create_double_buffer_reader' to 'double_buffer'. 
--- python/paddle/fluid/layers/io.py | 109 ++++++++++++++++++++++++------- 1 file changed, 85 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 7413e69234..fc8809ce15 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -22,7 +22,7 @@ from ..executor import global_scope __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'create_shuffle_reader', - 'create_double_buffer_reader', 'create_multi_pass_reader' + 'create_double_buffer_reader' ] @@ -283,7 +283,43 @@ def _copy_reader_create_op_(block, op): return new_op -def open_recordio_file(filename, shapes, lod_levels, dtypes): +def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=False): + """ + Open a RecordIO file + + This layer takes a RecordIO file to read from and returns a Reader Variable. + Via the Reader Variable, we can get data from the given RecordIO file. + + Args: + filename(str): The RecordIO file's name. + shapes(list): List of tuples which declaring data shapes. + lod_levels(list): List of ints which declaring data lod_level. + dtypes(list): List of strs which declaring data type. + pass_num(int): Number of passes to run. After completing the + given number of passes, 'has_next()' will return False. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. + + Returns: + Variable: A Reader Variable via which we can get RecordIO file data. + + Examples: + .. code-block:: python + + reader = fluid.layers.io.open_recordio_file( + filename='./data.recordio', + shapes=[(3,224,224), (1)], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + + # Via the reader, we can use 'read_file' layer to get data: + image, label = fluid.layers.read_file(reader) + """ dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] ranks = [] @@ -310,6 +346,13 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): startup_var.persistable = True main_prog_var = _copy_reader_var_(default_main_program().current_block(), startup_var) + + if pass_num > 1: + main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + + if for_parallel: + main_prog_var = for_parallel(reader=main_prog_var) + return monkey_patch_reader_methods(main_prog_var) @@ -318,11 +361,15 @@ def open_files(filenames, lod_levels, dtypes, thread_num, - buffer_size=None): + buffer_size=None, + pass_num=1, + for_parallel=False): """ Open files - This layer takes a list of files to read from and returns a Reader Variable. Via the Reader Variable, we can get data from given files. + This layer takes a list of files to read from and returns a Reader Variable. + Via the Reader Variable, we can get data from given files. All files must + have name suffixs to indicate their formats, e.g., '*.recordio'. Args: filenames(list): The list of file names. @@ -331,6 +378,10 @@ def open_files(filenames, dtypes(list): List of strs which declaring data type. thread_num(int): The maximal concurrent prefetch thread number. buffer_size(int): The size of prefetch buffer. + pass_num(int): Number of passes to run. After completing the + given number of passes, 'has_next()' will return False. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. Returns: Variable: A Reader Variable via which we can get file data. @@ -338,16 +389,16 @@ def open_files(filenames, Examples: .. 
code-block:: python - reader = fluid.layers.open_files(filenames=['./data1.recordio', + reader = fluid.layers.io.open_files(filenames=['./data1.recordio', './data2.recordio'], - shapes=[(3,224,224), (1)], - lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=2, - buffer_size=2) + shapes=[(3,224,224), (1)], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=2, + buffer_size=2) # Via the reader, we can use 'read_file' layer to get data: - image, label = fluid.layers.read_file(reader) + image, label = fluid.layers.io.read_file(reader) """ if buffer_size is None: buffer_size = thread_num @@ -361,13 +412,12 @@ def open_files(filenames, shape_concat.extend(shape) ranks.append(len(shape)) - var_name = unique_name('multiple_reader') - + multi_file_reader_name = unique_name('multi_file_reader') startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) + startup_reader = startup_blk.create_var(name=multi_file_reader_name) startup_blk.append_op( type='open_files', - outputs={'Out': [startup_var]}, + outputs={'Out': [startup_reader]}, attrs={ 'shape_concat': shape_concat, 'lod_levels': lod_levels, @@ -377,14 +427,21 @@ def open_files(filenames, 'buffer_size': buffer_size }) - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) - return monkey_patch_reader_methods(main_prog_var) + startup_reader.desc.set_dtypes(dtypes) + startup_reader.persistable = True + main_prog_reader = _copy_reader_var_(default_main_program().current_block(), + startup_reader) + if pass_num > 1: + main_prog_reader = multi_pass( + reader=main_prog_reader, pass_num=pass_num) + if for_parallel: + main_prog_reader = for_parallel(reader=main_prog_reader) -def __create_decorated_reader__(op_type, reader, attrs): + return monkey_patch_reader_methods(main_prog_reader) + + +def __create_decorated_reader__(op_type, reader, attrs={}): var_name = unique_name(op_type) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=var_name) @@ -400,12 +457,12 @@ def __create_decorated_reader__(op_type, reader, attrs): return monkey_patch_reader_methods(main_prog_var) -def create_shuffle_reader(reader, buffer_size): +def shuffle(reader, buffer_size): return __create_decorated_reader__('create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) -def create_double_buffer_reader(reader, place=None): +def double_buffer(reader, place=None): attrs = dict() if place is not None: attrs['place'] = str(place).upper() @@ -413,11 +470,15 @@ def create_double_buffer_reader(reader, place=None): attrs) -def create_multi_pass_reader(reader, pass_num): +def multi_pass(reader, pass_num): return __create_decorated_reader__('create_multi_pass_reader', reader, {'pass_num': int(pass_num)}) +def for_parallel(reader): + return __create_decorated_reader__('create_threaded_reader', reader) + + def read_file(file_obj): helper = LayerHelper('read_file') out = [ From 5ad2486905214e658a0ef8f54e9b447c1fec03b2 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Sun, 8 Apr 2018 09:15:58 +0000 Subject: [PATCH 17/62] fix errors --- python/paddle/fluid/layers/io.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index fc8809ce15..dbba1a46eb 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -21,8 +21,7 @@ from ..executor import 
global_scope __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', - 'open_files', 'read_file', 'create_shuffle_reader', - 'create_double_buffer_reader' + 'open_files', 'read_file', 'shuffle', 'double_buffer' ] From baea2cf17892f2cba47c8bde29bccd7488c2ee52 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Sun, 8 Apr 2018 18:35:49 +0800 Subject: [PATCH 18/62] wip --- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../details/multi_devices_graph_builder.cc | 59 +++++++++++++---- .../details/multi_devices_graph_builder.h | 14 ++++- .../fluid/framework/details/send_op_handle.cc | 63 ++++--------------- .../fluid/framework/details/send_op_handle.h | 15 ++--- python/paddle/fluid/framework.py | 7 +++ 6 files changed, 87 insertions(+), 72 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 89b5c6847f..caaf418076 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,6 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 128a5344fb..bea9489bbd 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/details/send_op_handle.h" #include "paddle/fluid/framework/scope.h" #ifdef PADDLE_WITH_CUDA @@ -34,26 +35,46 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs) + platform::NCCLContextMap *nccl_ctxs, bool distributed) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes), + distributed_(distributed), nccl_ctxs_(nccl_ctxs) { #else MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes) + const std::vector &local_scopes, bool distributed) : loss_var_name_(loss_var_name), places_(places), - local_scopes_(local_scopes) { + local_scopes_(local_scopes), + distributed_(distributed) { #endif for (auto &p : params) { grad_names_.insert(GradVarName(p)); } } +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op, + const platform::Place &p, + const size_t &i) const { + auto *op_handle = result->ops_.back().get(); + + auto var_names = op->InputArgumentNames(); + + for (auto &each_var_name : var_names) { + VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i); + op_handle->AddInput(var); + } + var_names = 
op->OutputArgumentNames(); + + for (auto &each_var_name : var_names) { + CreateOpOutput(result, op_handle, each_var_name, p, i); + } +} + std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { auto graph = new SSAGraph(); @@ -72,6 +93,17 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } } + // append send op if program is distributed trainer main program. + // always use the first device + if (is_forwarding && distributed_ && op->Type() == "send") { + auto &p = places_[0]; + auto *s = local_scopes_[0]; + size_t i = 0; + result.ops_.emplace_back(new SendOpHandle(*op, s, p)); + CreateOpHandleIOs(&result, op, p, i); + continue; + } + for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; auto *s = local_scopes_[i]; @@ -81,18 +113,19 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( op_handle->dev_ctxes_[p] = const_cast( platform::DeviceContextPool::Instance().Get(p)); - auto var_names = op->InputArgumentNames(); + CreateOpHandleIOs(&result, op, p, i); + // auto var_names = op->InputArgumentNames(); - for (auto &each_var_name : var_names) { - VarHandle *var = - CreateOrGetLatestVarHandle(&result, each_var_name, p, i); - op_handle->AddInput(var); - } - var_names = op->OutputArgumentNames(); + // for (auto &each_var_name : var_names) { + // VarHandle *var = + // CreateOrGetLatestVarHandle(&result, each_var_name, p, i); + // op_handle->AddInput(var); + // } + auto var_names = op->OutputArgumentNames(); - for (auto &each_var_name : var_names) { - CreateOpOutput(&result, op_handle, each_var_name, p, i); - } + // for (auto &each_var_name : var_names) { + // CreateOpOutput(&result, op_handle, each_var_name, p, i); + // } if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name_) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index d3c8e582cf..004d6d50ab 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -14,6 +14,9 @@ #pragma once +#include +#include + #include "paddle/fluid/framework/details/ssa_graph_builder.h" namespace paddle { @@ -31,21 +34,28 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs); + platform::NCCLContextMap *nccl_ctxs, + bool distributed = false); #else MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes); + const std::vector &local_scopes, + bool distributed = false); #endif std::unique_ptr Build(const ProgramDesc &program) const override; + private: + void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p, + const size_t &i) const; + private: std::string loss_var_name_; const std::vector &places_; const std::vector &local_scopes_; std::unordered_set grad_names_; + bool distributed_; #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index bd2a0a9c29..ae5637b804 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -18,61 +18,24 @@ namespace paddle { namespace framework { namespace details { -SendOpHandle::SendOpHandle(const std::vector &local_scopes, - const std::vector &places, - 
const platform::NCCLContextMap &ctxs) - : local_scopes_(local_scopes), places_(places) {} +SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc, + const Scope *local_scope, + const platform::Place &place) + : op_(framework::OpRegistry::CreateOp(op_desc)), + local_scope_(local_scope), + place_(place) {} void SendOpHandle::RunImpl() { - if (inputs_.size() == 1) { - return; // No need to all reduce when GPU count = 1; - } else { - // Wait input done - for (auto *in : inputs_) { - auto &p = static_cast(in)->place_; - in->generated_op_->Wait(dev_ctxes_[p]); - } - - auto &var_name = static_cast(this->inputs_[0])->name_; - int dtype = -1; - size_t numel = 0; - - std::vector> all_reduce_calls; - - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto *s = local_scopes_[i]; - int dev_id = boost::get(p).device; - - auto &lod_tensor = s->FindVar(var_name)->Get(); - void *buffer = const_cast(lod_tensor.data()); - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } - - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); - } - - auto &nccl_ctx = nccl_ctxs_.at(dev_id); - auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), ncclSum, - comm, stream)); - }); - } - - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } + // Wait input done + for (auto *in : inputs_) { + auto &p = static_cast(in)->place_; + in->generated_op_->Wait(dev_ctxes_[p]); } + + op_->Run(*local_scope_, place_); } -std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; } +std::string SendOpHandle::Name() const { return "send"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h index 515f1a10a8..e7857c1f23 100644 --- a/paddle/fluid/framework/details/send_op_handle.h +++ b/paddle/fluid/framework/details/send_op_handle.h @@ -19,6 +19,8 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -27,19 +29,18 @@ namespace framework { namespace details { struct SendOpHandle : public OpHandleBase { - const std::vector &local_scopes_; - const std::vector &places_; - const platform::NCCLContextMap &nccl_ctxs_; + std::unique_ptr op_; + const Scope* local_scope_; + const platform::Place& place_; - SendOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs); + SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, + const platform::Place& place); std::string Name() const override; // Delay and buffer nccl_all_reduce together can significantly increase // performance. Disable this feature by returning false. 
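  // Rationale (inferred from the change below, not stated in the patch): the
  // reworked SendOpHandle simply wraps the ordinary "send" operator and runs
  // it once on a single place, so it is no longer a multi-device transfer and
  // the override now returns false.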
- bool IsMultiDeviceTransfer() override { return true; }; + bool IsMultiDeviceTransfer() override { return false; }; protected: void RunImpl() override; diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 39d4017861..8bd9161fcb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -951,6 +951,13 @@ class Block(object): if var.type == core.VarDesc.VarType.STEP_SCOPES: ret_var = self.create_var( name=var.name, persistable=var.persistable, type=var.type) + elif var.type == core.VarDesc.VarType.SELECTED_ROWS: + ret_var = self.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + persistable=True) else: ret_var = self.create_var( name=var.name, From 3f90a583b4e5f8a3534b03fb9ed83280ac2d69e4 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Mon, 9 Apr 2018 04:35:25 +0000 Subject: [PATCH 19/62] update unittest --- python/paddle/fluid/tests/unittests/test_multi_pass_reader.py | 2 +- python/paddle/fluid/tests/unittests/test_recordio_reader.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py index 0b7a290759..c8a8afbea6 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py @@ -44,7 +44,7 @@ class TestMultipleReader(unittest.TestCase): shapes=[(-1, 784), (-1, 1)], lod_levels=[0, 0], dtypes=['float32', 'int64']) - data_file = fluid.layers.create_multi_pass_reader( + data_file = fluid.layers.io.multi_pass( reader=data_file, pass_num=self.pass_num) img, label = fluid.layers.read_file(data_file) diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 24a0074d9b..096d99a3f3 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -74,8 +74,8 @@ class TestRecordIO(unittest.TestCase): self.assertLess(avg_loss_np[-1], avg_loss_np[0]) def test_shuffle_reader(self): - self.test_main(decorator_callback=lambda reader: fluid.layers.create_shuffle_reader(reader, buffer_size=200)) + self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200)) def test_double_buffer_reader(self): - self.test_main(decorator_callback=lambda reader: fluid.layers.create_double_buffer_reader(reader, + self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader, place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu')) From 5416bac5d84b1d846744481505749df0a87db133 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 9 Apr 2018 15:47:46 +0800 Subject: [PATCH 20/62] Make shared decorated readers' creater be only in main_program --- .../reader/create_double_buffer_reader_op.cc | 9 ++++-- python/paddle/fluid/layers/io.py | 30 ++++++++++++++----- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index ed868786ab..d9f799f14d 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -109,7 +109,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { auto place_str = Attr("place"); platform::Place place; - if (place_str == "CPU") { + 
if (place_str == "AUTO") { + place = dev_place; + } else if (place_str == "CPU") { place = platform::CPUPlace(); } else { std::istringstream sin(place_str); @@ -140,8 +142,9 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { enum_range.insert(string::Sprintf("CUDA:%d", i)); } enum_range.insert("CPU"); - AddAttr("place", "The double buffer place, default is CPU") - .SetDefault("CPU") + enum_range.insert("AUTO"); + AddAttr("place", "The double buffer place") + .SetDefault("AUTO") .InEnum({enum_range}); } }; diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index dbba1a46eb..4901521db5 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -440,7 +440,7 @@ def open_files(filenames, return monkey_patch_reader_methods(main_prog_reader) -def __create_decorated_reader__(op_type, reader, attrs={}): +def __create_unshared_decorated_reader__(op_type, reader, attrs={}): var_name = unique_name(op_type) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=var_name) @@ -456,26 +456,40 @@ def __create_decorated_reader__(op_type, reader, attrs={}): return monkey_patch_reader_methods(main_prog_var) +def __create_shared_decorated_reader__(op_type, reader, attrs={}): + new_reader_name = unique_name(op_type) + main_blk = default_main_program().current_block() + new_reader = main_blk.create_var(name=new_reader_name) + main_blk.append_op( + type=op_type, + inputs={'UnderlyingReader': reader}, + outputs={'Out': [new_reader]}, + attrs=attrs) + new_reader.persistable = True + new_reader.stop_gradient = True + return monkey_patch_reader_methods(new_reader) + + def shuffle(reader, buffer_size): - return __create_decorated_reader__('create_shuffle_reader', reader, - {'buffer_size': int(buffer_size)}) + return __create_unshared_decorated_reader__( + 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) def double_buffer(reader, place=None): attrs = dict() if place is not None: attrs['place'] = str(place).upper() - return __create_decorated_reader__('create_double_buffer_reader', reader, - attrs) + return __create_unshared_decorated_reader__('create_double_buffer_reader', + reader, attrs) def multi_pass(reader, pass_num): - return __create_decorated_reader__('create_multi_pass_reader', reader, - {'pass_num': int(pass_num)}) + return __create_shared_decorated_reader__( + 'create_multi_pass_reader', reader, {'pass_num': int(pass_num)}) def for_parallel(reader): - return __create_decorated_reader__('create_threaded_reader', reader) + return __create_shared_decorated_reader__('create_threaded_reader', reader) def read_file(file_obj): From 0586b9e5c95a7368fc944da96fc2bfac6796b07e Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Mon, 9 Apr 2018 15:49:15 +0800 Subject: [PATCH 21/62] Add *Initializer to fluid api docs --- python/paddle/fluid/initializer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 927f1e625a..11015b6127 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -17,8 +17,7 @@ import numpy as np import contextlib __all__ = [ - 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', - 'init_on_cpu' + 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'XavierInitializer', 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu','init_on_cpu' ] _force_init_on_cpu_ = False From 
9bec0d26dbbe31c057426c1f88af64dd8c907e1e Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Mon, 9 Apr 2018 16:50:10 +0800 Subject: [PATCH 22/62] Adjust some contents --- doc/v2/dev/write_docs_cn.rst | 8 ++++---- doc/v2/dev/write_docs_en.rst | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index 887d92942e..4231f2bb5c 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -81,13 +81,13 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D 注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。 -编译完成后,进入 ``paddle/build/doc/v2`` 目录,该目录下生成了 ``cn/html/`` 、 ``en/html`` 以及 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令: +编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令: .. code-block:: bash python -m SimpleHTTPServer 8088 -在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面。 +在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。 如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即 @@ -107,13 +107,13 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D 其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 -编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令: +编译完成后,同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令: .. code-block:: bash python -m SimpleHTTPServer 8088 -在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面,下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 +在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 .. image:: src/doc_en.png :align: center diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst index 435bbdb60f..6105455e20 100644 --- a/doc/v2/dev/write_docs_en.rst +++ b/doc/v2/dev/write_docs_en.rst @@ -84,13 +84,13 @@ Build PaddlePaddle's documentation with Docker,you need to install Docker firs Note: The above commands maps the current directory (source root directory) to the :code:`/paddle` directory in the container. -After compiling, you could enter the ``paddle/build/doc/v2`` directory, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands: +After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands: .. code-block:: bash python -m SimpleHTTPServer 8088 -Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. +Use a web browser and navigate to http://localhost:8000, you could see the compiled ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation. @@ -111,7 +111,7 @@ If you do not wish to use Docker, you can also use the following commands to dir $processors indicates that as many processes as the CPU cores are started to compile in parallel. 
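For instance, on a machine with 8 CPU cores one would simply pass ``8`` as ``$processors`` (the value here is only illustrative).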
It should be set according to the number of CPU cores of your machine. -After the compilation is complete, there should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both two directories. If you chose to build APIs,a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands: +After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both two directories. If you chose to build APIs,a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands: .. code-block:: bash From cd41dca2ab9f5d98d6817147df2e71af152150f5 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Mon, 9 Apr 2018 18:22:57 +0800 Subject: [PATCH 23/62] Adjust --- python/paddle/fluid/initializer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 11015b6127..ad15c32a35 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -17,7 +17,8 @@ import numpy as np import contextlib __all__ = [ - 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'XavierInitializer', 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu','init_on_cpu' + 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', + 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'XavierInitializer' ] _force_init_on_cpu_ = False From 9fe938cb2aefcbced1e60fa459c943fa2ea245e6 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 03:48:26 +0000 Subject: [PATCH 24/62] Changing network configuration, avoid nan --- .../fluid/tests/book/test_label_semantic_roles.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index c0a6df831a..5fc64ea958 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -77,7 +77,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -94,8 +94,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=hidden_dim), - fluid.layers.fc(input=input_tmp[1], size=hidden_dim) + fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh') ]) lstm = fluid.layers.dynamic_lstm( @@ -109,8 +109,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ - fluid.layers.fc(input=input_tmp[0], size=label_dict_len), - fluid.layers.fc(input=input_tmp[1], size=label_dict_len) + fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), + 
fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') ]) return feature_out @@ -171,7 +171,7 @@ def train(use_cuda, save_dirname=None, is_local=True): # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( - learning_rate=0.0001, + learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True)) From ee178d5aebaa48f3434ec577ab1b450f4e3d7eab Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Tue, 10 Apr 2018 04:41:39 +0000 Subject: [PATCH 25/62] fix bugs --- paddle/fluid/framework/parallel_executor.cc | 4 +--- paddle/fluid/operators/read_op.cc | 7 ------- .../operators/reader/create_threaded_reader_op.cc | 8 ++++---- python/paddle/fluid/layers/io.py | 13 +++++++------ 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 74945fb4f2..1bb089c344 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -115,14 +115,12 @@ void ParallelExecutor::BCastParamsToGPUs( for (auto &var : vars) { auto *main_var = main_scope->FindVar(var); - if (!main_var->IsType()) { + if (main_var == nullptr || !main_var->IsType()) { continue; } auto &main_tensor = main_var->Get(); - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 2925b8a85d..4496110cf8 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -66,13 +66,6 @@ class ReadOp : public framework::OperatorBase { std::vector out_arg_names = Outputs("Out"); std::vector ins; reader->ReadNext(&ins); - if (ins.empty()) { - reader->ReInit(); - reader->ReadNext(&ins); - PADDLE_ENFORCE( - !ins.empty(), - "Reader can not read the next data even it has been re-initialized."); - } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < ins.size(); ++i) { auto* out = diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 854381e0ee..7b10135afc 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -111,7 +111,7 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { "When 'unsafe_mode' is false, invoking 'HasNext()' or " "'ReInit()' is not allowed to avoid unexpected bugs in " "multi-thread environment.") - .SetDefault(false); + .SetDefault(true); AddComment(R"DOC( CreateThreadedReader Operator @@ -134,6 +134,6 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { } // namespace paddle namespace reader = paddle::operators::reader; -REGISTER_FILE_READER_OPERATOR(create_threaded_reader, - reader::CreateThreadedReaderOp, - reader::CreateThreadedReaderOpMaker); +REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader, + reader::CreateThreadedReaderOp, + reader::CreateThreadedReaderOpMaker); diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 4901521db5..d016ab9008 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -350,7 +350,7 @@ def open_recordio_file(filename, main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) if for_parallel: - 
main_prog_var = for_parallel(reader=main_prog_var) + main_prog_var = parallelize(reader=main_prog_var) return monkey_patch_reader_methods(main_prog_var) @@ -435,12 +435,12 @@ def open_files(filenames, reader=main_prog_reader, pass_num=pass_num) if for_parallel: - main_prog_reader = for_parallel(reader=main_prog_reader) + main_prog_reader = parallelize(reader=main_prog_reader) return monkey_patch_reader_methods(main_prog_reader) -def __create_unshared_decorated_reader__(op_type, reader, attrs={}): +def __create_shared_decorated_reader__(op_type, reader, attrs): var_name = unique_name(op_type) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=var_name) @@ -456,7 +456,7 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs={}): return monkey_patch_reader_methods(main_prog_var) -def __create_shared_decorated_reader__(op_type, reader, attrs={}): +def __create_unshared_decorated_reader__(op_type, reader, attrs): new_reader_name = unique_name(op_type) main_blk = default_main_program().current_block() new_reader = main_blk.create_var(name=new_reader_name) @@ -488,8 +488,9 @@ def multi_pass(reader, pass_num): 'create_multi_pass_reader', reader, {'pass_num': int(pass_num)}) -def for_parallel(reader): - return __create_shared_decorated_reader__('create_threaded_reader', reader) +def parallelize(reader): + return __create_shared_decorated_reader__('create_threaded_reader', reader, + {}) def read_file(file_obj): From d9a52223852a92d532ff2522cb648758511abe26 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 04:57:30 +0000 Subject: [PATCH 26/62] code style --- .../tests/book/test_label_semantic_roles.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 5fc64ea958..4f5d30ac00 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,14 +70,15 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr( - name=embedding_name, trainable=False)) for x in word_input + param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) + for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ - fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim, act='tanh') + for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) @@ -163,8 +164,7 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,8 +189,7 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -223,24 +222,25 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 
10 == 0: - print("avg_cost:" + str(cost) + " precision:" + str( - precision) + " recall:" + str(recall) + " f1_score:" + - str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str( - pass_recall) + " pass_f1_score:" + str( - pass_f1_score)) + print( + "avg_cost:" + str(cost) + " precision:" + + str(precision) + " recall:" + str(recall) + + " f1_score:" + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str(pass_recall) + + " pass_f1_score:" + str(pass_f1_score)) if batch_id != 0: - print("second per batch: " + str((time.time( - ) - start_time) / batch_id)) + print("second per batch: " + str( + (time.time() - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model(save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model( + save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,19 +320,20 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run(inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run( + inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From 93940642ac1f52d28743a9dbdc2940a4d986f612 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Tue, 10 Apr 2018 14:28:50 +0800 Subject: [PATCH 27/62] Adjust --- python/paddle/fluid/initializer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index ad15c32a35..4e132ed261 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -18,7 +18,8 @@ import contextlib __all__ = [ 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', - 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'XavierInitializer' + 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', + 'NormalInitializer', 'XavierInitializer' ] _force_init_on_cpu_ = False From 7c1434dd73d367932e98ae569093183d33b7e5fb Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 07:36:15 +0000 Subject: [PATCH 28/62] code style --- .../tests/book/test_label_semantic_roles.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 4f5d30ac00..ace2e39ba4 100644 --- 
a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -70,8 +70,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr=fluid.ParamAttr(name=embedding_name, trainable=False)) - for x in word_input + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -164,7 +164,8 @@ def train(use_cuda, save_dirname=None, is_local=True): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr)) + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(crf_cost) # TODO(qiao) @@ -189,7 +190,8 @@ def train(use_cuda, save_dirname=None, is_local=True): num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( - paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192), + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -222,25 +224,24 @@ def train(use_cuda, save_dirname=None, is_local=True): exe) if batch_id % 10 == 0: - print( - "avg_cost:" + str(cost) + " precision:" + - str(precision) + " recall:" + str(recall) + - " f1_score:" + str(f1_score) + " pass_precision:" + str( - pass_precision) + " pass_recall:" + str(pass_recall) - + " pass_f1_score:" + str(pass_f1_score)) + print("avg_cost:" + str(cost) + " precision:" + str( + precision) + " recall:" + str(recall) + " f1_score:" + + str(f1_score) + " pass_precision:" + str( + pass_precision) + " pass_recall:" + str( + pass_recall) + " pass_f1_score:" + str( + pass_f1_score)) if batch_id != 0: - print("second per batch: " + str( - (time.time() - start_time) / batch_id)) + print("second per batch: " + str((time.time( + ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test if float(pass_precision) > 0.05: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode - fluid.io.save_inference_model( - save_dirname, [ - 'word_data', 'verb_data', 'ctx_n2_data', - 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', - 'ctx_p2_data', 'mark_data' - ], [feature_out], exe) + fluid.io.save_inference_model(save_dirname, [ + 'word_data', 'verb_data', 'ctx_n2_data', + 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data', + 'ctx_p2_data', 'mark_data' + ], [feature_out], exe) return batch_id = batch_id + 1 @@ -320,20 +321,19 @@ def infer(use_cuda, save_dirname=None): assert feed_target_names[6] == 'ctx_p2_data' assert feed_target_names[7] == 'mark_data' - results = exe.run( - inference_program, - feed={ - feed_target_names[0]: word, - feed_target_names[1]: pred, - feed_target_names[2]: ctx_n2, - feed_target_names[3]: ctx_n1, - feed_target_names[4]: ctx_0, - feed_target_names[5]: ctx_p1, - feed_target_names[6]: ctx_p2, - feed_target_names[7]: mark - }, - fetch_list=fetch_targets, - return_numpy=False) + results = exe.run(inference_program, + feed={ + feed_target_names[0]: word, + feed_target_names[1]: pred, + feed_target_names[2]: ctx_n2, + feed_target_names[3]: ctx_n1, + feed_target_names[4]: ctx_0, + feed_target_names[5]: ctx_p1, + feed_target_names[6]: ctx_p2, + feed_target_names[7]: mark + }, + fetch_list=fetch_targets, + return_numpy=False) print(results[0].lod()) np_data = 
np.array(results[0]) print("Inference Shape: ", np_data.shape) From 284a2137742f63e8a70f4d98805328edc067f054 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Tue, 10 Apr 2018 07:46:03 +0000 Subject: [PATCH 29/62] fix a name conflict --- python/paddle/fluid/layers/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index dbba1a46eb..7b590fb510 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -350,7 +350,7 @@ def open_recordio_file(filename, main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) if for_parallel: - main_prog_var = for_parallel(reader=main_prog_var) + main_prog_var = parallel(reader=main_prog_var) return monkey_patch_reader_methods(main_prog_var) @@ -435,7 +435,7 @@ def open_files(filenames, reader=main_prog_reader, pass_num=pass_num) if for_parallel: - main_prog_reader = for_parallel(reader=main_prog_reader) + main_prog_reader = parallel(reader=main_prog_reader) return monkey_patch_reader_methods(main_prog_reader) @@ -474,7 +474,7 @@ def multi_pass(reader, pass_num): {'pass_num': int(pass_num)}) -def for_parallel(reader): +def parallel(reader): return __create_decorated_reader__('create_threaded_reader', reader) From 0bf799a52388dd77743623dcb2d1ebacb352858b Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 10 Apr 2018 17:00:06 +0800 Subject: [PATCH 30/62] wip testing --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../framework/details/multi_devices_graph_builder.cc | 10 ++++------ .../framework/details/multi_devices_graph_builder.h | 7 ++----- paddle/fluid/framework/parallel_executor.h | 4 ++-- paddle/fluid/operators/detail/serde_test.cc | 2 +- paddle/fluid/pybind/pybind.cc | 1 + python/paddle/fluid/parallel_executor.py | 7 +++++-- 7 files changed, 16 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index caaf418076..85b649b293 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -16,7 +16,7 @@ else() set(multi_devices_graph_builder_deps) endif() cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) + scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 8a28b18715..8a53270110 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -35,22 +35,20 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, bool distributed) + platform::NCCLContextMap *nccl_ctxs) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes), - distributed_(distributed), nccl_ctxs_(nccl_ctxs) { #else MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const 
std::unordered_set ¶ms, - const std::vector &local_scopes, bool distributed) + const std::vector &local_scopes) : loss_var_name_(loss_var_name), places_(places), - local_scopes_(local_scopes), - distributed_(distributed) { + local_scopes_(local_scopes) { #endif for (auto &p : params) { grad_names_.insert(GradVarName(p)); @@ -99,7 +97,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( // append send op if program is distributed trainer main program. // always use the first device - if (is_forwarding && distributed_ && op->Type() == "send") { + if (!is_forwarding && op->Type() == "send") { auto &p = places_[0]; auto *s = local_scopes_[0]; size_t i = 0; diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 004d6d50ab..de34caab1b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -34,14 +34,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, - bool distributed = false); + platform::NCCLContextMap *nccl_ctxs); #else MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, - bool distributed = false); + const std::vector &local_scopes); #endif std::unique_ptr Build(const ProgramDesc &program) const override; @@ -55,7 +53,6 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &places_; const std::vector &local_scopes_; std::unordered_set grad_names_; - bool distributed_; #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c048c3865f..b4f16dba85 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -48,13 +48,13 @@ class ParallelExecutor { const std::string& fetched_var_name, const std::unordered_map& feed_tensors); + void BCastParamsToGPUs(const std::unordered_set& vars) const; + private: void SplitTensorToPlaces( const std::unordered_map& feed_tensors); ParallelExecutorPrivate* member_; - - void BCastParamsToGPUs(const std::unordered_set& vars) const; }; } // namespace framework diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc index f8cae6b26a..cb5f895834 100644 --- a/paddle/fluid/operators/detail/serde_test.cc +++ b/paddle/fluid/operators/detail/serde_test.cc @@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { for (int i = 0; i < tensor_numel; ++i) { EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); } - for (int64_t i = 0; i < rows2->size(); ++i) { + for (size_t i = 0; i < rows2->size(); ++i) { EXPECT_EQ(rows_data2[i], i); } EXPECT_EQ(slr2->height(), 1000); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3924040455..a9a5d87d77 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -554,6 +554,7 @@ All parameter, weight, gradient are variables in Paddle. 
bcast_vars, main_program, loss_var_name, scope, local_scopes, allow_op_delay); }) + .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) .def("local_scopes", [](ParallelExecutor &self) -> std::vector * { return &self.GetLocalScopes(); diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index b93f2f974c..a23cc9b772 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -99,7 +99,7 @@ class ParallelExecutor(object): local_scopes = share_vars_from.executor.local_scopes( ) if share_vars_from else [] - persistable_vars = [ + self.persistable_vars = [ v.name for v in filter(lambda var: var.persistable, main.list_vars()) ] @@ -112,7 +112,7 @@ class ParallelExecutor(object): p.name for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - set(persistable_vars), + set(self.persistable_vars), main.desc, loss_name if loss_name else '', scope, @@ -142,3 +142,6 @@ class ParallelExecutor(object): self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() return [arr[i] for i in range(len(arr))] + + def bcast_params(self): + self.executor.bcast_params(set(self.persistable_vars)) From ad6ddf533cfb1542283f741cddb78835fb3b8658 Mon Sep 17 00:00:00 2001 From: jshower Date: Tue, 10 Apr 2018 09:23:11 +0000 Subject: [PATCH 31/62] for ci --- python/paddle/fluid/tests/book/test_label_semantic_roles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index ace2e39ba4..4d8bca4d24 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -37,7 +37,7 @@ depth = 8 mix_hidden_lr = 1e-3 IS_SPARSE = True -PASS_NUM = 10 +PASS_NUM = 100 BATCH_SIZE = 10 embedding_name = 'emb' @@ -234,7 +234,7 @@ def train(use_cuda, save_dirname=None, is_local=True): print("second per batch: " + str((time.time( ) - start_time) / batch_id)) # Set the threshold low to speed up the CI test - if float(pass_precision) > 0.05: + if float(pass_precision) > 0.01: if save_dirname is not None: # TODO(liuyiqun): Change the target to crf_decode fluid.io.save_inference_model(save_dirname, [ From 40e3fe173ca06a8196c3a24906923833cfb0f372 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 10 Apr 2018 17:46:53 +0800 Subject: [PATCH 32/62] Make cuda_helper.h Pass cpplint --- paddle/fluid/platform/cuda_helper.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 881d611d4a..8758af0804 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512; USE_CUDA_ATOMIC(Add, float); USE_CUDA_ATOMIC(Add, int); USE_CUDA_ATOMIC(Add, unsigned int); -USE_CUDA_ATOMIC(Add, unsigned long long int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT CUDA_ATOMIC_WRAPPER(Add, int64_t) { - static_assert(sizeof(int64_t) == sizeof(long long int), + // Here, we check long long int must be int64_t. 
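  // (The NOLINT markers below silence cpplint's runtime/int warning about the
  //  raw "long long" type, which the CUDA atomics API forces us to use.)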
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT "long long should be int64"); - return CudaAtomicAdd(reinterpret_cast(address), - static_cast(val)); + return CudaAtomicAdd( + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int* address_as_ull = - reinterpret_cast(address); - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int* address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; From ce08dc8751b5f605ce6aece70ce6f16af72f4759 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 10 Apr 2018 20:40:43 +0800 Subject: [PATCH 33/62] have stream removed error --- .../details/multi_devices_graph_builder.cc | 34 ++++++++----------- .../details/multi_devices_graph_builder.h | 2 +- .../fluid/framework/details/send_op_handle.cc | 10 +++--- .../fluid/framework/details/send_op_handle.h | 4 +-- python/paddle/fluid/distribute_transpiler.py | 1 + python/paddle/fluid/parallel_executor.py | 4 ++- 6 files changed, 24 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 8a53270110..0ebcd627bd 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -57,8 +57,11 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p, - const size_t &i) const { + const size_t &i, + bool create_output) const { auto *op_handle = result->ops_.back().get(); + op_handle->dev_ctxes_[p] = const_cast( + platform::DeviceContextPool::Instance().Get(p)); auto var_names = op->InputArgumentNames(); @@ -66,10 +69,12 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op, VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i); op_handle->AddInput(var); } - var_names = op->OutputArgumentNames(); + if (create_output) { + var_names = op->OutputArgumentNames(); - for (auto &each_var_name : var_names) { - CreateOpOutput(result, op_handle, each_var_name, p, i); + for (auto &each_var_name : var_names) { + CreateOpOutput(result, op_handle, each_var_name, p, i); + } } } @@ -100,9 +105,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (!is_forwarding && op->Type() == "send") { auto &p = places_[0]; auto *s = local_scopes_[0]; - size_t i = 0; - result.ops_.emplace_back(new SendOpHandle(*op, s, p)); - CreateOpHandleIOs(&result, op, p, i); + // FIXME(wuyi): send op always copy from GPU 0 + result.ops_.emplace_back(new SendOpHandle(*op, s)); + // Create inputs for output on original place and no ssa output + // is created for send op. 
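      // (the trailing "false" is the new create_output parameter of
      //  CreateOpHandleIOs, so no SSA output vars are registered for send)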
+ CreateOpHandleIOs(&result, op, p, 0, false); continue; } @@ -112,23 +119,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); auto *op_handle = result.ops_.back().get(); - op_handle->dev_ctxes_[p] = const_cast( - platform::DeviceContextPool::Instance().Get(p)); - CreateOpHandleIOs(&result, op, p, i); - // auto var_names = op->InputArgumentNames(); - // for (auto &each_var_name : var_names) { - // VarHandle *var = - // CreateOrGetLatestVarHandle(&result, each_var_name, p, i); - // op_handle->AddInput(var); - // } auto var_names = op->OutputArgumentNames(); - // for (auto &each_var_name : var_names) { - // CreateOpOutput(&result, op_handle, each_var_name, p, i); - // } - if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name_) { // Insert ScaleCost OpHandle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index de34caab1b..137c817fde 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -46,7 +46,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p, - const size_t &i) const; + const size_t &i, bool create_output = true) const; private: std::string loss_var_name_; diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index ae5637b804..caacfa6b1e 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -19,11 +19,9 @@ namespace framework { namespace details { SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc, - const Scope *local_scope, - const platform::Place &place) + const Scope *local_scope) : op_(framework::OpRegistry::CreateOp(op_desc)), - local_scope_(local_scope), - place_(place) {} + local_scope_(local_scope) {} void SendOpHandle::RunImpl() { // Wait input done @@ -31,8 +29,8 @@ void SendOpHandle::RunImpl() { auto &p = static_cast(in)->place_; in->generated_op_->Wait(dev_ctxes_[p]); } - - op_->Run(*local_scope_, place_); + platform::CPUPlace cpu; + op_->Run(*local_scope_, cpu); } std::string SendOpHandle::Name() const { return "send"; } diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h index e7857c1f23..8a7b62ba1c 100644 --- a/paddle/fluid/framework/details/send_op_handle.h +++ b/paddle/fluid/framework/details/send_op_handle.h @@ -31,10 +31,8 @@ namespace details { struct SendOpHandle : public OpHandleBase { std::unique_ptr op_; const Scope* local_scope_; - const platform::Place& place_; - SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const platform::Place& place); + SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope); std::string Name() const override; diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 0ec3ebc7e3..e18ace844e 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -255,6 +255,7 @@ class DistributeTranspiler: def get_trainer_program(self): # remove optimize ops and add a send op to main_program self.program.global_block().delete_ops(self.optimize_ops) + self.program.sync_with_cpp() # FIXME(typhoonzero): serialize once will fix error occurs when clone. 
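        # sync_with_cpp() above keeps the Python-side block in step with the
        # C++ desc after delete_ops(). A rough trainer-side usage sketch
        # (names and arguments are illustrative, not taken from this patch):
        #   t = fluid.DistributeTranspiler()
        #   t.transpile(...)  # fill in trainer/pserver arguments
        #   trainer_prog = t.get_trainer_program()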
self.program.__str__() return self.program diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a23cc9b772..c709f364c1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -101,7 +101,9 @@ class ParallelExecutor(object): self.persistable_vars = [ v.name - for v in filter(lambda var: var.persistable, main.list_vars()) + for v in filter(lambda var: \ + var.persistable and var.type != core.VarDesc.VarType.RAW, + main.list_vars()) ] self.executor = core.ParallelExecutor( From a84b81502cd26c02fbc3b4c46d751b0363a5ff46 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 11 Apr 2018 01:30:56 +0800 Subject: [PATCH 34/62] Remove Readers' HasNext() --- paddle/fluid/framework/lod_tensor.cc | 32 ++++---- paddle/fluid/framework/lod_tensor.h | 7 +- paddle/fluid/framework/lod_tensor_test.cc | 18 ++--- paddle/fluid/framework/reader.h | 13 +--- .../reader/create_double_buffer_reader_op.cc | 38 +++++----- .../reader/create_multi_pass_reader_op.cc | 16 +--- .../reader/create_random_data_generator_op.cc | 4 +- .../reader/create_recordio_file_reader_op.cc | 10 +-- .../reader/create_shuffle_reader_op.cc | 24 +++--- .../reader/create_threaded_reader_op.cc | 75 ++++--------------- .../fluid/operators/reader/open_files_op.cc | 25 ++++--- paddle/fluid/pybind/pybind.cc | 1 - paddle/fluid/pybind/recordio.cc | 2 +- python/paddle/fluid/layers/io.py | 10 +-- 14 files changed, 105 insertions(+), 170 deletions(-) diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 8155cb55a4..a56674cbe2 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/lod_tensor.h" +#include +#include +#include +#include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -22,11 +27,6 @@ limitations under the License. 
*/ #include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/writer.h" -#include -#include -#include -#include - namespace paddle { namespace framework { @@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } -void WriteToRecordIO(recordio::Writer &writer, +void WriteToRecordIO(recordio::Writer *writer, const std::vector &tensor, const platform::DeviceContext &dev_ctx) { std::stringstream buffer; @@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer, for (auto &each : tensor) { SerializeToStream(buffer, each, dev_ctx); } - writer.Write(buffer.str()); + writer->Write(buffer.str()); } std::vector ReadFromRecordIO( - recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) { - std::istringstream sin(scanner.Next()); - uint32_t sz; - sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); + recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) { std::vector result; - result.resize(sz); - for (uint32_t i = 0; i < sz; ++i) { - DeserializeFromStream(sin, &result[i], dev_ctx); + if (scanner->HasNext()) { + std::istringstream sin(scanner->Next()); + uint32_t sz; + sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); + result.resize(sz); + for (uint32_t i = 0; i < sz; ++i) { + DeserializeFromStream(sin, &result[i], dev_ctx); + } } return result; } diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 4f130d2659..1159fee39b 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -15,6 +15,9 @@ limitations under the License. */ #pragma once #include +#include +#include +#include #ifdef PADDLE_WITH_CUDA #include #include @@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, void DeserializeFromStream(std::istream& is, LoDTensor* tensor, const platform::DeviceContext& dev_ctx); -extern void WriteToRecordIO(recordio::Writer& writer, +extern void WriteToRecordIO(recordio::Writer* writer, const std::vector& tensor, const platform::DeviceContext& dev_ctx); extern std::vector ReadFromRecordIO( - recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx); + recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index e691e29383..97ab98f09b 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/lod_tensor.h" - -#include "paddle/fluid/recordio/scanner.h" -#include "paddle/fluid/recordio/writer.h" - #include #include #include #include #include +#include "paddle/fluid/framework/lod_tensor.h" + +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/recordio/writer.h" + namespace paddle { namespace framework { @@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) { *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); { recordio::Writer writer(stream, recordio::Compressor::kSnappy); - WriteToRecordIO(writer, {tensor, tensor}, ctx); - WriteToRecordIO(writer, {tensor, tensor}, ctx); + WriteToRecordIO(&writer, {tensor, tensor}, ctx); + WriteToRecordIO(&writer, {tensor, tensor}, ctx); writer.Flush(); } @@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) { { std::unique_ptr stream_ptr(stream); recordio::Scanner scanner(std::move(stream_ptr)); - auto tensors = ReadFromRecordIO(scanner, ctx); + auto tensors = ReadFromRecordIO(&scanner, ctx); ASSERT_EQ(tensors.size(), 2); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); - tensors = ReadFromRecordIO(scanner, ctx); + tensors = ReadFromRecordIO(&scanner, ctx); ASSERT_EQ(tensors.size(), 2); assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[1]); diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 3573b99bec..3a413941df 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -14,14 +14,13 @@ #pragma once +#include +#include + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" -#include -#include -#include - namespace paddle { namespace framework { @@ -31,8 +30,6 @@ class ReaderBase { virtual void ReInit() = 0; - virtual bool HasNext() const = 0; - virtual ~ReaderBase(); }; @@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase { void ReInit() override { reader_->ReInit(); } - bool HasNext() const override { return reader_->HasNext(); } - protected: ReaderBase* reader_; }; @@ -80,8 +75,6 @@ class ReaderHolder { reader_->ReInit(); } - bool HasNext() const { return reader_->HasNext(); } - private: std::unique_ptr reader_; }; diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index d9f799f14d..33a50b5ceb 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader { StartPrefetcher(); } - bool HasNext() const override; void ReadNext(std::vector* out) override; void ReInit() override; ~DoubleBufferReader() { EndPrefetcher(); } private: + bool HasNext() const; + void StartPrefetcher() { channel_ = framework::MakeChannel(kChannelSize); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); @@ -149,22 +150,15 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; -bool DoubleBufferReader::HasNext() const { - while (!channel_->IsClosed() && !channel_->CanReceive()) { - } - return channel_->CanReceive(); -} - void DoubleBufferReader::ReadNext(std::vector* out) { - if (!HasNext()) { - PADDLE_THROW("There is no next data!"); - } - - Item batch; - channel_->Receive(&batch); - *out = batch.payloads_; - if (batch.ctx_) { - batch.ctx_->Wait(); + out->clear(); + if (HasNext()) { + Item batch; + channel_->Receive(&batch); + *out = batch.payloads_; + if (batch.ctx_) { 
+ batch.ctx_->Wait(); + } } } @@ -174,16 +168,26 @@ void DoubleBufferReader::ReInit() { StartPrefetcher(); } +bool DoubleBufferReader::HasNext() const { + while (!channel_->IsClosed() && !channel_->CanReceive()) { + } + return channel_->CanReceive(); +} + void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; std::vector> cpu_tensor_cache(kCacheSize); std::vector> gpu_tensor_cache(kCacheSize); size_t cached_tensor_id = 0; - while (reader_->HasNext()) { + while (true) { Item batch; auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; reader_->ReadNext(&cpu_batch); + if (cpu_batch.empty()) { + // The underlying reader have no next data. + break; + } if (platform::is_gpu_place(place_)) { auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; auto* gpu_ctx = ctxs_[cached_tensor_id].get(); diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index b72ccc77a3..0573345ba5 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader { : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} void ReadNext(std::vector* out) override { - if (!HasNext()) { - PADDLE_THROW("There is no next data!"); - } reader_->ReadNext(out); - } - - bool HasNext() const override { - if (reader_->HasNext()) { - return true; - } else { + if (out->empty()) { ++pass_count_; - if (pass_count_ >= pass_num_) { - return false; - } else { + if (pass_count_ < pass_num_) { reader_->ReInit(); - return true; + reader_->ReadNext(out); } } } diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc index 95d8674c08..d1cb8e47da 100644 --- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase { void ReInit() override { return; } - bool HasNext() const override { return true; } - private: float min_; float max_; @@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase { const auto& ranks = Attr>("ranks"); PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - int(shape_concat.size()), + static_cast(shape_concat.size()), "The accumulate of all ranks should be equal to the " "shape concat's length."); std::vector shapes = RestoreShapes(shape_concat, ranks); diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index adaa0b9e5f..2ae2972556 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include #include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/recordio/scanner.h" @@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader { LOG(INFO) << "Creating file reader" << filename; } - bool HasNext() const override { return scanner_.HasNext(); } - void ReInit() override { scanner_.Reset(); } protected: void ReadNextImpl(std::vector* out) override { if (ThreadSafe) { std::lock_guard guard(*mutex_); - *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); + *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_); } else { - *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); + *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_); } } @@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { const auto& ranks = Attr>("ranks"); PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - int(shape_concat.size()), + static_cast(shape_concat.size()), "The accumulate of all ranks should be equal to the " "shape concat's length."); std::string filename = Attr("filename"); diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index b164ce232d..13825d6591 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader { std::random_device device; seed_ = device(); } - ReadIntoBuffers(); + ReloadBuffer(); } void ReadNext(std::vector* out) override { - if (!HasNext()) { - PADDLE_THROW("There is no next data!"); - } + out->clear(); if (iteration_pos_ >= buffer_.size()) { VLOG(10) << "Resetting shuffle buffer"; - ReadIntoBuffers(); + ReloadBuffer(); + if (buffer_.empty()) { + return; + } } *out = buffer_[iteration_pos_++]; } - bool HasNext() const override { - return iteration_pos_ < buffer_.size() || reader_->HasNext(); - } - private: - void ReadIntoBuffers() { + void ReloadBuffer() { buffer_.clear(); buffer_.reserve(buffer_size_); iteration_pos_ = 0; for (size_t i = 0; i < buffer_size_; ++i) { - if (!reader_->HasNext()) { + std::vector ins; + reader_->ReadNext(&ins); + if (ins.empty()) { break; } - buffer_.emplace_back(); - reader_->ReadNext(&buffer_.back()); + buffer_.emplace_back(ins); } std::mt19937 g(seed_); std::shuffle(buffer_.begin(), buffer_.end(), g); diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc index 7b10135afc..cbf709d5e7 100644 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc @@ -21,67 +21,27 @@ namespace reader { class ThreadedReader : public framework::DecoratedReader { public: - ThreadedReader(ReaderBase* reader, bool unsafe_mode) - : DecoratedReader(reader), unsafe_mode_(unsafe_mode) {} + ThreadedReader(ReaderBase* reader, bool safe_mode) + : DecoratedReader(reader), safe_mode_(safe_mode) {} void ReadNext(std::vector* out) override { std::lock_guard lock(mutex_); - if (!unsafe_mode_) { - if (!reader_->HasNext()) { - PADDLE_THROW("There is no next data!"); - } - reader_->ReadNext(out); - } else { - auto& thread_buffer = thread_buffers_[std::this_thread::get_id()]; - if (thread_buffer.empty()) { - PADDLE_THROW( - "thread_buffer is empty! 
HasNext() must be invoked before " - "ReadNext() in the same thread."); - } - *out = thread_buffer; - thread_buffer.clear(); - } - } - - bool HasNext() const override { - if (!unsafe_mode_) { - PADDLE_THROW( - "ThreadedReader::HasNext() is disabled when 'unsafe_mode' is false."); - } - std::thread::id thread_id = std::this_thread::get_id(); - std::lock_guard lock(mutex_); - auto& thread_buffer = thread_buffers_[thread_id]; - if (thread_buffer.empty() && reader_->HasNext()) { - reader_->ReadNext(&thread_buffer); - } - return !thread_buffer.empty(); + reader_->ReadNext(out); } void ReInit() override { - if (!unsafe_mode_) { + if (safe_mode_) { PADDLE_THROW( - "ThreadedReader::ReInit() is disabled when 'unsafe_mode' is false."); + "ThreadedReader::ReInit() is disabled when 'safe_mode' is true."); } VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in " "multi-thread environment."; reader_->ReInit(); } - ~ThreadedReader() { - for (auto& p : thread_buffers_) { - if (!p.second.empty()) { - PADDLE_THROW( - "Find an unused data batch in ThreadedReader! Maybe one thread " - "invokes 'HasNext()' without subsequent 'ReadNext()'."); - } - } - } - private: - bool unsafe_mode_; - mutable std::mutex mutex_; - mutable std::unordered_map> - thread_buffers_; + bool safe_mode_; + std::mutex mutex_; }; class CreateThreadedReaderOp : public framework::OperatorBase { @@ -98,8 +58,8 @@ class CreateThreadedReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - bool unsafe_mode = Attr("unsafe_mode"); - out->Reset(new ThreadedReader(underlying_reader.Get(), unsafe_mode)); + bool safe_mode = Attr("safe_mode"); + out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode)); } }; @@ -107,10 +67,9 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { public: CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) : DecoratedReaderMakerBase(op_proto, op_checker) { - AddAttr("unsafe_mode", - "When 'unsafe_mode' is false, invoking 'HasNext()' or " - "'ReInit()' is not allowed to avoid unexpected bugs in " - "multi-thread environment.") + AddAttr("safe_mode", + "When 'safe_mode' is true, 'ReInit()' is disabled to avoid " + "unexpected bugs in multi-thread environment.") .SetDefault(true); AddComment(R"DOC( CreateThreadedReader Operator @@ -118,13 +77,9 @@ class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { This operator creates a threaded reader. A threaded reader's 'ReadNext()' can be invoked by several threads at the same time. - When the attribute 'unsafe_mode' is false, the threaded reader's - 'HasNext()' and 'ReInit()' will be disabled to avoid unexpected - bugs in multi-thread environment. If you really need them, you - can enable them by setting 'unsafe_mode' true. In this case, - 'HasNext()' returning true only guarantees the safety of - invoking 'ReadNext()' in the same thread. Each thread must - invoke 'HasNext()' and 'ReadNext()' in pairs. + When the attribute 'safe_mode' is true, the threaded reader's + 'ReInit()' is disabled to avoid unexpected bugs in multi-thread + environment. 
)DOC"); } }; diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 45db94e780..9ce2e5dc2c 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -30,12 +30,12 @@ class MultiFileReader : public framework::ReaderBase { } void ReadNext(std::vector* out) override; - bool HasNext() const override; void ReInit() override; ~MultiFileReader() { EndScheduler(); } private: + bool HasNext(); void StartNewScheduler(); void EndScheduler(); void ScheduleThreadFunc(); @@ -52,16 +52,10 @@ class MultiFileReader : public framework::ReaderBase { }; void MultiFileReader::ReadNext(std::vector* out) { - if (!HasNext()) { - PADDLE_THROW("There is no next data!"); + out->clear(); + if (HasNext()) { + buffer_->Receive(out); } - buffer_->Receive(out); -} - -bool MultiFileReader::HasNext() const { - while (!buffer_->IsClosed() && !buffer_->CanReceive()) { - } - return buffer_->CanReceive(); } void MultiFileReader::ReInit() { @@ -69,6 +63,12 @@ void MultiFileReader::ReInit() { StartNewScheduler(); } +bool MultiFileReader::HasNext() { + while (!buffer_->IsClosed() && !buffer_->CanReceive()) { + } + return buffer_->CanReceive(); +} + void MultiFileReader::StartNewScheduler() { size_t thread_num = prefetchers_.size(); waiting_file_idx_ = framework::MakeChannel(file_names_.size()); @@ -140,9 +140,12 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name, VLOG(5) << "The prefetch thread of file '" << file_name << "' starts."; std::unique_ptr reader = CreateReaderByFileName(file_name, dims_); - while (reader->HasNext()) { + while (true) { std::vector ins; reader->ReadNext(&ins); + if (ins.empty()) { + break; + } try { buffer_->Send(&ins); } catch (paddle::platform::EnforceNotMet e) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index bd8446df66..c7a5d1c714 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") - .def("has_next", &framework::ReaderHolder::HasNext) .def("reset", &framework::ReaderHolder::ReInit); py::class_(m, "Scope", "") diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc index 0644d91425..330d104e0a 100644 --- a/paddle/fluid/pybind/recordio.cc +++ b/paddle/fluid/pybind/recordio.cc @@ -39,7 +39,7 @@ class RecordIOWriter { void CompleteAppendTensor() { auto& ctx = *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); - framework::WriteToRecordIO(writer_, tensors_, ctx); + framework::WriteToRecordIO(&writer_, tensors_, ctx); tensors_.clear(); } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d016ab9008..8ba6bd18e9 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -236,13 +236,9 @@ def monkey_patch_reader_methods(reader): var = scope.find_var(reader.name) return var.get_reader() - def eof(): - return not __get_reader__().has_next() - def reset(): return __get_reader__().reset() - reader.eof = eof reader.reset = reset reader.stop_gradient = True reader.persistable = True @@ -299,8 +295,7 @@ def open_recordio_file(filename, shapes(list): List of tuples which declaring data shapes. lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. 
After completing the - given number of passes, 'has_next()' will return False. + pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. @@ -377,8 +372,7 @@ def open_files(filenames, dtypes(list): List of strs which declaring data type. thread_num(int): The maximal concurrent prefetch thread number. buffer_size(int): The size of prefetch buffer. - pass_num(int): Number of passes to run. After completing the - given number of passes, 'has_next()' will return False. + pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. From 7a7829466664aff8a41364d566b852ca5859bed2 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 11 Apr 2018 01:37:24 +0800 Subject: [PATCH 35/62] Remove Readers' HasNext() --- paddle/fluid/operators/reader/open_files_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 9ce2e5dc2c..779dc8a6a0 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include // NOLINT + #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" From a7c6bf771c493cc9031975ceabcb126ef9ed1188 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 11 Apr 2018 09:53:56 +0800 Subject: [PATCH 36/62] Change do_model_average_for_mean_and_var to boolean in batch_normal. --- python/paddle/fluid/layers/nn.py | 3 --- python/paddle/fluid/optimizer.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 37ce738275..56c37f05cc 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1518,9 +1518,6 @@ def batch_norm(input, bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) - if do_model_average_for_mean_and_var: - do_model_average_for_mean_and_var = None - mean = helper.create_parameter( attr=ParamAttr( name=moving_mean_name, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1917b7d044..36503cac6d 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -853,7 +853,8 @@ class ModelAverage(Optimizer): self.params_grads = [] if params_grads is None else params_grads params = {} for param, grad in self.params_grads: - params[param.name] = (param, grad) + if param.do_model_average != False: + params[param.name] = (param, grad) for param in framework.default_main_program().global_block( ).all_parameters(): if param.name not in params and param.do_model_average != False: From 72b5de05fee1c94c7bd40c8b69ff8c4fe2aff7d9 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Wed, 11 Apr 2018 02:54:56 +0000 Subject: [PATCH 37/62] update unittest --- paddle/fluid/operators/read_op.cc | 1 + .../tests/unittests/test_parallel_executor.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 4496110cf8..bf02b99589 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -66,6 +66,7 @@ class ReadOp : public framework::OperatorBase { std::vector out_arg_names = Outputs("Out"); 
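With HasNext()/eof() gone from the reader stack, a pass now ends when ReadNext() hands back an empty batch and the read_op in this hunk raises "There is no next data.". A minimal Python sketch of the loop shape the updated unittests in this series use; it assumes ./mnist.recordio already exists, and the small classifier is only there so something can be fetched:

    import paddle.fluid as fluid

    data_file = fluid.layers.open_recordio_file(
        filename='./mnist.recordio',
        shapes=[[-1, 784], [-1, 1]],
        lod_levels=[0, 0],
        dtypes=['float32', 'int64'])
    img, label = fluid.layers.read_file(data_file)
    prediction = fluid.layers.fc(input=img, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_loss = fluid.layers.mean(x=loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    while True:  # formerly: while not data_file.eof()
        try:
            loss_np, = exe.run(fetch_list=[avg_loss])
        except fluid.core.EnforceNotMet as ex:
            # raised by read_op once the underlying reader is exhausted
            assert "There is no next data." in ex.message
            break
    data_file.reset()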
std::vector ins; reader->ReadNext(&ins); + PADDLE_ENFORCE(!ins.empty(), "There is no next data."); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < ins.size(); ++i) { auto* out = diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 8401716db8..3c00f708f0 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -26,11 +26,14 @@ def simple_fc_net(use_feed): img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') else: - reader = fluid.layers.open_recordio_file( - filename='./mnist.recordio', + reader = fluid.layers.open_files( + filenames=['./mnist.recordio'], shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64']) + dtypes=['float32', 'int64'], + thread_num=1, + for_parallel=True) + reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) hidden = img for _ in xrange(4): @@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed): img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') else: - reader = fluid.layers.open_recordio_file( - filename='./mnist.recordio', + reader = fluid.layers.open_files( + filenames=['mnist.recordio'], shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], - dtypes=['float32', 'int64']) + dtypes=['float32', 'int64'], + thread_num=1, + for_parallel=True) + reader = fluid.layers.io.double_buffer(reader) img, label = fluid.layers.read_file(reader) hidden = img From 129859e732fa7ac056c4c453619b2c84c98bc0ac Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Wed, 11 Apr 2018 12:34:46 +0800 Subject: [PATCH 38/62] Support data type int64 in NCCL. 
(#9818) --- paddle/fluid/platform/nccl_helper.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 2999004320..3a2a423486 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -14,8 +14,9 @@ #pragma once -#include +#include // NOLINT #include +#include #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" @@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { return ncclDouble; } else if (type == typeid(int)) { // NOLINT return ncclInt; + } else if (type == typeid(int64_t)) { // NOLINT + return ncclInt64; } else { PADDLE_THROW("Not supported"); } @@ -66,23 +69,23 @@ struct NCCLContext { return boost::get(ctx_->GetPlace()).device; } - static void InitNCCLContext(std::unordered_map &contexts, + static void InitNCCLContext(std::unordered_map *contexts, const std::vector &places) { std::vector comms; std::vector devs; - comms.resize(contexts.size()); - devs.reserve(contexts.size()); + comms.resize(contexts->size()); + devs.reserve(contexts->size()); for (auto &p : places) { devs.push_back(boost::get(p).device); } PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( - &comms[0], static_cast(contexts.size()), &devs[0])); + &comms[0], static_cast(contexts->size()), &devs[0])); int i = 0; for (auto &dev_id : devs) { - contexts.at(dev_id).comm_ = comms[i++]; + contexts->at(dev_id).comm_ = comms[i++]; } } }; @@ -91,7 +94,7 @@ struct NCCLContextMap { std::unordered_map contexts_; std::vector order_; - NCCLContextMap(const std::vector &places) { + explicit NCCLContextMap(const std::vector &places) { order_.reserve(places.size()); for (auto &p : places) { int dev_id = boost::get(p).device; From 38f86769892568981693d46cd3a7a77f5bf21b04 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 11 Apr 2018 12:37:53 +0800 Subject: [PATCH 39/62] remove unused nccl.cmake --- cmake/external/nccl.cmake | 67 --------------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 cmake/external/nccl.cmake diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake deleted file mode 100644 index af5c689c35..0000000000 --- a/cmake/external/nccl.cmake +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT WITH_GPU) - return() -endif() - -include(ExternalProject) - -set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl) - -include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src) - -if(WITH_DSO) - # If we use DSO, we do not build nccl, just download the dependencies - set(NCCL_BUILD_COMMAND "") - set(NCCL_INSTALL_COMMAND "") - set(NCCL_INSTALL_DIR "") -else() - # otherwise, we build nccl and link it. 
- set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl) - # Note: cuda 8.0 is needed to make nccl - # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root - set(NCCL_BUILD_COMMAND "make -j 8") - set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}") -endif() - -ExternalProject_Add( - extern_nccl - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git" - GIT_TAG "v1.3.4-1" - PREFIX "${NCCL_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "${NCCL_BUILD_COMMAND}" - INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}" - INSTALL_DIR "${NCCL_INSTALL_DIR}" - TEST_COMMAND "" -) - -if(WITH_DSO) - if(${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c) - file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";") - add_library(nccl STATIC ${dummyfile}) - else() - add_library(nccl INTERFACE) - endif() -else() - add_library(nccl STATIC IMPORTED GLOBAL) - set_property(TARGET nccl PROPERTY IMPORTED_LOCATION - ${NCCL_INSTALL_DIR}/lib/libnccl_static.a) -endif() - -add_dependencies(nccl extern_nccl) From 273f4892b21ff8e17fba300071943846e06b75cf Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Wed, 11 Apr 2018 04:52:08 +0000 Subject: [PATCH 40/62] update recordio unittest --- paddle/fluid/framework/reader.cc | 4 +++- .../fluid/tests/unittests/test_recordio_reader.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 56bf00e5f9..76126f3dc6 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector &dims) : dims_(dims) {} void FileReader::ReadNext(std::vector *out) { ReadNextImpl(out); - PADDLE_ENFORCE_EQ(out->size(), dims_.size()); + if (out->empty()) { + return; + } for (size_t i = 0; i < dims_.size(); ++i) { auto &actual = out->at(i).dims(); auto &expect = dims_[i]; diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 096d99a3f3..2982cb8ceb 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -65,8 +65,14 @@ class TestRecordIO(unittest.TestCase): # train a pass batch_id = 0 - while not data_file.eof(): - tmp, = exe.run(fetch_list=[avg_loss]) + while True: + ex = None + try: + tmp, = exe.run(fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + self.assertIn("There is no next data.", ex.message) + break + avg_loss_np.append(tmp) batch_id += 1 data_file.reset() From b1cc28dab34c2fb5b4eeb9446ee3b066e86fac71 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Wed, 11 Apr 2018 04:54:14 +0000 Subject: [PATCH 41/62] update --- python/paddle/fluid/tests/unittests/test_recordio_reader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 2982cb8ceb..7c8e7f634f 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -66,7 +66,6 @@ class TestRecordIO(unittest.TestCase): # train a pass batch_id = 0 while True: - ex = None try: tmp, = exe.run(fetch_list=[avg_loss]) except fluid.core.EnforceNotMet as ex: From 0dacbbe1fe79d7643ce56c6ee630b8fe1270990b Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 
11 Apr 2018 14:07:58 +0800 Subject: [PATCH 42/62] update multi_pass_reader unittest --- .../fluid/tests/unittests/test_multi_pass_reader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py index c8a8afbea6..1471843ded 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py @@ -57,8 +57,12 @@ class TestMultipleReader(unittest.TestCase): exe.run(fluid.default_startup_program()) batch_count = 0 - while not data_file.eof(): - img_val, = exe.run(fetch_list=[img]) + while True: + try: + img_val, = exe.run(fetch_list=[img]) + except fluid.core.EnforceNotMet as ex: + self.assertIn("There is no next data.", ex.message) + break batch_count += 1 self.assertLessEqual(img_val.shape[0], self.batch_size) data_file.reset() From 8c1eb8693e58b2d516eb5bba1ed966ee81bf6cbf Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Wed, 11 Apr 2018 06:12:35 +0000 Subject: [PATCH 43/62] update unittest --- ...{test_multiple_reader.py => test_multi_file_reader.py} | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) rename python/paddle/fluid/tests/unittests/{test_multiple_reader.py => test_multi_file_reader.py} (91%) diff --git a/python/paddle/fluid/tests/unittests/test_multiple_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py similarity index 91% rename from python/paddle/fluid/tests/unittests/test_multiple_reader.py rename to python/paddle/fluid/tests/unittests/test_multi_file_reader.py index a60a5d6c4a..5dc41e54d6 100644 --- a/python/paddle/fluid/tests/unittests/test_multiple_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py @@ -61,8 +61,12 @@ class TestMultipleReader(unittest.TestCase): exe.run(fluid.default_startup_program()) batch_count = 0 - while not data_files.eof(): - img_val, = exe.run(fetch_list=[img]) + while True: + try: + img_val, = exe.run(fetch_list=[img]) + except fluid.core.EnforceNotMet as ex: + self.assertIn("There is no next data.", ex.message) + break batch_count += 1 self.assertLessEqual(img_val.shape[0], self.batch_size) data_files.reset() From c64190ecbb211c09054b0ffea25179fdcad50207 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 11 Apr 2018 14:44:22 +0800 Subject: [PATCH 44/62] Polish NCCLHelper --- paddle/fluid/platform/nccl_helper.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 3a2a423486..ca9ab2c7ae 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -61,7 +61,7 @@ struct NCCLContext { ncclComm_t comm_; explicit NCCLContext(int dev_id) - : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {} + : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} cudaStream_t stream() const { return ctx_->stream(); } @@ -95,6 +95,7 @@ struct NCCLContextMap { std::vector order_; explicit NCCLContextMap(const std::vector &places) { + PADDLE_ENFORCE(!places.empty()); order_.reserve(places.size()); for (auto &p : places) { int dev_id = boost::get(p).device; @@ -105,15 +106,17 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - std::vector comms; - comms.resize(order_.size()); + if (places.size() > 1) { + std::vector comms; + comms.resize(order_.size()); - 
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( - &comms[0], static_cast(order_.size()), &order_[0])); + PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( + &comms[0], static_cast(order_.size()), &order_[0])); - int i = 0; - for (auto &dev_id : order_) { - contexts_.at(dev_id).comm_ = comms[i++]; + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } } } From 16a9dfe4805fa88670338b52bf898f60043fc16f Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 11 Apr 2018 15:12:58 +0800 Subject: [PATCH 45/62] finish --- .../details/multi_devices_graph_builder.cc | 16 +++++++--------- .../details/multi_devices_graph_builder.h | 2 +- paddle/fluid/framework/details/send_op_handle.cc | 12 ++++++++---- paddle/fluid/framework/details/send_op_handle.h | 4 +++- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 0ebcd627bd..e0dd9e6068 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -57,8 +57,7 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p, - const size_t &i, - bool create_output) const { + const size_t &i) const { auto *op_handle = result->ops_.back().get(); op_handle->dev_ctxes_[p] = const_cast( platform::DeviceContextPool::Instance().Get(p)); @@ -69,12 +68,11 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op, VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i); op_handle->AddInput(var); } - if (create_output) { - var_names = op->OutputArgumentNames(); - for (auto &each_var_name : var_names) { - CreateOpOutput(result, op_handle, each_var_name, p, i); - } + var_names = op->OutputArgumentNames(); + + for (auto &each_var_name : var_names) { + CreateOpOutput(result, op_handle, each_var_name, p, i); } } @@ -106,10 +104,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( auto &p = places_[0]; auto *s = local_scopes_[0]; // FIXME(wuyi): send op always copy from GPU 0 - result.ops_.emplace_back(new SendOpHandle(*op, s)); + result.ops_.emplace_back(new SendOpHandle(*op, s, p)); // Create inputs for output on original place and no ssa output // is created for send op. 
- CreateOpHandleIOs(&result, op, p, 0, false); + CreateOpHandleIOs(&result, op, p, 0); continue; } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 137c817fde..de34caab1b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -46,7 +46,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p, - const size_t &i, bool create_output = true) const; + const size_t &i) const; private: std::string loss_var_name_; diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index caacfa6b1e..d181607e86 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -19,18 +19,22 @@ namespace framework { namespace details { SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc, - const Scope *local_scope) + const Scope *local_scope, + const platform::Place &place) : op_(framework::OpRegistry::CreateOp(op_desc)), - local_scope_(local_scope) {} + local_scope_(local_scope), + place_(place) {} void SendOpHandle::RunImpl() { // Wait input done for (auto *in : inputs_) { auto &p = static_cast(in)->place_; + if (in->DebugString() == "dummy") { // HACK + continue; + } in->generated_op_->Wait(dev_ctxes_[p]); } - platform::CPUPlace cpu; - op_->Run(*local_scope_, cpu); + op_->Run(*local_scope_, place_); } std::string SendOpHandle::Name() const { return "send"; } diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h index 8a7b62ba1c..e7857c1f23 100644 --- a/paddle/fluid/framework/details/send_op_handle.h +++ b/paddle/fluid/framework/details/send_op_handle.h @@ -31,8 +31,10 @@ namespace details { struct SendOpHandle : public OpHandleBase { std::unique_ptr op_; const Scope* local_scope_; + const platform::Place& place_; - SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope); + SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, + const platform::Place& place); std::string Name() const override; From d1e63a1d9205e99483a3b69058fdf36e54dc348e Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 11 Apr 2018 15:18:55 +0800 Subject: [PATCH 46/62] fix ci --- paddle/fluid/framework/details/send_op_handle.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h index e7857c1f23..173f9d7261 100644 --- a/paddle/fluid/framework/details/send_op_handle.h +++ b/paddle/fluid/framework/details/send_op_handle.h @@ -22,7 +22,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { From d52fa26fdab7a0497a3e7f49833d1b3827955c44 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 11 Apr 2018 17:03:39 +0800 Subject: [PATCH 47/62] Feature/metrics (#9791) * "add metrics" * "add fluid metrics" * "add import guards" * "show warnings" * "add demo" * "fix ci" * "add some details" * "fix cci" * "add demo Python" * "add metrics" --- benchmark/fluid/mnist.py | 7 +- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/average.py | 6 + python/paddle/fluid/evaluator.py | 4 + 
python/paddle/fluid/layers/metric.py | 37 ++- python/paddle/fluid/metrics.py | 378 +++++++++++++++++++++++++++ 6 files changed, 427 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/metrics.py diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py index 43866da9cb..dc10ac2ec1 100644 --- a/benchmark/fluid/mnist.py +++ b/benchmark/fluid/mnist.py @@ -139,9 +139,6 @@ def run_benchmark(model, args): # inference program inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - target_vars=[batch_acc, batch_size_tensor]) # Optimization opt = fluid.optimizer.AdamOptimizer( @@ -161,7 +158,7 @@ def run_benchmark(model, args): train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=args.batch_size) - accuracy = fluid.average.WeightedAverage() + accuracy = fluid.metrics.Accuracy() iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): accuracy.reset() @@ -184,7 +181,7 @@ def run_benchmark(model, args): "label": y_data}, fetch_list=[avg_cost, batch_acc, batch_size_tensor] ) # The accuracy is the accumulation of batches, but not the current batch. - accuracy.add(value=outs[1], weight=outs[2]) + accuracy.update(value=outs[1], weight=outs[2]) iters += 1 num_samples += len(y_data) loss = np.array(outs[0]) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a5a3884750..f757411b85 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -29,6 +29,7 @@ import optimizer import backward import regularizer import average +import metrics from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index ded6eb0859..6abe8233b0 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import warnings """ Class of all kinds of Average. @@ -22,6 +23,8 @@ import numpy as np wrappers of Python functions. """ +__all__ = ["WeightedAverage"] + def _is_number_(var): return isinstance(var, int) or isinstance(var, float) or (isinstance( @@ -34,6 +37,9 @@ def _is_number_or_matrix_(var): class WeightedAverage(object): def __init__(self): + warnings.warn( + "The %s is deprecated, please use fluid.metrics.Accuracy instead." % + (self.__class__.__name__), Warning) self.reset() def reset(self): diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 19e5b61b0b..13475025b5 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings import numpy as np import layers @@ -59,6 +60,9 @@ class Evaluator(object): """ def __init__(self, name, **kwargs): + warnings.warn( + "The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead." 
+ % (self.__class__.__name__, self.__class__.__name__), Warning) self.states = [] self.metrics = [] self.helper = LayerHelper(name, **kwargs) diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py index 3d9157ad4e..f66dccfa2d 100644 --- a/python/paddle/fluid/layers/metric.py +++ b/python/paddle/fluid/layers/metric.py @@ -15,12 +15,13 @@ All layers just related to metric. """ +import warnings from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -__all__ = ['accuracy'] +__all__ = ['accuracy', 'auc'] def accuracy(input, label, k=1, correct=None, total=None): @@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None): "Total": [total], }) return acc_out + + +def auc(input, label, curve='ROC', num_thresholds=200): + warnings.warn( + "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \ + but can not aggregate them and get the pass AUC, because pass \ + auc can not be averaged with weighted from the minibatch auc value. \ + Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \ + which can get every minibatch and every pass auc value.", Warning) + helper = LayerHelper("auc", **locals()) + topk_out = helper.create_tmp_variable(dtype=input.dtype) + topk_indices = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="top_k", + inputs={"X": [input]}, + outputs={"Out": [topk_out], + "Indices": [topk_indices]}, + attrs={"k": k}) + auc_out = helper.create_tmp_variable(dtype="float32") + if correct is None: + correct = helper.create_tmp_variable(dtype="int64") + if total is None: + total = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="accuracy", + inputs={ + "Out": [topk_out], + "Indices": [topk_indices], + "Label": [label] + }, + attrs={"curve": curve, + "num_thresholds": num_thresholds}, + outputs={"AUC": [auc_out], }) + return auc_out diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py new file mode 100644 index 0000000000..99a81c1d42 --- /dev/null +++ b/python/paddle/fluid/metrics.py @@ -0,0 +1,378 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fluid Metrics + +The metrics are accomplished via Python natively. +""" +import numpy as np +import copy +import warnings + +__all__ = [ + 'MetricBase', + 'CompositeMetric', + 'Accuracy', + 'ChunkEvaluator', + 'EditDistance', + 'DetectionMAP', + 'Auc', +] + + +def _is_numpy_(var): + return isinstance(var, (np.ndarray, np.generic)) + + +def _is_number_(var): + return isinstance(var, int) or isinstance(var, float) or (isinstance( + var, np.ndarray) and var.shape == (1, )) + + +def _is_number_or_matrix_(var): + return _is_number_(var) or isinstance(var, np.ndarray) + + +class MetricBase(object): + """ + Base Class for all evaluators + + Args: + name(str): The name of evaluator. such as, "accuracy". 
Used for generate + temporary variable name. + Interface: + Note(*) : the states is the attributes who not has _ prefix. + + get_config(): print current states and configuration + reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray), + Please override this method. + update(): update states at every minibatch + eval(): get metric evaluation in numpy type. + """ + + def __init__(self, name, **kwargs): + self._name = str(name) if name != None else self.__class__.__name__ + self._kwargs = kwargs if kwargs != None else dict() + self.reset() + + def __str__(self): + return self._name + + def reset(self): + """ + states is the attributes who not has _ prefix. + reset the states of metrics. + """ + states = { + attr: value + for attr, value in self.__dict__.iteritems() + if not attr.startswith("_") + } + for attr, value in states.iteritems(): + if isinstance(value, int): + setattr(self, attr, 0) + elif isinstance(value, float): + setattr(self, attr, .0) + elif isinstance(value, (np.ndarray, np.generic)): + setattr(self, attr, np.zeros_like(value)) + else: + setattr(self, attr, None) + + def get_config(self): + states = { + attr: value + for attr, value in self.__dict__.iteritems() + if not attr.startswith("_") + } + config = copy.deepcopy(self._kwargs) + config.update({"name": self._name, "states": copy.deepcopy(states)}) + return config + + def update(self): + raise NotImplementedError() + + def eval(self): + raise NotImplementedError() + + +class CompositeMetric(MetricBase): + """ + Compute multiple metrics in each minibatch. + for example, merge F1, accuracy, recall into one Metric. + """ + + def __init__(self, name=None, **kwargs): + super(CompositeMetric, self).__init__(name, kwargs) + self._metrics = [] + + def add_metric(self, metric): + if not isinstance(metric, MetricBase): + raise ValueError("SubMetric should be inherit from MetricBase.") + self._metrics.append(metric) + + def eval(self): + ans = [] + for m in self._metrics: + ans.append(m.eval()) + return ans + + +class Accuracy(MetricBase): + """ + Accumulate the accuracy from minibatches and compute the average accuracy + for every pass. + + Args: + name: the metrics name + + Example: + minibatch_accuracy = fluid.layers.accuracy(pred, label) + accuracy_evaluator = fluid.metrics.Accuracy() + for epoch in PASS_NUM: + accuracy_evaluator.reset() + for data in batches: + loss = exe.run(fetch_list=[cost, minibatch_accuracy]) + accuracy_evaluator.update(value=minibatch_accuracy, weight=batches) + accuracy = accuracy_evaluator.eval() + """ + + def __init__(self, name=None): + super(Accuracy, self).__init__(name) + self.value = .0 + self.weight = .0 + + def update(self, value, weight): + if not _is_number_or_matrix_(value): + raise ValueError( + "The 'value' must be a number(int, float) or a numpy ndarray.") + if not _is_number_(weight): + raise ValueError("The 'weight' must be a number(int, float).") + self.value += value * weight + self.weight += weight + + def eval(self): + if self.weight == 0: + raise ValueError( + "There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy." + ) + return self.value / self.weight + + +class ChunkEvalutor(MetricBase): + """ + Accumulate counter numbers output by chunk_eval from mini-batches and + compute the precision recall and F1-score using the accumulated counter + numbers. 
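A minimal usage sketch for this metric; it assumes the three counters are numpy values fetched from a fluid.layers.chunk_eval program and uses plain integers here only to stay self-contained. Note the class name below follows the definition above, while __all__ spells it 'ChunkEvaluator':

    import paddle.fluid as fluid

    chunk_metric = fluid.metrics.ChunkEvalutor()
    # accumulate the counters over mini-batches, evaluate once per pass
    chunk_metric.update(num_infer_chunks=10, num_label_chunks=9, num_correct_chunks=8)
    chunk_metric.update(num_infer_chunks=12, num_label_chunks=11, num_correct_chunks=10)
    precision, recall, f1 = chunk_metric.eval()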
+ """ + + def __init__(self, name=None): + super(ChunkEvalutor, self).__init__(name) + self.num_infer_chunks = 0 + self.num_label_chunks = 0 + self.num_correct_chunks = 0 + + def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): + if not _is_number_or_matrix_(num_infer_chunks): + raise ValueError( + "The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray." + ) + if not _is_number_or_matrix_(num_label_chunks): + raise ValueError( + "The 'num_label_chunks' must be a number(int, float) or a numpy ndarray." + ) + if not _is_number_or_matrix_(num_correct_chunks): + raise ValueError( + "The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray." + ) + self.num_infer_chunks += num_infer_chunks + self.num_label_chunks += num_label_chunks + self.num_correct_chunks += num_correct_chunks + + def eval(self): + precision = float( + self.num_correct_chunks + ) / self.num_infer_chunks if self.num_infer_chunks else 0 + recall = float(self.num_correct_chunks + ) / self.num_label_chunks if self.num_label_chunks else 0 + f1_score = float(2 * precision * recall) / ( + precision + recall) if self.num_correct_chunks else 0 + return precision, recall, f1_score + + +class EditDistance(MetricBase): + """ + Accumulate edit distance sum and sequence number from mini-batches and + compute the average edit_distance and instance error of all batches. + + Args: + name: the metrics name + + Example: + edit_distance_metrics = fluid.layers.edit_distance(input, label) + distance_evaluator = fluid.metrics.EditDistance() + for epoch in PASS_NUM: + distance_evaluator.reset() + for data in batches: + loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) + distance_evaluator.update(*edit_distance_metrics) + distance, instance_error = distance_evaluator.eval() + + In the above example: + 'distance' is the average of the edit distance in a pass. + 'instance_error' is the instance error rate in a pass. + + """ + + def __init__(self, name): + super(EditDistance, self).__init__(name) + self.total_distance = .0 + self.seq_num = 0 + self.instance_error = 0 + + def update(self, distances, seq_num): + if not _is_numpy_(distances): + raise ValueError("The 'distances' must be a numpy ndarray.") + if not _is_number_(seq_num): + raise ValueError("The 'seq_num' must be a number(int, float).") + seq_right_count = np.sum(distances == 0) + total_distance = np.sum(distances) + self.seq_num += seq_num + self.instance_error += seq_num - seq_right_count + self.total_distance += total_distance + + def eval(): + if self.seq_num == 0: + raise ValueError( + "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." + ) + avg_distance = self.total_distance / self.seq_num + avg_instance_error = self.instance_error / self.seq_num + return avg_distance, avg_instance_error + + +class DetectionMAP(MetricBase): + """ + Calculate the detection mean average precision (mAP). + + TODO (Dang Qingqing): update the following doc. + The general steps are as follows: + 1. calculate the true positive and false positive according to the input + of detection and labels. + 2. calculate mAP value, support two versions: '11 point' and 'integral'. 
+ + Please get more information from the following articles: + https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 + """ + + def __init__(self, name=None): + super(DetectionMAP, self).__init__(name) + # the current map value + self.value = .0 + + def update(self, value, weight): + if not _is_number_or_matrix_(value): + raise ValueError( + "The 'value' must be a number(int, float) or a numpy ndarray.") + if not _is_number_(weight): + raise ValueError("The 'weight' must be a number(int, float).") + self.value += value + self.weight += weight + + def eval(self): + if self.weight == 0: + raise ValueError( + "There is no data in DetectionMAP Metrics. " + "Please check layers.detection_map output has added to DetectionMAP." + ) + return self.value / self.weight + + +class Auc(MetricBase): + """ + Auc Metrics which adapts to binary classification. + Need to note that auc metrics compute the value via Python natively. + If you concern the speed, please use the fluid.layers.auc instead. + + The `auc` function creates four local variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. + + Args: + name: metric name + curve: Specifies the name of the curve to be computed, 'ROC' [default] or + 'PR' for the Precision-Recall-curve. + num_thresholds: The number of thresholds to use when discretizing the roc + curve. + + "NOTE: only implement the ROC curve type via Python now." 
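A minimal usage sketch for this metric, assuming that column 0 of `predictions` carries the positive-class score (the column update() inspects) and that both arrays would normally be numpy values fetched from an inference program:

    import numpy as np
    import paddle.fluid as fluid

    labels = np.array([0, 1, 1, 0])
    predictions = np.array([[0.2, 0.8],
                            [0.9, 0.1],
                            [0.7, 0.3],
                            [0.1, 0.9]])

    auc_metric = fluid.metrics.Auc(name='auc')
    auc_metric.update(labels=labels, predictions=predictions)
    auc_value = auc_metric.eval()  # a float in [0, 1]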
+ """ + + def __init__(self, name, curve='ROC', num_thresholds=200): + super(MetricBase, self).__init__(name, curve, num_thresholds) + self._curve = curve + self._num_thresholds = num_thresholds + self._epsilon = 1e-6 + self.tp_list = np.ndarray((num_thresholds, )) + self.fn_list = np.ndarray((num_thresholds, )) + self.tn_list = np.ndarray((num_thresholds, )) + self.fp_list = np.ndarray((num_thresholds, )) + + def update(self, labels, predictions, axis=1): + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + if not _is_numpy_(predictions): + raise ValueError("The 'predictions' must be a numpy ndarray.") + + kepsilon = 1e-7 # to account for floating point imprecisions + thresholds = [(i + 1) * 1.0 / (num_thresholds - 1) + for i in range(num_thresholds - 2)] + thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] + + # caculate TP, FN, TN, FP count + for idx_thresh, thresh in enumerate(thresholds): + tp, fn, tn, fp = 0, 0, 0, 0 + for i, lbl in enumerate(labels): + if lbl: + if predictions[i, 0] >= thresh: + tp += 1 + else: + fn += 1 + else: + if predictions[i, 0] >= thresh: + fp += 1 + else: + tn += 1 + tp_list[idx_thresh] += tp + fn_list[idx_thresh] += fn + tn_list[idx_thresh] += tn + fp_list[idx_thresh] += fp + + def eval(self): + epsilon = self._epsilon + num_thresholds = self._num_thresholds + tpr = (tp_list.astype("float32") + epsilon) / ( + tp_list + fn_list + epsilon) + fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon) + rec = (tp_list.astype("float32") + epsilon) / ( + tp_list + fp_list + epsilon) + + x = fpr[:num_thresholds - 1] - fpr[1:] + y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0 + auc_value = np.sum(x * y) + return auc_value From 5ceea265bba7d5612a03a645f366484ef39f8582 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 11 Apr 2018 18:45:10 +0800 Subject: [PATCH 48/62] Disable unstable unittest --- paddle/fluid/inference/tests/book/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 6ed77adb9d..86e36f3f65 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -24,7 +24,8 @@ function(inference_test TARGET_NAME) endforeach() endfunction(inference_test) -inference_test(fit_a_line) +# This unittest is buggy! +#inference_test(fit_a_line) inference_test(image_classification ARGS vgg resnet) inference_test(label_semantic_roles) inference_test(recognize_digits ARGS mlp conv) From 9fd76c4f3a4eb4729401d5cae73fc6cb865c0eb3 Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Wed, 11 Apr 2018 19:48:51 +0800 Subject: [PATCH 49/62] Improve test_parallel_executor. 
--- .../fluid/tests/unittests/test_parallel_executor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 8401716db8..e5c3961c55 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -467,7 +467,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): loss = simple_fc_net(True) test_program = main.clone(for_test=True) - opt = fluid.optimizer.SGD(learning_rate=0.0001) + opt = fluid.optimizer.SGD(learning_rate=0.001) opt.minimize(loss) batch_size = 32 @@ -494,4 +494,8 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) train_loss = numpy.array(train_loss) - self.assertTrue(numpy.allclose(train_loss, test_loss)) + self.assertTrue( + numpy.allclose( + train_loss, test_loss, atol=1e-8), + "Train loss: " + str(train_loss) + "\n Test loss:" + + str(test_loss)) From 8d3ce01f363051f56ef1ce89197f8e47e814876a Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Wed, 11 Apr 2018 11:31:06 -0700 Subject: [PATCH 50/62] Fix cpplint errors for a set of operators (#9837) * Fix cpplint errors, round2 * Fix pointer issue --- paddle/fluid/operators/ctc_align_op.cu | 1 + paddle/fluid/operators/ctc_align_op.h | 1 + paddle/fluid/operators/elementwise_op.h | 13 +++++++------ paddle/fluid/operators/gru_op.cc | 1 + paddle/fluid/operators/gru_op.h | 7 +++---- paddle/fluid/operators/im2sequence_op.cc | 1 + paddle/fluid/operators/im2sequence_op.h | 2 +- paddle/fluid/operators/label_smooth_op.cc | 1 + paddle/fluid/operators/linear_chain_crf_op.h | 2 +- paddle/fluid/operators/logical_op.cc | 1 + paddle/fluid/operators/lrn_op.cc | 1 + paddle/fluid/operators/lstm_op.cc | 1 + paddle/fluid/operators/lstm_op.h | 1 + paddle/fluid/operators/lstm_unit_op.cu | 1 + paddle/fluid/operators/lstmp_op.cc | 1 + paddle/fluid/operators/lstmp_op.h | 1 + paddle/fluid/operators/matmul_op.cc | 2 ++ paddle/fluid/operators/matmul_op.h | 4 +++- paddle/fluid/operators/maxout_op.cc | 2 ++ paddle/fluid/operators/minus_op.cc | 2 ++ paddle/fluid/operators/momentum_op.cu | 1 + paddle/fluid/operators/mul_op.cc | 1 + 22 files changed, 35 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 54e0b1d9ad..bbad74e96d 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/operators/ctc_align_op.h" namespace paddle { diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 70698d9958..9c5c6f5aa0 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index f04d8d8fd8..a33634ab25 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -106,18 +107,18 @@ information. However, the output only shares the LoD information with input $X$. protected: std::string comment_; - void Replace(std::string& src, std::string from, std::string to) { + void Replace(std::string* src, std::string from, std::string to) { std::size_t len_from = std::strlen(from.c_str()); std::size_t len_to = std::strlen(to.c_str()); - for (std::size_t pos = src.find(from); pos != std::string::npos; - pos = src.find(from, pos + len_to)) { - src.replace(pos, len_from, to); + for (std::size_t pos = src->find(from); pos != std::string::npos; + pos = src->find(from, pos + len_to)) { + src->replace(pos, len_from, to); } } void SetComment(std::string name, std::string equation) { - Replace(comment_, "{name}", name); - Replace(comment_, "{equation}", equation); + Replace(&comment_, "{name}", name); + Replace(&comment_, "{equation}", equation); } }; diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 2a91dcbcd4..2490b83b8c 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gru_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 0886bebc41..1d5c291495 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -13,15 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 048391549d..5b387d8d34 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/im2sequence_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index a6a83fefbc..d792c68f78 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #pragma once - +#include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index eef25f8a06..c2a8c7f867 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/label_smooth_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 800a1303e1..d5162bcd74 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = x.maximum(Eigen::DSizes(1)) - .reshape(Eigen::DSizes(int(batch_size), 1)); + .reshape(Eigen::DSizes(static_cast(batch_size), 1)); auto x_exps = EigenMatrix::From(*emission_exps); x_exps.device(place) = diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc index 6a7db31cf3..41aa00ee8a 100644 --- a/paddle/fluid/operators/logical_op.cc +++ b/paddle/fluid/operators/logical_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/logical_op.h" +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index cb15683981..553a06c3dc 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" +#include #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index d75537741e..e062d62c66 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 11f9f223b5..a1ef0eb278 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index 76245a1b5a..acf094238f 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cross_entropy_op.h" +#include "paddle/fluid/operators/lstm_unit_op.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index a881ef82ec..82541517e1 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/lstmp_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index dfa7f74d51..172db54896 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 8585592852..1f52558873 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/matmul_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h index 1cd8fe55dc..f2e9cfdcdb 100644 --- a/paddle/fluid/operators/matmul_op.h +++ b/paddle/fluid/operators/matmul_op.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matmul.h" diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index efaae7d5f2..4e28d98834 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -13,6 +13,8 @@ * limitations under the License. */ #include "paddle/fluid/operators/maxout_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 7de9d94979..5790deb54d 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/minus_op.h" +#include +#include #include "paddle/fluid/operators/net_op.h" namespace paddle { diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index da4a6af298..5eb9d99502 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/momentum_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 90af1e2d60..5038287527 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/mul_op.h" +#include namespace paddle { namespace operators { From b26f5050020ea14fca6c1c2c759aa269c6331177 Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 11 Apr 2018 14:04:30 -0700 Subject: [PATCH 51/62] remove net op and cond_op (#9663) * remove net op and cond_op * fix cpplint * fix dependency * delete backward_test; fix compile * disable batch_norm backward * rm test_net.py * make batchnorm test independent of backward.cc * make test_layer_norm_op independent of backward.cc * make test_layer_norm_op independent of backward.cc * delete unused code * clean up --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/backward.cc | 119 +-- paddle/fluid/framework/backward_test.cc | 918 ------------------ paddle/fluid/framework/prune_test.cc | 7 +- paddle/fluid/operators/CMakeLists.txt | 4 +- paddle/fluid/operators/cond_op.cc | 235 ----- paddle/fluid/operators/cond_op.h | 96 -- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/net_op.cc | 103 -- paddle/fluid/operators/net_op.h | 130 --- paddle/fluid/operators/net_op_test.cc | 103 -- paddle/fluid/operators/prelu_op.cc | 3 +- paddle/fluid/operators/scale_op.cc | 3 +- paddle/fluid/operators/split_op.cc | 1 - paddle/fluid/pybind/pybind.cc | 49 +- paddle/fluid/recordio/chunk.cc | 4 +- paddle/fluid/recordio/header.cc | 3 + .../tests/unittests/test_batch_norm_op.py | 361 +++---- .../fluid/tests/unittests/test_cond_op.py | 128 --- .../tests/unittests/test_layer_norm_op.py | 231 ++--- .../paddle/fluid/tests/unittests/test_net.py | 53 - 21 files changed, 205 insertions(+), 2351 deletions(-) delete mode 100644 paddle/fluid/framework/backward_test.cc delete mode 100644 paddle/fluid/operators/cond_op.cc delete mode 100644 paddle/fluid/operators/cond_op.h delete mode 100644 paddle/fluid/operators/net_op.cc delete mode 100644 paddle/fluid/operators/net_op.h delete mode 100644 paddle/fluid/operators/net_op_test.cc delete mode 100644 python/paddle/fluid/tests/unittests/test_cond_op.py delete mode 100644 python/paddle/fluid/tests/unittests/test_net.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 3840bbe83b..77f459b4dd 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -79,8 +79,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -cc_library(backward SRCS backward.cc DEPS net_op) -cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) +cc_library(backward SRCS backward.cc DEPS operator) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc index 1314af2b3d..76133b5136 100644 --- a/paddle/fluid/framework/backward.cc +++ b/paddle/fluid/framework/backward.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/backward.h" -#include "paddle/fluid/operators/net_op.h" #include #include @@ -22,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/net_op.h" namespace paddle { namespace framework { @@ -60,12 +58,7 @@ static inline std::unique_ptr CreateGradOp( if (grad_ops.size() == 1) { return std::move(grad_ops[0]); } else { - auto net_op = new operators::NetOp(); - for (auto& grad_op : grad_ops) { - net_op->AppendOp(std::move(grad_op)); - } - net_op->CompleteAddOp(); - return std::unique_ptr(net_op); + PADDLE_THROW("Unexpected Branch"); } } @@ -91,10 +84,7 @@ static bool AllInSet( } static std::unique_ptr NOP() { - auto net_op = new operators::NetOp(); - net_op->SetType("@NOP@"); - net_op->CompleteAddOp(); - return std::unique_ptr(net_op); + PADDLE_THROW("Unexpected Branch"); } // Get backward operator from a forward operator, a recursive implementation. @@ -136,110 +126,7 @@ static std::unique_ptr BackwardRecursive( } // Returned gradient network - auto net = std::unique_ptr(new operators::NetOp()); - - if (forwardOp.IsNetOp()) { - // Because forwardOp is a net op, it can static_cast. - auto& forwardNet = static_cast(forwardOp); - - // Map from output gradient variable name to operator's indices in - // backward net's ops_. That operator generates that variable. - std::unordered_map> dup_output_ops; - - size_t local_op_id = 0; - // reversely travel forwardNet and collect all duplicate outputs. - for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend(); - ++it, ++local_op_id) { - auto& fwd = *it; - auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id); - ForEachVarName(bwd->Outputs(), - [&dup_output_ops, local_op_id](const std::string& out) { - dup_output_ops[out].emplace_back(local_op_id); - return false; - }); - net->AppendOp(std::move(bwd)); - } - // Get unique ID for this method. - auto uid = uniq_id++; - // TODO(dzh): more comment - // multiple operators which have the same output (y for example) may - // overwrite the same y variable when backward, special operations are token - // to handle this case. For each duplicate output, rename it to an alias - // (original name with a offset), append an `add` op for its operator, - // and finally sum all the alias variable to the final output variable y. - using Pos = std::pair>; - std::list insert_position; - for (auto& dup_output_op : dup_output_ops) { - const std::string& name = dup_output_op.first; - // duplicate @Empty@ don't need to be added - if (name == kEmptyVarName) continue; - - auto& dup_op = dup_output_op.second; - // no duplicate output - if (dup_op.size() == 1) continue; - - // process the duplicate outputs - std::vector dup_outputs; - for (size_t i = 0; i < dup_op.size(); ++i) { - // rename each duplicate output to an alias - auto op_offset = dup_op[i]; - dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" + - std::to_string(i)); - net->ops_[op_offset]->Rename(name, dup_outputs.back()); - } - // collect all the offset for each alias, - // insert a sum operator to add all aliases to output - insert_position.push_back( - {dup_op.back(), - OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, - AttributeMap{})}); - } - - // make sure the inserted `sum` ops follow the BFS order. 
- insert_position.sort( - [](const Pos& l, const Pos& r) { return l.first > r.first; }); - - for (auto& pos : insert_position) { - net->InsertOp(pos.first + 1, std::move(pos.second)); - } - } else { - std::unique_ptr grad_op( - CreateGradOp(forwardOp, no_grad_names, grad_to_var)); - - ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op]( - const std::string& grad_input) { - if (no_grad_names.count(grad_input)) { - // +1 for \0 - std::string prefix = grad_input.substr( - 0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); - grad_op->Rename(grad_input, prefix + kZeroVarSuffix); - - // If part of input gradient of that operator is not calculated, fill - // zero variables to that input gradient. - net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Out", {grad_input}}}, - AttributeMap{})); - } - return false; - }); - - ForEachVarName(grad_op->Outputs(), - [&no_grad_names, &grad_op](const std::string& grad_output) { - if (no_grad_names.count(grad_output)) { - grad_op->Rename(grad_output, kEmptyVarName); - } - return false; - }); - - if (net->ops_.empty()) { // Current no aux op is added to network - return grad_op; - } - net->AppendOp(std::move(grad_op)); - } - net->SetType("@GENERATED_BACKWARD@"); - net->CompleteAddOp(); - return std::unique_ptr( - static_cast(net.release())); + PADDLE_THROW("Unexpected Branch"); } // See header for comments diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc deleted file mode 100644 index cc1f871360..0000000000 --- a/paddle/fluid/framework/backward_test.cc +++ /dev/null @@ -1,918 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/backward.h" - -#include -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/operators/net_op.h" - -USE_NO_KERNEL_OP(fill_constant); - -namespace paddle { -namespace framework { - -using DeviceContext = platform::DeviceContext; - -class NoneOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override {} -}; - -template -class NoneKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override {} -}; - -class RowWiseAddOpMaker : public OpProtoAndCheckerMaker { - public: - RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input X of Add"); - AddInput("b", "Bias of Add"); - AddOutput("Out", "Out of Add"); - AddComment("Add Op"); - } -}; - -class RowWiseAddGradMaker : public SingleGradOpDescMaker { - public: - using SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto grad_op = new OpDesc(); - grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); - grad_op->SetOutput(GradVarName("X"), InputGrad("X")); - grad_op->SetOutput(GradVarName("b"), InputGrad("b")); - grad_op->SetType("rowwise_add_grad"); - return std::unique_ptr(grad_op); - } -}; - -class MulOpMaker : public OpProtoAndCheckerMaker { - public: - MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "A"); - AddInput("Y", "B"); - AddOutput("Out", "Out"); - AddAttr("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); - AddAttr("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1); - AddComment("Mul"); - } -}; - -class SigmoidOpMaker : public OpProtoAndCheckerMaker { - public: - SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "X"); - AddOutput("Out", "Y"); - AddComment("Sigmoid"); - } -}; - -class NoGradOpMaker : public OpProtoAndCheckerMaker { - public: - NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "X input"); - AddOutput("Out", "Y output"); - AddComment("NoGradOp, same input output. 
no Grad"); - } -}; - -class FcOp : public operators::NetOp { - public: - FcOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs) - : NetOp(type, inputs, outputs, attrs) { - AppendOp(OpRegistry::CreateOp( - "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, - {{"Out", {Output("mul_result")}}}, AttributeMap{})); - auto input_b = Inputs("b"); - std::string before_act = "mul_result"; - if (input_b.size() != 0) { - AppendOp(OpRegistry::CreateOp( - "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, - {{"Out", {Output("add_result")}}}, AttributeMap{})); - before_act = "add_result"; - } else { - auto out_varname = Output("add_result"); - if (out_varname != kEmptyVarName) { - this->Rename(out_varname, kEmptyVarName); - } - } - - AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, - {{"Out", {Output("Out")}}}, AttributeMap{})); - CompleteAddOp(false); - } -}; - -class FcOpMaker : public OpProtoAndCheckerMaker { - public: - FcOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "x"); - AddInput("W", "w"); - AddInput("b", "b"); - AddOutput("mul_result", "").AsIntermediate(); - AddOutput("add_result", "").AsIntermediate(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class ManyOutputOpMaker : public OpProtoAndCheckerMaker { - public: - ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("x", "x"); - AddOutput("y", "y"); - AddOutput("z", "z"); - AddComment(""); - } -}; - -class FillZeroOpMaker : public OpProtoAndCheckerMaker { - public: - FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "x"); - AddOutput("Out", "out"); - AddComment(""); - } -}; - -class SumOpMaker : public framework::OpProtoAndCheckerMaker { - public: - SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "the input tensors of sum operator.").AsDuplicable(); - AddOutput("Out", "the output tensor of sum operator."); - AddComment(""); - } -}; - -class MultInOutOpMaker : public OpProtoAndCheckerMaker { - public: - MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "x"); - AddInput("H", "h"); - AddOutput("Y", "y"); - AddOutput("Z", "z"); - AddComment(""); - } -}; - -class MinusGradOpDescMaker : public GradOpDescMakerBase { - public: - using GradOpDescMakerBase::GradOpDescMakerBase; - - std::vector> operator()() const override { - std::vector> retv; - auto x_g = InputGrad("X"); - if (!x_g.empty()) { - auto *op_desc = new OpDesc(); - op_desc->SetType("scale"); - op_desc->SetInput("X", OutputGrad("Out")); - op_desc->SetOutput("Out", x_g); - op_desc->SetAttr("scale", 1.0f); - retv.emplace_back(op_desc); - } - - auto y_g = InputGrad("Y"); - if (!y_g.empty()) { - auto *op_desc = new OpDesc(); - op_desc->SetType("scale"); - op_desc->SetInput("X", OutputGrad("Out")); - op_desc->SetOutput("Out", y_g); - op_desc->SetAttr("scale", -1.0f); - retv.emplace_back(op_desc); - } - return retv; - } -}; - -class MinusOpMaker : public OpProtoAndCheckerMaker { - public: - MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", ""); - AddInput("Y", ""); - AddOutput("Out", ""); - AddComment("minus for unittest"); - } -}; -} // namespace framework -} // 
namespace paddle - -namespace f = paddle::framework; -namespace ops = paddle::operators; -using EnforceNotMet = paddle::platform::EnforceNotMet; -// rowwise_add -REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker, - f::RowWiseAddGradMaker); -REGISTER_OP_CPU_KERNEL(rowwise_add, - f::NoneKernel); -REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp); -REGISTER_OP_CPU_KERNEL(rowwise_add_grad, - f::NoneKernel); -// mul -REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp); -REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - f::NoneKernel); -// sigmoid -REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp); -REGISTER_OP_CPU_KERNEL(sigmoid, - f::NoneKernel); -REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker); -// fill_zeros_like -REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker); -REGISTER_OP_CPU_KERNEL(fill_zeros_like, - f::NoneKernel); -// sum -REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp); -REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel); -REGISTER_OP_CPU_KERNEL(sum_grad, - f::NoneKernel); -// fc -REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker); -// many_output_op -REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker, - many_output_op_grad, f::NoneOp); -// mult_in_out -REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad, - f::NoneOp); -REGISTER_OP_CPU_KERNEL(mult_in_out, - f::NoneKernel); -REGISTER_OP_CPU_KERNEL(mult_in_out_grad, - f::NoneKernel); -// minus -REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker); -REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel); -// scale -REGISTER_OPERATOR(scale, f::NoneOp); -REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); - -TEST(Backward, simple_op_not_need_grad) { - auto fwd = - f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, - {{"Out", {"out"}}}, f::AttributeMap{}); - ASSERT_NE(fwd, nullptr); - auto gop = f::Backward(*fwd, {"x"}); - ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); - - auto no_input_gop = f::Backward(*fwd, {"x", "b"}); - ASSERT_NE(no_input_gop, nullptr); - ASSERT_TRUE(no_input_gop->IsNetOp()); - ASSERT_EQ(0UL, static_cast(no_input_gop.get())->ops_.size()); -} - -TEST(Backward, net_fc_backward_normal) { - std::shared_ptr fwd = - f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}}, - {{"mul_result", {"mul_res"}}, - {"add_result", {"add_re"}}, - {"Out", {"out"}}}, - f::AttributeMap{}); - ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = - f::Backward(*fwd, std::unordered_set{}); - ASSERT_TRUE(gop->IsNetOp()); - auto net = static_cast(gop.get()); - - ASSERT_NO_THROW(net->DebugString()); - - ASSERT_EQ(3UL, net->ops_.size()); - - f::OperatorBase &d_sigmoid = *net->ops_[0]; - ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); - - f::OperatorBase &d_add = *net->ops_[1]; - ASSERT_EQ("rowwise_add_grad", d_add.Type()); - - f::OperatorBase &d_mul = *net->ops_[2]; - ASSERT_EQ("mul_grad", d_mul.Type()); -} - -TEST(Backward, net_fc_backward_not_have_b) { - std::shared_ptr fwd = - f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}}, - {{"mul_result", {"mul_res"}}, - {"add_result", {"add_res"}}, - {"Out", {"tmp"}}}, - f::AttributeMap{}); - ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = - f::Backward(*fwd, std::unordered_set{}); - ASSERT_TRUE(gop->IsNetOp()); - auto net = static_cast(gop.get()); - - ASSERT_NO_THROW(net->DebugString()); - - ASSERT_EQ(2UL, net->ops_.size()); - - f::OperatorBase &d_sigmoid = 
*net->ops_[0]; - ASSERT_EQ("sigmoid_grad", d_sigmoid.Type()); - - f::OperatorBase &d_mul = *net->ops_[1]; - ASSERT_EQ("mul_grad", d_mul.Type()); -} - -TEST(Backward, net_input_of_network_not_need_grad) { - ops::NetOp net; - net.AppendOp(f::OpRegistry::CreateOp( - "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}}, - {{"mul_result", {"mul_tmp_0"}}, - {"add_result", {"add_tmp_0"}}, - {"Out", {"hidden0"}}}, - f::AttributeMap{})); - net.AppendOp(f::OpRegistry::CreateOp( - "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, - {{"mul_result", {"mul_tmp_1"}}, - {"add_result", {"add_tmp_1"}}, - {"Out", {"hidden1"}}}, - f::AttributeMap{})); - net.CompleteAddOp(); - auto bwd = Backward(net, {"x"}); // x@GRAD is not need. - ASSERT_TRUE(bwd->IsNetOp()); - auto bwd_net = static_cast(bwd.get()); - - auto output_vars = bwd_net->OutputVars(true); - std::unordered_set all_outputs = - std::unordered_set(output_vars.begin(), output_vars.end()); - all_outputs.erase(f::kEmptyVarName); - - for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) { - ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end()); - } - - // Not Generated X - ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end()); - - ASSERT_EQ(2UL, bwd_net->ops_.size()); - ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp()); - auto first_fc_grad = static_cast(bwd_net->ops_[1].get()); - ASSERT_EQ(3UL, first_fc_grad->ops_.size()); - ASSERT_EQ(f::kEmptyVarName, - first_fc_grad->ops_[2]->Output(f::GradVarName("X"))); -} - -TEST(Backward, net_shared_weight) { - ops::NetOp net; - net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, - {{"Out", {"out"}}}, f::AttributeMap{})); - net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, - {{"Out", {"FinalOut"}}}, - f::AttributeMap{})); - net.CompleteAddOp(); - - auto bwd = f::Backward(net, std::unordered_set{}); - ASSERT_TRUE(bwd->IsNetOp()); - auto bwd_net = static_cast(bwd.get()); - ASSERT_EQ(3UL, bwd_net->ops_.size()); - ASSERT_EQ("sum", bwd_net->ops_[2]->Type()); -} - -TEST(Backward, op_all_input_are_not_need) { - auto fwd = - f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, - {{"Out", {"out"}}}, f::AttributeMap{}); - auto backward = f::Backward(*fwd, {"x", "b"}); - ASSERT_TRUE(backward->IsNetOp()); - auto net = static_cast(backward.get()); - ASSERT_TRUE(net->ops_.empty()); -} - -TEST(Backward, op_all_output_are_not_need) { - auto fwd = - f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, - {{"Out", {"out"}}}, f::AttributeMap{}); - auto backward = f::Backward(*fwd, {"out"}); - ASSERT_TRUE(backward->IsNetOp()); - auto net = static_cast(backward.get()); - ASSERT_TRUE(net->ops_.empty()); -} - -TEST(Backward, op_part_of_output_are_not_need) { - auto fwd = - f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, - {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); - auto backward = f::Backward(*fwd, {"Z"}); - ASSERT_TRUE(backward->IsNetOp()); - auto net = static_cast(backward.get()); - ASSERT_EQ(net->ops_.size(), 2UL); - - auto &fill_zero = *net->ops_[0]; - ASSERT_EQ("fill_zeros_like", fill_zero.Type()); - ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); - ASSERT_EQ("Z", fill_zero.Input("X")); - ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); - ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); - - auto &d_many_out = *net->ops_[1]; - ASSERT_EQ("many_output_op_grad", d_many_out.Type()); - ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG - ASSERT_EQ(std::string("Z") + 
f::kZeroVarSuffix, - d_many_out.Input(f::GradVarName("z"))); - ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y"))); - ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x"))); -} - -TEST(Backward, op_part_of_input_are_not_need) { - auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, - {{"Out", {"out"}}}, f::AttributeMap{}); - auto backward = f::Backward(*fwd, {"a"}); - auto &grad_mul = *backward; - ASSERT_EQ(grad_mul.Type(), "mul_grad"); - ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL); - ASSERT_EQ(grad_mul.Outputs().size(), 2UL); - ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName); - ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b")); - ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out")); - ASSERT_EQ(grad_mul.Input("X"), "a"); - ASSERT_EQ(grad_mul.Input("Y"), "b"); - ASSERT_EQ(grad_mul.Input("Out"), "out"); -} - -TEST(Backward, linear_net_intermediate_variable_has_no_grad) { - ops::NetOp net; - net.AppendOp(f::OpRegistry::CreateOp( - "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"mul_result", {"mul_out1"}}, - {"add_result", {"add_out1"}}, - {"Out", {"out1"}}}, - f::AttributeMap{})); - net.AppendOp(f::OpRegistry::CreateOp( - "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, - {{"mul_result", {"mul_out2"}}, - {"add_result", {"tmp_out2"}}, - {"Out", {"out2"}}}, - f::AttributeMap{})); - net.AppendOp(f::OpRegistry::CreateOp( - "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, - {{"mul_result", {"mul_out3"}}, - {"add_result", {"tmp_out3"}}, - {"Out", {"out3"}}}, - f::AttributeMap{})); - net.CompleteAddOp(); - - auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); - ASSERT_TRUE(backward->IsNetOp()); - auto bwd_net = static_cast(backward.get()); - ASSERT_EQ(bwd_net->ops_.size(), 3UL); - auto &grad_fc = *bwd_net->ops_[0]; - - const char *all = paddle::operators::NetOp::kAll; - EXPECT_EQ(grad_fc.Inputs(all).size(), - 2UL /* external input number */ - + 1UL /* external output number*/ - + 1UL /* number of gradient of external output*/ - + 2UL /* internal variable number*/ - ); - EXPECT_EQ(grad_fc.Outputs(all).size(), - 2UL /* input number of mul*/ - + 2UL /* input number of rowwise_add*/ - + 1UL /* input number of sigmod */ - - 1UL /* out2 is not needed*/); - EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL); - EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL); - EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL); - EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL); -} - -TEST(Backward, simple_single_op) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - - f::OpDesc *op = block->AppendOp(); - op->SetType("rowwise_add"); - op->SetInput("X", {"x"}); - op->SetInput("b", {"b"}); - op->SetOutput("Out", {"out"}); - - auto target = f::VarDesc("out"); - target.SetShape({1}); - auto var_to_grad = - AppendBackward(program, target, std::unordered_set{}); - - ASSERT_EQ(block->AllOps().size(), 3UL); - f::OpDesc *fill_op = block->AllOps()[1]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - - f::OpDesc *grad_op = block->AllOps()[2]; - EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); - ASSERT_EQ(grad_op->InputNames().size(), 1UL); - ASSERT_EQ(grad_op->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out")})); - EXPECT_EQ(grad_op->Output(f::GradVarName("X")), - std::vector({f::GradVarName("x")})); - EXPECT_EQ(grad_op->Output(f::GradVarName("b")), - 
std::vector({f::GradVarName("b")})); - - EXPECT_EQ(var_to_grad.size(), 3UL); - EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2)); - EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2)); - - EXPECT_TRUE(block->HasVar(f::GradVarName("b"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("x"))); -} - -TEST(Backward, default_attribute) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::OpDesc *op = block->AppendOp(); - op->SetType("mul"); - op->SetInput("X", {"x"}); - op->SetInput("Y", {"y"}); - op->SetOutput("Out", {"out"}); - op->CheckAttrs(); - - auto target = f::VarDesc("out"); - target.SetShape({1}); - AppendBackward(program, target, std::unordered_set{}); - - ASSERT_EQ(block->AllOps().size(), 3UL); - EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); - EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); - - f::OpDesc *fill_op = block->AllOps()[1]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - - f::OpDesc *grad_op = block->AllOps()[2]; - ASSERT_EQ(grad_op->Type(), "mul_grad"); - EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); - EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); -} - -TEST(Backward, simple_mult_op) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::OpDesc *op1 = block->AppendOp(); - op1->SetType("rowwise_add"); - op1->SetInput("X", {"x1"}); - op1->SetInput("b", {"b1"}); - op1->SetOutput("Out", {"out1"}); - - f::OpDesc *op2 = block->AppendOp(); - op2->SetType("mul"); - op2->SetInput("X", {"out1"}); - op2->SetInput("Y", {"y2"}); - op2->SetOutput("Out", {"out2"}); - - f::OpDesc *op3 = block->AppendOp(); - op3->SetType("rowwise_add"); - op3->SetInput("X", {"out2"}); - op3->SetInput("b", {"b3"}); - op3->SetOutput("Out", {"out3"}); - - auto target = f::VarDesc("out3"); - target.SetShape({1}); - size_t forward_len = block->AllOps().size(); - auto var_to_grad = - AppendBackward(program, target, std::unordered_set{}); - - ASSERT_EQ(block->AllOps().size(), 6UL + 1); - f::OpDesc *fill_op = block->AllOps()[forward_len]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - - f::OpDesc *grad_op1 = block->AllOps()[6]; - EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); - ASSERT_EQ(grad_op1->InputNames().size(), 1UL); - ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), - std::vector({f::GradVarName("x1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), - std::vector({f::GradVarName("b1")})); - - f::OpDesc *grad_op2 = block->AllOps()[5]; - EXPECT_EQ(grad_op2->Type(), "mul_grad"); - ASSERT_EQ(grad_op2->InputNames().size(), 4UL); - ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op2->Input("X"), std::vector({"out1"})); - EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); - EXPECT_EQ(grad_op2->Input("Out"), std::vector({"out2"})); - EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out2")})); - EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), - std::vector({f::GradVarName("out1")})); - EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), - std::vector({f::GradVarName("y2")})); - - f::OpDesc *grad_op3 = block->AllOps()[4]; - EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); - ASSERT_EQ(grad_op3->InputNames().size(), 1UL); - ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out3")})); - 
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), - std::vector({f::GradVarName("out2")})); - EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), - std::vector({f::GradVarName("b3")})); - - EXPECT_EQ(var_to_grad.size(), 7UL); - EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); - EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); - EXPECT_EQ(var_to_grad.at("out1"), - f::GradVarInfo(f::GradVarName("out1"), 0, 5)); - EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); - EXPECT_EQ(var_to_grad.at("out2"), - f::GradVarInfo(f::GradVarName("out2"), 0, 4)); - EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); - - EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("out2"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); -} - -TEST(Backward, intermedia_var_no_grad) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::OpDesc *op1 = block->AppendOp(); - op1->SetType("rowwise_add"); - op1->SetInput("X", {"x1"}); - op1->SetInput("b", {"b1"}); - op1->SetOutput("Out", {"out1"}); - - f::OpDesc *op2 = block->AppendOp(); - op2->SetType("mul"); - op2->SetInput("X", {"x2"}); - op2->SetInput("Y", {"y2"}); - op2->SetOutput("Out", {"out2"}); - - f::OpDesc *op3 = block->AppendOp(); - op3->SetType("rowwise_add"); - op3->SetInput("X", {"out2"}); - op3->SetInput("b", {"b3"}); - op3->SetOutput("Out", {"out3"}); - - f::OpDesc *op4 = block->AppendOp(); - op4->SetType("mul"); - op4->SetInput("X", {"out1"}); - op4->SetInput("Y", {"out3"}); - op4->SetOutput("Out", {"out4"}); - - auto target = f::VarDesc("out4"); - target.SetShape({1}); - size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {"out3"}); - - ASSERT_EQ(block->AllOps().size(), 7UL); - f::OpDesc *fill_op = block->AllOps()[forward_len]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - - f::OpDesc *grad_op1 = block->AllOps()[6]; - EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); - ASSERT_EQ(grad_op1->InputNames().size(), 1UL); - ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), - std::vector({f::GradVarName("x1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), - std::vector({f::GradVarName("b1")})); - - f::OpDesc *grad_op4 = block->AllOps()[5]; - EXPECT_EQ(grad_op4->Type(), "mul_grad"); - ASSERT_EQ(grad_op4->InputNames().size(), 4UL); - ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); - EXPECT_EQ(grad_op4->Input("Y"), std::vector({"out3"})); - EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out4"})); - EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out4")})); - EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), - std::vector({f::GradVarName("out1")})); - EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector()); - - EXPECT_EQ(var_to_grad.size(), 4UL); - EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6)); - EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6)); - EXPECT_EQ(var_to_grad.at("out1"), - f::GradVarInfo(f::GradVarName("out1"), 0, 5)); - - EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); - 
EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); -} - -TEST(Backward, var_no_grad) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::OpDesc *op1 = block->AppendOp(); - op1->SetType("mult_in_out"); - op1->SetInput("X", {"x1"}); - op1->SetInput("H", {"h1"}); - op1->SetOutput("Y", {"y1"}); - op1->SetOutput("Z", {"z1"}); - - f::OpDesc *op2 = block->AppendOp(); - op2->SetType("mult_in_out"); - op2->SetInput("X", {"y1"}); - op2->SetInput("H", {"z1"}); - op2->SetOutput("Y", {"y2"}); - op2->SetOutput("Z", {"z2"}); - - auto target = f::VarDesc("z2"); - target.SetShape({1}); - size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {"z1"}); - - ASSERT_EQ(block->AllOps().size(), 6UL); - f::OpDesc *fill_op = block->AllOps()[forward_len]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - - f::OpDesc *grad_op2 = block->AllOps()[3]; - ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); - ASSERT_EQ(grad_op2->InputNames().size(), 6UL); - ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op2->Input("X"), std::vector({"y1"})); - EXPECT_EQ(grad_op2->Input("H"), std::vector({"z1"})); - EXPECT_EQ(grad_op2->Input("Y"), std::vector({"y2"})); - EXPECT_EQ(grad_op2->Input("Z"), std::vector({"z2"})); - EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")), - std::vector({f::GradVarName("y2")})); - EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")), - std::vector({f::GradVarName("z2")})); - EXPECT_EQ(grad_op2->Output(f::GradVarName("X")), - std::vector({f::GradVarName("y1")})); - EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); - - f::OpDesc *fill_zero_op = block->AllOps()[4]; - ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); - ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); - ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); - EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); - EXPECT_EQ(fill_zero_op->Output("Out"), - std::vector({std::string("z1") + f::kZeroVarSuffix})); - - f::OpDesc *grad_op1 = block->AllOps()[5]; - ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); - ASSERT_EQ(grad_op1->InputNames().size(), 6UL); - ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op1->Input("X"), std::vector({"x1"})); - EXPECT_EQ(grad_op1->Input("H"), std::vector({"h1"})); - EXPECT_EQ(grad_op1->Input("Y"), std::vector({"y1"})); - EXPECT_EQ(grad_op1->Input("Z"), std::vector({"z1"})); - EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")), - std::vector({f::GradVarName("y1")})); - EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")), - std::vector({std::string("z1") + f::kZeroVarSuffix})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), - std::vector({f::GradVarName("x1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("H")), - std::vector({f::GradVarName("h1")})); - - EXPECT_EQ(var_to_grad.size(), 4UL); - EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3)); - EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5)); - EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5)); - - EXPECT_TRUE(block->HasVar(f::GradVarName("y1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("h1"))); -} - -TEST(Backward, shared_var) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - f::OpDesc *op1 = block->AppendOp(); - op1->SetType("rowwise_add"); - op1->SetInput("X", {"x1"}); - op1->SetInput("b", {"b1"}); - op1->SetOutput("Out", {"out1"}); - - 
f::OpDesc *op2 = block->AppendOp(); - op2->SetType("mul"); - op2->SetInput("X", {"out1"}); - op2->SetInput("Y", {"y2"}); - op2->SetOutput("Out", {"out2"}); - - f::OpDesc *op3 = block->AppendOp(); - op3->SetType("rowwise_add"); - op3->SetInput("X", {"out1"}); - op3->SetInput("b", {"b3"}); - op3->SetOutput("Out", {"out3"}); - - auto target = f::VarDesc("out3"); - target.SetShape({1}); - size_t forward_len = block->AllOps().size(); - auto var_to_grad = - AppendBackward(program, target, std::unordered_set{}); - - ASSERT_EQ(block->AllOps().size(), 8UL); - f::OpDesc *fill_op = block->AllOps()[forward_len]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - - f::OpDesc *grad_op3 = block->AllOps()[4]; - ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); - ASSERT_EQ(grad_op3->InputNames().size(), 1UL); - ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out3")})); - EXPECT_EQ(grad_op3->Output(f::GradVarName("X")), - std::vector({f::GradVarName("out1") + "@RENAME@0"})); - EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), - std::vector({f::GradVarName("b3")})); - - f::OpDesc *grad_op4 = block->AllOps()[5]; - ASSERT_EQ(grad_op4->Type(), "mul_grad"); - ASSERT_EQ(grad_op4->InputNames().size(), 4UL); - ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op4->Input("X"), std::vector({"out1"})); - EXPECT_EQ(grad_op4->Input("Y"), std::vector({"y2"})); - EXPECT_EQ(grad_op4->Input("Out"), std::vector({"out2"})); - EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out2")})); - EXPECT_EQ(grad_op4->Output(f::GradVarName("X")), - std::vector({f::GradVarName("out1") + "@RENAME@1"})); - EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), - std::vector({f::GradVarName("y2")})); - - f::OpDesc *sum_op = block->AllOps()[6]; - ASSERT_EQ(sum_op->Type(), "sum"); - ASSERT_EQ(sum_op->InputNames().size(), 1UL); - ASSERT_EQ(sum_op->OutputNames().size(), 1UL); - EXPECT_EQ(sum_op->Input("X"), - std::vector({f::GradVarName("out1") + "@RENAME@0", - f::GradVarName("out1") + "@RENAME@1"})); - EXPECT_EQ(sum_op->Output("Out"), - std::vector({f::GradVarName("out1")})); - - f::OpDesc *grad_op1 = block->AllOps()[7]; - ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); - ASSERT_EQ(grad_op1->InputNames().size(), 1UL); - ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); - EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")), - std::vector({f::GradVarName("out1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("X")), - std::vector({f::GradVarName("x1")})); - EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), - std::vector({f::GradVarName("b1")})); - - EXPECT_EQ(var_to_grad.size(), 6UL); - EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4)); - EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5)); - EXPECT_EQ(var_to_grad.at("out1"), - f::GradVarInfo(f::GradVarName("out1"), 0, 6)); - EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7)); - EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7)); - - EXPECT_TRUE(block->HasVar(f::GradVarName("b3"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("y2"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("out1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("x1"))); - EXPECT_TRUE(block->HasVar(f::GradVarName("b1"))); -} - -TEST(Backward, half_backward) { - f::ProgramDesc program; - f::BlockDesc *block = program.MutableBlock(0); - auto *op1 = block->AppendOp(); - op1->SetType("minus"); - op1->SetInput("X", {"a"}); 
- op1->SetInput("Y", {"b"}); - op1->SetOutput("Out", {"out"}); - - auto target = f::VarDesc("out"); - target.SetShape({1}); - size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {"b"}); - f::OpDesc *fill_op = block->AllOps()[forward_len]; - EXPECT_EQ(fill_op->Type(), "fill_constant"); - auto ops = block->AllOps(); - ASSERT_EQ(3UL, ops.size()); - - EXPECT_EQ(var_to_grad.size(), 2UL); - EXPECT_EQ(var_to_grad.at("a"), - f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1)); -} diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 0e44b34383..8af7d2d510 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -14,18 +14,17 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" +#include +#include + #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/net_op.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" -#include - namespace f = paddle::framework; -namespace ops = paddle::operators; void AddOp(const std::string &type, const f::VariableNameMap &inputs, const f::VariableNameMap &outputs, f::AttributeMap attrs, diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5ff987ad8b..3c8696b508 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -100,7 +100,7 @@ function(op_library TARGET) endif() # Define operators that don't need pybind here. - foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") + foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -199,7 +199,6 @@ else() set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op) endif() -op_library(cond_op DEPS framework_proto tensor net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) @@ -259,7 +258,6 @@ endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") cc_test(gather_test SRCS gather_test.cc DEPS tensor) -cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc deleted file mode 100644 index 15dce9e3e2..0000000000 --- a/paddle/fluid/operators/cond_op.cc +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/cond_op.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -using Scope = framework::Scope; -using Variable = framework::Variable; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -framework::Scope& CondOp::AddSubScope(const Scope& scope) const { - auto sub_scopes_var = scope.FindVar("SubScopes"); - PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, - "Output(SubScopes) of CondOp should not be null."); - auto sub_scopes = sub_scopes_var->GetMutable>(); - auto& sub_scope = scope.NewScope(); - sub_scopes->push_back(&sub_scope); - return sub_scope; -} - -std::vector& CondOp::GetSubScopes( - const framework::Scope& scope) const { - auto sub_scopes_var = scope.FindVar("SubScopes"); - PADDLE_ENFORCE_NOT_NULL(sub_scopes_var, - "Output(SubScopes) of CondOp should not be null."); - return *sub_scopes_var->GetMutable>(); -} - -LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const { - auto index_tensors_var = scope.FindVar("IndexTensors"); - PADDLE_ENFORCE_NOT_NULL(index_tensors_var, - "Output(IndexTensors) of CondOp should not be null."); - auto& index_tensors = - *index_tensors_var->GetMutable>(); - index_tensors.push_back(LoDTensor()); - return index_tensors.back(); -} - -std::vector& CondOp::GetIndexTensors( - const framework::Scope& scope) const { - auto* index_tensors_var = scope.FindVar("IndexTensors"); - PADDLE_ENFORCE_NOT_NULL(index_tensors_var, - "Output(IndexTensors) of CondOp should not be null."); - return *index_tensors_var->GetMutable>(); -} - -void CondOp::PrepareDataForSubnet( - const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const { - PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty."); - - for (int i = 0; i < BRANCH_NUM; ++i) { - // Create two sub scopes for true and false branches - // sub_scopes[0] for the true branch - // sub_scopes[1] for the false branch - AddSubScope(scope); - // Create two tensors for true and false indices: - // index_tensors[0] for the true branch - // index_tensors[1] for the false branch - AddIndexTensor(scope); - } - - Variable* cond_var = scope.FindVar(Input("Cond")); - PADDLE_ENFORCE_NOT_NULL(cond_var, - "Input(Cond) of CondOp should not be null."); - const LoDTensor* cond = cond_var->GetMutable(); - - // get the true/false index at runtime according to cond tensor - // index_vectors[0]: vector, contains all index for cond[i] == true - // index_vectors[1]: vector, contains all index for cond[i] == false - std::vector> index_vectors; - index_vectors.resize(BRANCH_NUM); - - const int* cond_data = cond->data(); - for (int i = 0; i < cond->dims()[0]; ++i) { - if (cond_data[i]) - index_vectors[TRUE_BRANCH].push_back(i); - else - index_vectors[FALSE_BRANCH].push_back(i); - } - - // put index_vectors[0] and index_vectors[1] into two tensors: - // index_tensors[0] and index_tensors[1] - std::vector& index_tensors = GetIndexTensors(scope); - std::vector& sub_scopes = GetSubScopes(scope); - - for (int i = 0; i < BRANCH_NUM; ++i) { - DDim dim = {static_cast(index_vectors[i].size())}; - int* index_tensor_data_ptr = - index_tensors[i].mutable_data(dim, platform::CPUPlace()); - memcpy(index_tensor_data_ptr, index_vectors[i].data(), - dim[0] * sizeof(int)); - } - - // create input in subscopes according to index_vectors - for (auto& input : Inputs("Xs")) { - Variable* var_parent = 
scope.FindVar(input); - PADDLE_ENFORCE_NOT_NULL(var_parent); - const auto* tensor_parent = &var_parent->Get(); - - for (int i = 0; i < BRANCH_NUM; ++i) { - Variable* var_child = sub_scopes[i]->FindVar(input); - PADDLE_ENFORCE_NOT_NULL(var_child); - auto* tensor_child = var_child->GetMutable(); - - // Resize child - DDim dim = tensor_parent->dims(); - dim[0] = index_tensors[i].dims()[0]; - tensor_child->mutable_data(dim, platform::CPUPlace()); - - CPUGather(dev_ctx, *tensor_parent, index_tensors[i], tensor_child); - } - } - - // create output_tensors in subscope for sub_net - for (int i = 0; i < BRANCH_NUM; ++i) { - for (auto& output : (*sub_net_op_[i]).Outputs()) { - for (auto& var_name : output.second) { - sub_scopes[i]->Var(var_name); - } - } - } -} - -void CondOp::MergeDataFromSubnet(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const { - std::vector& sub_scopes = GetSubScopes(scope); - const std::vector& index_tensors = - GetIndexTensors(scope); - - // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0] - PADDLE_ENFORCE(!Outputs("Outs").empty(), - "Outputs(Outs) of CondOp can't be empty."); - for (auto& output : Outputs("Outs")) { - const LoDTensor* tensor_t_out = - &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get(); - PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL"); - const LoDTensor* tensor_f_out = - &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get(); - PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL"); - - auto* var_out = scope.FindVar(output); - PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found"); - LoDTensor* tensor_out = var_out->GetMutable(); - PADDLE_ENFORCE_NOT_NULL(tensor_t_out, - "True output tensor should not be NULL"); - - DDim true_dim = tensor_t_out->dims(); - DDim false_dim = tensor_f_out->dims(); - true_dim[0] = 0; - false_dim[0] = 0; - PADDLE_ENFORCE_EQ(true_dim, false_dim, - "Outputs not of the same shape except the first dim"); - - DDim out_dim = tensor_t_out->dims(); - out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0]; - tensor_out->Resize(out_dim); - tensor_out->mutable_data(platform::CPUPlace()); - } - - // merge output results: - // output_tensor = true_output_tensor + false_output_tensor - for (auto& output : Outputs("Outs")) { - Variable* var_parent = scope.FindVar(output); - PADDLE_ENFORCE_NOT_NULL(var_parent); - auto* tensor_parent = var_parent->GetMutable(); - - for (int i = 0; i < BRANCH_NUM; ++i) { - Variable* var_child = sub_scopes[i]->FindVar(output); - PADDLE_ENFORCE_NOT_NULL(var_child); - auto* tensor_child = &var_child->Get(); - ScatterAssign(dev_ctx, *tensor_child, index_tensors[i], - tensor_parent); - } - } -} - -void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const { - // get device context from pool - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(place); - - PrepareDataForSubnet(scope, dev_ctx); - std::vector& sub_scopes = GetSubScopes(scope); - for (int i = 0; i < BRANCH_NUM; ++i) { - sub_net_op_[i]->Run(*sub_scopes[i], place); - } - MergeDataFromSubnet(scope, dev_ctx); -} - -class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { - public: - CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Cond", "The condition, which is a bool vector"); - AddInput("Xs", "Inputs of Subnets").AsDuplicable(); - AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable(); - - 
AddOutput("SubScopes", "sub scopes for true and false branches"); - AddOutput("IndexTensors", "Index Tensors contains indices for true/false"); - - AddComment(R"DOC( -Sample Dependent Conditional Operator. - -Given Cond[i] as a 1/0 vector to indicate true/false: -Out[i] = subnet_true[i], if Cond[i] == true -Out[i] = subnet_false[i], if Cond[i] == false - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp, - paddle::operators::CondOpProtoAndCheckerMaker); diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h deleted file mode 100644 index d3888923db..0000000000 --- a/paddle/fluid/operators/cond_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/net_op.h" - -namespace paddle { -namespace operators { - -/* - * @brief CondOp is a dynamic if-else Operator - * - * It has a input tensor named cond indicating which netop each instance will - * run. - * - * if cond == 1, it will run true_net, which is a NetOp. - * - * if cond == 0, it will run false_net, which is another NetOp. - */ -class CondOp : public framework::OperatorBase { - public: - CondOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) { - sub_net_op_.resize(BRANCH_NUM); - } - - CondOp(const CondOp& o) - : framework::OperatorBase( - static_cast(o)) { - // TODO(yuyang18): Implement copy ctor well. 
- PADDLE_THROW("Not implemented"); - } - - framework::Scope& AddSubScope(const framework::Scope& scope) const; - std::vector& GetSubScopes( - const framework::Scope& scope) const; - - framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const; - std::vector& GetIndexTensors( - const framework::Scope& scope) const; - - void PrepareDataForSubnet(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const; - void MergeDataFromSubnet(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const; - - /* - * Set True Block - */ - void set_truenet(std::unique_ptr&& net) { - sub_net_op_[TRUE_BRANCH] = std::move(net); - } - - /* - * Set False Block - */ - void set_falsenet(std::unique_ptr&& net) { - sub_net_op_[FALSE_BRANCH] = std::move(net); - } - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override; - - private: - const int TRUE_BRANCH = 0; - const int FALSE_BRANCH = 1; - const int BRANCH_NUM = 2; - - // sub_net_op_[0]: subnet_t - // sub_net_op_[1]: subnet_f - std::vector> sub_net_op_; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 5790deb54d..a302b24560 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/minus_op.h" + #include #include -#include "paddle/fluid/operators/net_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc deleted file mode 100644 index 0c2da74417..0000000000 --- a/paddle/fluid/operators/net_op.cc +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/net_op.h" -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -const char NetOp::kAll[] = "all"; - -void NetOp::CompleteAddOp(bool calc) { - add_op_done_ = true; - if (!calc) return; - std::set input_set; - std::set output_set; - for (auto& op : ops_) { - for (auto& ipt : op->Inputs()) { - for (auto& var_name : ipt.second) { - // If input variable has been in output set, then it will be - // added into intermediate_outputs_. Otherwise, it will be - // added into input set. 
- if (Contains(output_set, var_name)) { - intermediate_outputs_.insert(var_name); - } else { - input_set.insert(var_name); - } - } - } - - for (auto& opt : op->Outputs()) { - for (auto& var_name : opt.second) { - output_set.insert(var_name); - } - } - } - auto& inputs = inputs_[kAll]; - inputs.reserve(input_set.size()); - std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs)); - auto& outputs = outputs_[kAll]; - outputs.reserve(output_set.size()); - std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs)); -} - -std::string NetOp::DebugStringEx(const framework::Scope* scope) const { - std::ostringstream os; - os << OperatorBase::DebugStringEx(scope) << std::endl; - for (auto& op : ops_) { - std::istringstream is(op->DebugStringEx(scope)); - for (std::string line; std::getline(is, line);) { - os << " " << line << std::endl; - } - } - return os.str(); -} - -bool NetOp::IsNetOp() const { return true; } - -std::vector NetOp::OutputVars(bool has_intermediate) const { - std::vector all; - for (auto& pair : this->outputs_) { - for (auto& var_name : pair.second) { - all.push_back(var_name); - } - } - if (has_intermediate) { - return all; - } - std::vector ret_val; - for (auto& each : all) { - if (!Contains(intermediate_outputs_, each)) { - ret_val.push_back(each); - } - } - return ret_val; -} - -NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - -std::unique_ptr NetOp::Clone() const { - PADDLE_ENFORCE( - add_op_done_, - "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone"); - return std::unique_ptr(new NetOp(*this)); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h deleted file mode 100644 index cbf8820cf4..0000000000 --- a/paddle/fluid/operators/net_op.h +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -/** - * @brief Network is also a type of Operator - * - * It will manage the operators it has. - * - * Network is the container and controller of a set of operators. - - * A network object knows all Operators belonging to this network. Variables, - * which are inputs and outputs of these operators, are created and managed by a - * hierarchy of Scope objects. - * - * This is the base class of network, all the networks should implement the APIs - * it defines. 
- */ -class NetOp : public framework::OperatorBase { - public: - static const char kAll[]; - NetOp() - : framework::OperatorBase("plain_net", framework::VariableNameMap{}, - framework::VariableNameMap{}, - framework::AttributeMap{}) {} - - NetOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs); - - NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) { - this->ops_.reserve(o.ops_.size()); - std::transform( - o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_), - [](const std::unique_ptr& op) { - return std::unique_ptr(op->Clone()); - }); - this->CompleteAddOp(); - } - - bool SupportGPU() const override { - for (auto& op : ops_) { - if (!op->SupportGPU()) { - return false; - } - } - return true; - } - - void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); } - - /** - * @brief Add an operator by ptr - */ - void AppendOp(std::unique_ptr op) { - PADDLE_ENFORCE(!add_op_done_, - "Cannot AppendOp when this network is sealed"); - PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); - ops_.push_back(std::move(op)); - } - - void InsertOp(size_t pos, std::unique_ptr op) { - PADDLE_ENFORCE(!add_op_done_, - "Cannot InsertOp when this network is sealed"); - PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); - PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range"); - ops_.insert(ops_.begin() + pos, std::move(op)); - } - - void InsertOp(size_t pos, const framework::OperatorBase& op) { - InsertOp(pos, op.Clone()); - } - - void CompleteAddOp(bool calculate = true); - - std::string DebugStringEx( - const framework::Scope* scope = nullptr) const override; - - bool IsNetOp() const override; - std::vector OutputVars(bool has_intermediate) const override; - - std::unique_ptr Clone() const override; - - std::vector> ops_; - - private: - /** - * @brief Run the network. - * - * Run all the operators with the `scope`, if no scope is provided, default - * scope will be used instead. If no OpContext is provicded, default context - * will be used. - */ - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - for (auto& op : ops_) { - op->Run(scope, place); - } - } - - bool add_op_done_{false}; - std::set intermediate_outputs_; - - template - static bool Contains(T container, KeyType key) { - return container.find(key) != container.end(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc deleted file mode 100644 index 3b5f575485..0000000000 --- a/paddle/fluid/operators/net_op_test.cc +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
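
Note: the NetOp removed above was a plain container operator. Ops appended to it run in order, CompleteAddOp derives the net-level inputs and outputs under the pseudo-key "all", and any input that an earlier op in the net already produced is recorded as an intermediate output and hidden from OutputVars(false). The following is a minimal Python sketch of that behavior, written only to document what the deleted net_op_test.cc below exercises; the class name PlainNet and its methods are invented for the sketch and are not part of any Paddle API.

    # Illustrative model of the removed NetOp container semantics (not a Paddle API).
    class PlainNet(object):
        def __init__(self):
            self.ops = []              # (op_type, inputs, outputs), run in order
            self.inputs = set()
            self.outputs = set()
            self.intermediate = set()  # outputs consumed by a later op inside the net
            self.sealed = False

        def append_op(self, op_type, inputs, outputs):
            assert not self.sealed, "cannot append to a sealed net"
            self.ops.append((op_type, list(inputs), list(outputs)))

        def complete_add_op(self):
            # Mirrors NetOp::CompleteAddOp: an input that an earlier op produced
            # becomes an intermediate output instead of a net-level input.
            for _, inputs, outputs in self.ops:
                for name in inputs:
                    if name in self.outputs:
                        self.intermediate.add(name)
                    else:
                        self.inputs.add(name)
                self.outputs.update(outputs)
            self.sealed = True

        def output_vars(self, has_intermediate=False):
            if has_intermediate:
                return sorted(self.outputs)
            return sorted(self.outputs - self.intermediate)

    # Same topology as the deleted TEST(OpKernel, all): x,w1,b1 -> y; y,w2,b2 -> z.
    net = PlainNet()
    net.append_op("test", ["x", "w1", "b1"], ["y"])
    net.append_op("test", ["y", "w2", "b2"], ["z"])
    net.complete_add_op()
    assert net.output_vars() == ["z"]   # "y" is intermediate, so only "z" survives
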
-#include "paddle/fluid/operators/net_op.h" - -#include - -namespace paddle { -namespace operators { -using Scope = framework::Scope; -using DeviceContext = platform::DeviceContext; - -static int run_cnt = 0; - -class TestOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - DEFINE_OP_CLONE_METHOD(TestOp); - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override { - ++run_cnt; - } -}; - -template -void AssertSameVectorWithoutOrder(const std::vector& expected, - const std::vector& actual) { - ASSERT_EQ(expected.size(), actual.size()); - std::unordered_set expected_set; - for (auto& tmp : expected) { - expected_set.insert(tmp); - } - for (auto& act : actual) { - ASSERT_NE(expected_set.end(), expected_set.find(act)); - } -} - -TEST(OpKernel, all) { - auto net = std::make_shared(); - ASSERT_NE(net, nullptr); - - net->AppendOp(std::unique_ptr( - new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, framework::AttributeMap{}))); - net->AppendOp(std::unique_ptr( - new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, - {{"Out", {"z"}}}, framework::AttributeMap{}))); - - net->CompleteAddOp(); - AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, - net->Inputs(NetOp::kAll)); - AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll)); - - auto final_outs = net->OutputVars(false); - - ASSERT_EQ(final_outs.size(), 1UL); - ASSERT_EQ(final_outs[0], "z"); -} - -TEST(NetOp, insert_op) { - NetOp net; - auto op1 = std::unique_ptr( - new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, framework::AttributeMap{})); - net.AppendOp(*op1); - net.InsertOp(0, *op1); - ASSERT_EQ(2UL, net.ops_.size()); - net.InsertOp(2, std::move(op1)); - ASSERT_EQ(3UL, net.ops_.size()); -} - -TEST(NetOp, Clone) { - NetOp net; - net.AppendOp(std::unique_ptr(new framework::NOP{ - "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, - framework::AttributeMap{}})); - net.AppendOp(std::unique_ptr(new framework::NOP{ - "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, - framework::AttributeMap{}})); - net.CompleteAddOp(true); - auto new_net_op = net.Clone(); - ASSERT_NE(new_net_op, nullptr); - ASSERT_TRUE(new_net_op->IsNetOp()); - auto* new_net = static_cast(new_net_op.get()); - ASSERT_EQ(2UL, new_net->ops_.size()); - ASSERT_EQ(new_net->ops_[0]->Type(), "empty"); - ASSERT_EQ(new_net->ops_[1]->Type(), "empty2"); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 447b854544..7fb45bd19d 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/net_op.h" + +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index b16d06df8d..7ca7639fdb 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/scale_op.h" -#include "paddle/fluid/operators/net_op.h" + +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index dffac772f1..e745509ec8 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/split_op.h" -#include "paddle/fluid/operators/net_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d559743a69..e571da42cb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -20,8 +20,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/pybind/protobuf.h" - #include "paddle/fluid/framework/backward.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" @@ -31,18 +29,18 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/cond_op.h" -#include "paddle/fluid/operators/net_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/pybind/pybind.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/tensor_py.h" @@ -239,11 +237,6 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::reference) #endif - .def("get_net", - [](Variable &self) -> operators::NetOp * { - return self.GetMutable(); - }, - py::return_value_policy::reference) .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); @@ -420,42 +413,6 @@ All parameter, weight, gradient are variables in Paddle. 
[](const OperatorBase &op) { return op.OutputVars(false); }) .def("support_gpu", &OperatorBase::SupportGPU); - py::class_(m, "Net") - .def_static("create", - []() -> operators::NetOp * { - auto *retv = new operators::NetOp; - retv->SetType("plain_net"); - return retv; - }) - .def("append_op", [](operators::NetOp &self, - const OperatorBase &op) { self.AppendOp(op); }) - .def("complete_add_op", &operators::NetOp::CompleteAddOp) - .def("complete_add_op", [](std::shared_ptr &self) { - self->CompleteAddOp(); - }); - - // cond_op - py::class_(m, "CondOp") - .def_static("create", - [](py::bytes protobin) -> operators::CondOp * { - proto::OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - auto cond_op = OpRegistry::CreateOp(desc); - return static_cast(cond_op.release()); - }) - .def("set_truenet", - [](operators::CondOp &self, const operators::NetOp &net) -> void { - self.set_truenet(net.Clone()); - }) - .def("set_falsenet", - [](operators::CondOp &self, const operators::NetOp &net) -> void { - self.set_falsenet(net.Clone()); - }); - py::class_(m, "Executor") .def(py::init()) .def("run", diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc index e7ebbba452..82d9aa601c 100644 --- a/paddle/fluid/recordio/chunk.cc +++ b/paddle/fluid/recordio/chunk.cc @@ -14,13 +14,13 @@ #include "paddle/fluid/recordio/chunk.h" +#include #include #include #include #include "paddle/fluid/platform/enforce.h" -#include "snappy_stream/include/snappystream.hpp" -#include "zlib/include/zlib.h" +#include "snappystream.hpp" namespace paddle { namespace recordio { diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc index ed09d58f6a..c4822329a4 100644 --- a/paddle/fluid/recordio/header.cc +++ b/paddle/fluid/recordio/header.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/recordio/header.h" + +#include + #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 10aa63e18a..7ecf9a1459 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -14,23 +14,13 @@ import unittest import numpy as np -from op_test import OpTest import paddle.fluid.core as core from paddle.fluid.op import Operator +import paddle.fluid as fluid +from op_test import OpTest from paddle.fluid.framework import grad_var_name -def get_backward_op(scope, op, no_grad_set): - backward_op = core.Operator.backward(op, no_grad_set) - for input in backward_op.input_vars(): - var = scope.var(input) - var.get_tensor() - for output in backward_op.output_vars(): - var = scope.var(output) - var.get_tensor() - return backward_op - - def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): x_shape = x.shape if len(x_shape) == 2: @@ -64,11 +54,6 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format): x_shape = x.shape - if len(x_shape) == 2: - if data_format == "NCHW": - x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) - else: - x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) if data_format == "NCHW": n, c, h, w = x.shape @@ -88,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile - if len(x_shape) == 2: - y = np.reshape(y, (y.shape[0], y.shape[1])) return y, mean, var elif data_format == "NHWC": x_square = x * x @@ -100,59 +83,42 @@ def _reference_training(x, scale, offset, epsilon, data_format): var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) y = normalized * scale + offset - if len(x_shape) == 2: - y = np.reshape(y, x_shape) return y, mean, var else: raise ValueError("Unknown data order.") -def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): +def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): # Use the following formulas to calculate gradients: # grad_scale = # sum(grad_y * (x - mean)) * rsqrt(var + epsilon) # # grad_offset = sum(output_y) # - # grad_x = + # x_grad = # 1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) - # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation - x_shape = x.shape - - if len(x_shape) == 2: - if data_format == "NCHW": - x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) - grad_y = np.reshape(grad_y, - (grad_y.shape[0], grad_y.shape[1], 1, 1)) - else: - x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) - grad_y = np.reshape(grad_y, - (grad_y.shape[0], 1, 1, grad_y.shape[1])) - if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) - grad_y = np.transpose(grad_y, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) - # raise ValueError("data_format must be NHWC, got %s." 
% data_format) - grad_x = scale * (grad_y - np.mean( - grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean( - grad_y * (x - mean), axis=(0, 1, 2)) / + x_grad = scale * (y_grad - np.mean( + y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean( + y_grad * (x - mean), axis=(0, 1, 2)) / (var + epsilon)) / np.sqrt(var + epsilon) - grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon), + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), axis=(0, 1, 2)) - grad_offset = np.sum(grad_y, axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) # transfer back to N, C, H, W if data_format == "NCHW": - grad_x = np.transpose(grad_x, (0, 3, 1, 2)) + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) x = np.transpose(x, (0, 3, 1, 2)) - grad_y = np.transpose(grad_y, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) - if len(x_shape) == 2: - grad_x = np.reshape(grad_x, x_shape) - return grad_x, grad_scale, grad_offset + return x_grad, grad_scale, grad_offset def create_or_get_tensor(scope, var_name, var, place): @@ -186,7 +152,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None): __set_tensor__(output, data) -class TestBatchNormOpInference(OpTest): +class TestBatchNormOpInference(unittest.TestCase): def setUp(self): self.dtype = np.float32 @@ -304,231 +270,121 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): self.check_with_place(place, data_format, self.dtype, [2, 3]) -class TestBatchNormOpTraining(OpTest): +class TestBatchNormOpTraining(unittest.TestCase): def __assert_close(self, tensor, np_array, msg, atol=1e-4): + if not np.allclose(np.array(tensor), np_array, atol=atol): + import pdb + pdb.set_trace() self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) - def test_python_testing(self): - data_format = "NHWC" - epsilon = 0.00001 - - n, h, w, c = 2, 3, 4, 5 - x_shape = [n, h, w, c] - scale_shape = [c] - - x_val = np.random.random_sample(x_shape).astype(np.float32) - scale_val = np.random.random_sample(scale_shape).astype(np.float32) - bias_val = np.random.random_sample(scale_shape).astype(np.float32) - - mean = np.zeros(scale_shape).astype(np.float32) - variance = np.ones(scale_shape).astype(np.float32) - - y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, - epsilon, "NHWC") - - # running N, C, H, W case - # should produce the same results - x_shape2 = [n, c, h, w] - x_val2 = np.transpose(x_val, (0, 3, 1, 2)) - y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance, - epsilon, "NCHW") - - # transfer (N, C, H, W) back to (N, H, W, C) - y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1)) - self.__assert_close(y_out, y_out2_trans, "inference output") - print 'python: NHWC, NCHW, inference checking passed' - - def test_python_training(self): - data_format = "NHWC" - epsilon = 0.00001 - momentum = 0.9 - - # N, H, W, C: 2, 3, 4, 2 - n, h, w, c = 2, 3, 4, 5 - x_shape = [n, h, w, c] - scale_shape = [c] - - x_val = np.random.random_sample(x_shape).astype(np.float32) - scale_val = np.random.random_sample(scale_shape).astype(np.float32) - bias_val = np.random.random_sample(scale_shape).astype(np.float32) - - mean = np.zeros(scale_shape).astype(np.float32) - variance = np.ones(scale_shape).astype(np.float32) - - # run forward - y_out, saved_mean, var_ref = _reference_training( - x_val, scale_val, bias_val, epsilon, "NHWC") - - # - mean_out = saved_mean * (1. - momentum) + momentum * mean - variance_out = var_ref * (1. - momentum) + momentum * variance - saved_variance = 1. 
/ np.sqrt(var_ref + epsilon) - - # running N, C, H, W case - # should produce the same results - x_shape2 = [n, c, h, w] - x_val2 = np.transpose(x_val, (0, 3, 1, 2)) - y_out2, saved_mean2, var_ref2 = _reference_training( - x_val2, scale_val, bias_val, epsilon, "NCHW") - - self.__assert_close(saved_mean, saved_mean2, "batch mean") - self.__assert_close(var_ref, var_ref2, "batch variance") - - # transfer (N, C, H, W) back to (N, H, W, C) - y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1)) - self.__assert_close(y_out, y_out2_trans, "batch output") - print 'python: NHWC, NCHW, forward checking passed' - - # test backward now - # NHWC - self.y_grad = np.random.random_sample(x_shape).astype(np.float32) - y_grad = self.y_grad - # y_grad = np.ones(x_shape).astype(np.float32) - x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( - x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC") - - # NCHW - y_grad2 = np.transpose(y_grad, (0, 3, 1, 2)) - # y_grad2 = np.ones(x_shape2).astype(np.float32) - x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad( - x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW") - - self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient") - self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient") - - x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1)) - self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient") - print 'python: NHWC, NCHW, backward checking passed' - def test_forward_backward(self): def test_with_place(place, data_layout, shape): # attr epsilon = 0.00001 momentum = 0.9 - - if len(shape) == 2: - x_shape = shape - c = shape[1] + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] else: - # n, h, w, c = 2, 3, 4, 2 n, h, w, c = shape[0], shape[1], shape[2], shape[3] - if data_format == "NHWC": - x_shape = [n, h, w, c] - elif data_format == "NCHW": - x_shape = [n, c, h, w] - else: - raise ValueError("Unknown data type.") scale_shape = [c] - x_val = np.random.random_sample(x_shape).astype(np.float32) - scale_val = np.random.random_sample(scale_shape).astype(np.float32) - bias_val = np.random.random_sample(scale_shape).astype(np.float32) - + np.random.seed(123) + x = np.random.random_sample(shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) mean = np.zeros(scale_shape).astype(np.float32) variance = np.ones(scale_shape).astype(np.float32) # run forward - y_out, saved_mean, var_ref = _reference_training( - x_val, scale_val, bias_val, epsilon, data_format) - - # update moving mean and variance + y, saved_mean, var_ref = _reference_training(x, scale, bias, + epsilon, data_layout) mean_out = saved_mean * (1. - momentum) + momentum * mean variance_out = var_ref * (1. - momentum) + momentum * variance saved_variance = 1. / np.sqrt(var_ref + epsilon) - - # for gradient test - # y_grad = np.ones(x_shape).astype(np.float32) - y_grad = np.zeros(x_shape).astype(np.float32) - if len(y_grad.shape) == 2: - y_grad[0, 0] = 1. - else: - y_grad[0, 0, 0, 0] = 1. 
- # y_grad = np.random.random_sample(x_shape).astype(np.float32) - x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( - x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, - data_format) - - scope = core.Scope() - - # create input - x_tensor = create_or_get_tensor(scope, "x_val", x_val, place) - scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val, - place) - bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val, - place) - mean_tensor = create_or_get_tensor(scope, "mean", mean, place) - variance_tensor = create_or_get_tensor(scope, "variance", variance, - place) - - # create output - y_tensor = create_or_get_tensor(scope, "y_out", None, place) - saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None, - place) - saved_variance_tensor = create_or_get_tensor( - scope, "saved_variance", None, place) - mean_out_tensor = mean_tensor - variance_out_tensor = variance_tensor - - batch_norm_op = Operator( - "batch_norm", - # inputs - X="x_val", - Scale="scale_val", - Bias="bias_val", - Mean="mean", - Variance="variance", - # outputs - Y="y_out", - MeanOut="mean", - VarianceOut="variance", - SavedMean="saved_mean", - SavedVariance="saved_variance", - # attrs - is_test=False, - data_layout=data_layout, - momentum=momentum, - epsilon=epsilon) - - batch_norm_op.run(scope, place) - - # check forward result - self.__assert_close(y_tensor, y_out, "y_out") - self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean") - self.__assert_close(saved_variance_tensor, saved_variance, - "saved_variance") - self.__assert_close(mean_out_tensor, mean_out, "mean_out") - if isinstance(place, core.CUDAPlace): - atol = 5e-2 - else: - atol = 1e-4 - self.__assert_close(variance_out_tensor, variance_out, - "variance_out", atol) - print "op test forward passed: ", str(place), data_layout - # run backward - batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set()) - set_output_grad( - scope, - ["y_out", "mean", "variance", "saved_mean", "saved_variance"], - place, - feed_dict={"y_out": y_grad}) - batch_norm_op_grad.run(scope, place) - - x_grad_tensor = create_or_get_tensor(scope, - grad_var_name("x_val"), None, - place) - scale_grad_tensor = create_or_get_tensor(scope, - grad_var_name("scale_val"), - None, place) - bias_grad_tensor = create_or_get_tensor(scope, - grad_var_name("bias_val"), - None, place) + y_grad = np.random.random_sample(shape).astype(np.float32) + x_grad, scale_grad, bias_grad = _reference_grad( + x, y_grad, scale, saved_mean, var_ref, epsilon, data_format) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + + var_names = [ + 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean', + 'saved_variance' + ] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + bn_op = block.append_op( + type="batch_norm", + inputs={ + "X": block.var('x'), + "Scale": block.var('scale'), + "Bias": block.var('bias'), + "Mean": block.var('mean'), + "Variance": block.var('variance') + }, + outputs={ + "Y": block.var('y'), + "MeanOut": block.var('mean'), # share the same memory + "VarianceOut": + block.var('variance'), # share the same memory + "SavedMean": block.var('saved_mean'), + "SavedVariance": block.var('saved_variance') + }, + attrs={ + "momentum": momentum, + "epsilon": epsilon, + "is_test": False, + "data_layout": data_layout + }) + 
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + bn_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + exe = fluid.Executor(place) + out = exe.run( + program, + feed={ + name: var_dict[name] + for name in + ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD'] + }, + fetch_list=[ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', + 'x@GRAD', 'scale@GRAD', 'bias@GRAD' + ]) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean_out, out[1], "mean") + self.__assert_close(variance_out, out[2], "variance", 1e-3) + self.__assert_close(saved_mean, out[3], "saved_mean") + self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3) + self.__assert_close(x_grad, out[5], "x_grad") + self.__assert_close(scale_grad, out[6], "scale_grad") + self.__assert_close(bias_grad, out[7], "bias_grad") - # check gradient output - self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad") - self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad") - self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad") - print "op test backward passed: ", str(place), data_layout + print "op test forward passed: ", str(place), data_layout places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): @@ -537,7 +393,6 @@ class TestBatchNormOpTraining(OpTest): for place in places: for data_format in ["NCHW", "NHWC"]: test_with_place(place, data_format, [2, 3, 4, 5]) - test_with_place(place, data_format, [2, 3]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_cond_op.py b/python/paddle/fluid/tests/unittests/test_cond_op.py deleted file mode 100644 index 66fbae961a..0000000000 --- a/python/paddle/fluid/tests/unittests/test_cond_op.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import paddle.fluid.core as core -import unittest -import numpy as np -from paddle.fluid.op import Operator, CondOp - - -class PySimpleCond(object): - ''' - A simple implementation of dynamic if-else based on numpy - ''' - - def __init__(self): - array = [1] * 10 - for i in range(1, 10, 2): - array[i] = 0 - self.cond = np.array(array) - self.x = np.ones(shape=(10, 1)).astype("float32") - - def forward(self): - self.index_t = np.where(self.cond == 1) - self.index_f = np.where(self.cond == 0) - y_t = self.x[self.index_t] - y_f = self.x[self.index_f] - y_t = y_t * 2. - y_f = y_f * (-2.) 
- output = np.zeros(shape=(10, 1)) - output[self.index_t] = y_t - output[self.index_f] = y_f - return output - - -class PySimpleCondTest(unittest.TestCase): - def setUp(self): - self.condnn = PySimpleCond() - - def test_forward(self): - output = self.condnn.forward() - - -def create_tensor(scope, name, shape, np_data): - tensor = scope.var(name).get_tensor() - tensor.set_dims(shape) - tensor.set(np_data, core.CPUPlace()) - return tensor - - -class TestCondOp(unittest.TestCase): - ''' - Test CondOp - - equation: - cond = [True, False, True, False, ...] - y[index_t] = x[index_t] * 2. - y[index_f] = x[index_f] * -2. - outputs: - y - ''' - - def setUp(self): - self.py_cond = PySimpleCond() - - def forward(self): - self.scope = core.Scope() - self.create_global_variables() - self.create_cond_op() - self.create_sub_net() - self.condop.run(self.scope, core.CPUPlace()) - return np.array(self.scope.find_var("Out").get_tensor()) - - def create_global_variables(self): - x_np_data = self.py_cond.x - create_tensor(self.scope, "X", [10, 1], x_np_data) - cond_np_data = self.py_cond.cond.astype("int32") - create_tensor(self.scope, "cond", [10, 1], cond_np_data) - self.scope.var("SubScopes") - self.scope.var("IndexTensors") - self.scope.var("Out") - - def create_cond_op(self): - self.condop = CondOp( - Cond="cond", - Xs=["X"], - Outs=["Out"], - SubScopes="SubScopes", - IndexTensors="IndexTensors") - - def create_sub_net(self): - truenet = core.Net.create() - scale_op_t = Operator("scale", X='X', Out='Out', scale=2.) - truenet.append_op(scale_op_t) - truenet.complete_add_op(True) - self.condop.set_truenet(truenet) - - falsenet = core.Net.create() - scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.) - falsenet.append_op(scale_op_t) - falsenet.complete_add_op(True) - self.condop.set_falsenet(falsenet) - - def test_forward(self): - print 'test cond op forward' - pd_output = self.forward() - py_output = self.py_cond.forward() - print 'pd_output', pd_output - print - print 'py_output', py_output - self.assertEqual(pd_output.shape, py_output.shape) - print 'test passed' - return 0 - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 8c67e45b7f..69365db4d1 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -15,10 +15,8 @@ import unittest import numpy as np from operator import mul -from op_test import OpTest import paddle.fluid.core as core -from paddle.fluid.op import Operator -from paddle.fluid.framework import grad_var_name +import paddle.fluid as fluid np.random.random(123) @@ -70,161 +68,93 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1): return grad_x, d_scale, d_bias -def get_backward_op(scope, op, no_grad_set): - backward_op = core.Operator.backward(op, no_grad_set) - for input in backward_op.input_vars(): - var = scope.var(input) - var.get_tensor() - for output in backward_op.output_vars(): - var = scope.var(output) - var.get_tensor() - return backward_op - - -def create_or_get_tensor(scope, var_name, var, place): - tensor = scope.var(var_name).get_tensor() - if var is not None: - assert isinstance(var, np.ndarray) - tensor.set_lod([[]]) - tensor.set_dims(var.shape) - tensor.set(var, place) - return tensor - - -def set_output_grad(scope, outputs, place, feed_dict=None): - def __set_tensor__(name, data=None): - out_tensor = 
scope.find_var(name).get_tensor() - grad_tensor = scope.var(grad_var_name(name)).get_tensor() - out_dtype = out_tensor.dtype() - if data is None: - if out_dtype == core.VarDesc.VarType.FP64: - data = np.ones(out_tensor.shape(), dtype=np.float64) - elif out_dtype == core.VarDesc.VarType.FP32: - data = np.ones(out_tensor.shape(), dtype=np.float32) - else: - raise ValueError("Not supported data type " + str(out_dtype)) - grad_tensor.set(data, place) - - for output in outputs: - data = None - if output in feed_dict: - data = feed_dict[output] - __set_tensor__(output, data) - - -class TestLayerNormdOp(OpTest): +class TestLayerNormdOp(unittest.TestCase): def __assert_close(self, tensor, np_array, msg, atol=1e-4): self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) - def __assert_grad_close(self, - tensor, - np_array, - name, - place, - max_relative_error=0.02): - a = np.array(tensor) - b = np_array - abs_a = np.abs(a) - abs_a[abs_a < 1e-5] = 1 - - diff_mat = np.abs(a - b) / abs_a - max_diff = np.max(diff_mat) - - def err_msg(): - offset = np.argmax(diff_mat > max_relative_error) - return ("%s Variable %s max gradient diff %f over limit %f, " - "the first error element is %d, %f, %f") % ( - "Gradient Check On %s" % str(place), name, max_diff, - max_relative_error, offset, a.flatten()[offset], - b.flatten()[offset]) - - self.assertLessEqual(max_diff, max_relative_error, err_msg()) - def check_forward_backward(self, shape, begin_norm_axis): - def test_with_place(place, shape, begin_norm_axis=1): - # setUp - assert begin_norm_axis > 0 and begin_norm_axis < len( - shape), 'begin_norm_axis must be between 0 and len(shape)-1.' + def test_with_place(place, shape, begin_norm_axis): # attr epsilon = 0.00001 x_shape = shape D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) scale_shape = [D] - x_val = np.random.random_sample(x_shape).astype(np.float32) - scale_val = np.random.random_sample(scale_shape).astype(np.float32) - bias_val = np.random.random_sample(scale_shape).astype(np.float32) + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) y_grad = np.random.random_sample(x_shape).astype(np.float32) - # run forward - y_out, saved_mean, var_ref = _reference_layer_norm_naive( - x_val, scale_val, bias_val, epsilon, begin_norm_axis) - naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref} - - # get gradient - x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad( - x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis) - naive_grad = { - "X": x_grad_ref, - "Scale": scale_grad_ref, - "Bias": bias_grad_ref - } - - scope = core.Scope() - - # create input - input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val} - for i_name in input_map: - create_or_get_tensor(scope, i_name, input_map[i_name], place) - - # create output - output_map = {"Y": None, "Mean": None, "Variance": None} - output_tensor = {} - for o_name in output_map: - output_tensor[o_name] = create_or_get_tensor( - scope, o_name, output_map[o_name], place) - - layer_norm_op = Operator( - "layer_norm", - # inputs - X="X", - Scale="Scale", - Bias="Bias", - # outputs - Y="Y", - Mean="Mean", - Variance="Variance", - # attrs - epsilon=epsilon, - begin_norm_axis=begin_norm_axis) - - layer_norm_op.run(scope, place) - - # check forward result - atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4 - for o_tensor in output_tensor: 
- self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor], - o_tensor, atol) - - # run backward - layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set()) - set_output_grad( - scope, ["Y", "Mean", "Variance"], - place, - feed_dict={"Y": y_grad}) - layer_norm_op_grad.run(scope, place) - - # get output - grad_tensor = {} - for o_name in naive_grad: - grad_tensor[o_name] = x_ = create_or_get_tensor( - scope, grad_var_name(o_name), None, place) - - # check gradient output - for o_grad in naive_grad: - self.__assert_grad_close(grad_tensor[o_grad], - naive_grad[o_grad], o_grad + "@GRAD", - place) + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, mean, variance, begin_norm_axis) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_names = [ + 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD' + ] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + layer_norm_op = block.append_op( + type="layer_norm", + inputs={ + "X": block.var('x'), + "Scale": block.var('scale'), + "Bias": block.var('bias'), + }, + outputs={ + "Y": block.var('y'), + "Mean": block.var('mean'), # share the same memory + "Variance": + block.var('variance'), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis + }) + + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in ['x', 'scale', 'bias', 'y@GRAD'] + }, + fetch_list=[ + 'y', 'mean', 'variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ]) + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3) + self.__assert_close(bias_grad, out[5], "bias_grad") places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): @@ -237,15 +167,6 @@ class TestLayerNormdOp(OpTest): self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) - def test_check_forward_backward_with_scale(self): - pass # TODO(zcd) - - def test_check_forward_backward_with_bias(self): - pass # TODO(zcd) - - def test_check_forward_backward(self): - pass # TODO(zcd) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_net.py b/python/paddle/fluid/tests/unittests/test_net.py deleted file mode 100644 index ae1699d647..0000000000 --- a/python/paddle/fluid/tests/unittests/test_net.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid.core as core -from paddle.fluid.op import Operator -import unittest - - -def fc(X, W, Y): - ret_v = core.Net.create() - - ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation")) - ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y)) - ret_v.complete_add_op(True) - return ret_v - - -class TestNet(unittest.TestCase): - def test_net_all(self): - net = core.Net.create() - op1 = Operator("sum", X=["X", "Y"], Out="Out") - net.append_op(op1) - - net2 = core.Net.create() - net2.append_op(fc(X="X", W="w", Y="fc.out")) - net2.complete_add_op(True) - net.append_op(net2) - net.complete_add_op(True) - - expected = ''' -Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}. - Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}. - Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. - Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. - Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}. - Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}. -''' - self.assertEqual(expected, "\n" + str(net)) - - -if __name__ == "__main__": - unittest.main() From a64edbf14e9c2f499a0dbb34417c50b8ded67c58 Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 11 Apr 2018 14:55:13 -0700 Subject: [PATCH 52/62] delete backward.cc related code on the python side (#9854) --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/backward.cc | 472 ------------------ paddle/fluid/framework/backward.h | 56 --- paddle/fluid/pybind/CMakeLists.txt | 4 +- paddle/fluid/pybind/protobuf.cc | 18 - paddle/fluid/pybind/pybind.cc | 6 - python/paddle/fluid/framework.py | 18 - .../fluid/tests/unittests/test_layers.py | 3 - .../fluid/tests/unittests/test_program.py | 51 -- 9 files changed, 3 insertions(+), 628 deletions(-) delete mode 100644 paddle/fluid/framework/backward.cc delete mode 100644 paddle/fluid/framework/backward.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 77f459b4dd..1f3ca24df1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -79,13 +79,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -cc_library(backward SRCS backward.cc DEPS operator) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope -framework_proto backward glog lod_rank_table feed_fetch_method) +framework_proto glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor) diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc deleted file mode 100644 index 76133b5136..0000000000 --- a/paddle/fluid/framework/backward.cc +++ /dev/null @@ -1,472 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/backward.h" - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace framework { - -static std::unordered_set* g_ctrl_flow_ops_ = nullptr; -// Control Flow operators's backward is significantly different from -// computational operators. Hack Code here. -// We should design a better way to backward CtrlFlowOps. 
-static std::unordered_set& CtrlFlowOps() { - if (g_ctrl_flow_ops_ == nullptr) { - g_ctrl_flow_ops_ = new std::unordered_set{ - "increment", "lod_rank_table", "less_than"}; - } - return *g_ctrl_flow_ops_; -} - -static inline std::unique_ptr CreateGradOp( - const OperatorBase& op, const std::unordered_set& no_grad_set, - std::unordered_map* grad_to_var) { - OpDesc op_desc; - op_desc.SetInputMap(op.Inputs()); - op_desc.SetOutputMap(op.Outputs()); - op_desc.SetType(op.Type()); - op_desc.SetAttrMap(op.Attrs()); - auto& info = OpInfoMap::Instance().Get(op.Type()); - auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {}); - std::vector> grad_ops; - grad_ops.reserve(grad_descs.size()); - std::transform(grad_descs.begin(), grad_descs.end(), - std::back_inserter(grad_ops), - [](const std::unique_ptr& grad_desc) { - return OpRegistry::CreateOp(*grad_desc); - }); - PADDLE_ENFORCE(!grad_ops.empty()); - if (grad_ops.size() == 1) { - return std::move(grad_ops[0]); - } else { - PADDLE_THROW("Unexpected Branch"); - } -} - -template -static void ForEachVarName(const Map& names, T callback) { - for (auto& name : names) { - for (auto& n : name.second) { - if (callback(n)) return; - } - } -} - -// return whether all the names + suffixes in the set -static bool AllInSet( - const std::map>& names, - const std::string& suffix, const std::unordered_set& set) { - bool all_in_set = true; - ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) { - all_in_set = set.find(n + suffix) != set.end(); - return !all_in_set; - }); - return all_in_set; -} - -static std::unique_ptr NOP() { - PADDLE_THROW("Unexpected Branch"); -} - -// Get backward operator from a forward operator, a recursive implementation. -// -// no_grad_names the gradient variable names without gradient calculating. -// -// uniq_id is a unique index used inside recursively calling -// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and -// pass `uniq_id` through recursive calling. -// -// returns The backward operator. In a simple situation, it may be a simple -// operator, in a complex situation, it maybe a NetOp. -// -// See Backward.h for details -static std::unique_ptr BackwardRecursive( - const OperatorBase& forwardOp, - std::unordered_set& no_grad_names, - std::unordered_map* grad_to_var, - size_t& uniq_id) { - // If all input gradients of forwarding operator do not need to calculate, - // just return an NOP. Not return null ptr because NOP does not take - // too much time for calculation, but it is useful for simplifying logic. - if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/, - no_grad_names /*set*/)) { - return NOP(); - } - - // All output gradients of forwarding operator do not need to calculate. - // Then all input gradients cannot be computed at all, and we put them into - // `no_grad_names` set. Return an NOP. 
- if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/, - no_grad_names /*set*/)) { - ForEachVarName(forwardOp.Inputs(), - [&no_grad_names](const std::string& name) -> bool { - no_grad_names.insert(GradVarName(name)); - return false; - }); - return NOP(); - } - - // Returned gradient network - PADDLE_THROW("Unexpected Branch"); -} - -// See header for comments -std::unique_ptr Backward( - const OperatorBase& forwardOp, - const std::unordered_set& no_grad_vars) { - std::unordered_set no_grad_names; - no_grad_names.reserve(no_grad_vars.size() + 1); - - no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); - - for (auto& name : no_grad_vars) { - no_grad_names.insert(name + kGradVarSuffix); - } - size_t uid = 0; - std::unordered_map grad_to_var; - return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid); -} - -// ==================================== // - -static bool AllGradInSet(const std::vector& names, - const std::unordered_set& set) { - for (const std::string& name : names) { - if (!set.count(GradVarName(name))) { - return false; - } - } - if (VLOG_IS_ON(10)) { - std::ostringstream sout; - sout << "All input {"; - for (auto& name : names) { - sout << name << ","; - } - sout << "} is in {"; - for (auto& name : set) { - sout << name << ","; - } - sout << "}"; - VLOG(10) << sout.str(); - } - return true; -} - -static std::string FwdName(const std::string& grad_name) { - auto pos = grad_name.find("@GRAD"); - if (pos == std::string::npos) { - return ""; - } else { - return grad_name.substr(0, pos); - } -} - -static void CreateGradVarInBlock( - size_t grad_op_start_index, - const std::unordered_map& param_name_map, - BlockDesc* block_desc, - std::unordered_map* grad_var_record) { - auto ops = block_desc->AllOps(); - for (size_t op_index = grad_op_start_index; op_index < ops.size(); - ++op_index) { - std::unordered_set new_vars; - auto& ctrl_flow_ops = CtrlFlowOps(); - ForEachVarName(ops[op_index]->Outputs(), - [&](const std::string& grad_var_name) { - if (ctrl_flow_ops.find(ops[op_index]->Type()) != - ctrl_flow_ops.end()) { - if (block_desc->HasVarRecursive(grad_var_name)) { - return false; - } - } else { - if (block_desc->HasVar(grad_var_name)) { - return false; - } - } - if (grad_var_name == framework::kEmptyVarName) { - return false; - } - auto var = block_desc->Var(grad_var_name); - VLOG(10) << "Creating Variable " << grad_var_name; - new_vars.insert(var->Name()); - auto it = param_name_map.find(grad_var_name); - if (it == param_name_map.end()) { - return false; - } - auto param_var_name = it->second; - auto& grad_record = (*grad_var_record)[param_var_name]; - grad_record.name_ = grad_var_name; - grad_record.block_idx_ = block_desc->ID(); - grad_record.op_idx_ = static_cast(op_index); - return false; /* not break */ - }); - ops[op_index]->InferVarType(block_desc); - for (auto& arg : ops[op_index]->OutputArgumentNames()) { - if (new_vars.find(arg) == new_vars.end()) { - continue; - } - auto pname = FwdName(arg); - auto* param = block_desc->FindVarRecursive(pname); - auto* grad = block_desc->FindVar(arg); - if (param == nullptr) { - grad->SetDataType(proto::VarType::FP32); - } else { - grad->SetDataType(param->GetDataType()); - } - } - ops[op_index]->InferShape(*block_desc); - } -} - -std::vector> MakeOpGrad( - const OpDesc* op_desc, std::unordered_set* no_grad_vars, - std::unordered_map* grad_to_var, - const std::vector& grad_block = std::vector()) { - std::vector> grad_op_descs; - // All input gradients of forwarding operator do not need to 
calculate. - const std::vector& inputs = op_desc->InputArgumentNames(); - if (AllGradInSet(inputs, *no_grad_vars)) { - VLOG(10) << "Drop operator " << op_desc->Type(); - return grad_op_descs; // empty vector - } - - // All output gradients of forwarding operator do not need to calculate. - const std::vector& outputs = op_desc->OutputArgumentNames(); - - if (AllGradInSet(outputs, *no_grad_vars)) { - VLOG(10) << "Drop operator " << op_desc->Type(); - // FIXME: Hack code here - auto& ctrl_flow_ops = CtrlFlowOps(); - if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) { - // Only computational op need drop input's gradient. - for (const std::string& name : inputs) { - no_grad_vars->insert(GradVarName(name)); - VLOG(10) << " Also drop " << GradVarName(name); - } - } - - return grad_op_descs; // empty vector - } - - grad_op_descs = - OpInfoMap::Instance() - .Get(op_desc->Type()) - .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); - - std::list> pending_fill_zeros_ops; - for (auto& desc : grad_op_descs) { - for (const std::string& in_name : desc->InputArgumentNames()) { - if (no_grad_vars->count(in_name)) { - std::string prefix = in_name.substr( - 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); - std::string new_name = prefix + kZeroVarSuffix; - desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op( - new OpDesc("fill_zeros_like", {{"X", {prefix}}}, - {{"Out", {new_name}}}, AttributeMap{})); - pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); - } - } - } - - for (auto& p : pending_fill_zeros_ops) { - grad_op_descs.insert(grad_op_descs.begin(), std::move(p)); - } - return grad_op_descs; -} - -static BlockDesc* CreateStepBlock( - ProgramDesc& program_desc, std::unordered_set* no_grad_vars, - std::unordered_map* grad_to_var, - int step_block_idx); - -std::vector> MakeBlockBackward( - ProgramDesc& program_desc, int block_idx, - std::unordered_set* no_grad_vars, - std::unordered_map* grad_to_var) { - VLOG(5) << "MakeBlockBackward"; - BlockDesc* cur_block = program_desc.MutableBlock(block_idx); - std::vector op_descs = cur_block->AllOps(); - std::unordered_map> dup_out_ops; - size_t grad_desc_idx = 0; - std::vector> backward_descs; - - for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { - VLOG(5) << "Making backward " << (*it)->Type() << " op"; - std::vector> op_grads; - - if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" || - (*it)->Type() == "parallel_do") { - int step_block_idx = (*it)->GetBlockAttr("sub_block"); - BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, - grad_to_var, step_block_idx); - op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); - } else if ((*it)->Type() == "conditional_block") { - BlockDesc* backward_block = - CreateStepBlock(program_desc, no_grad_vars, grad_to_var, - (*it)->GetBlockAttr("sub_block")); - op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); - } else { - op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); - } - - if (VLOG_IS_ON(10)) { - std::ostringstream sout; - sout << "Made "; - for (auto& op_grad : op_grads) { - sout << op_grad->Type() << " "; - } - VLOG(10) << sout.str(); - } - - for (const auto& desc : op_grads) { - for (const std::string& out_name : desc->OutputArgumentNames()) { - if (out_name.find("@GRAD") == std::string::npos) { - // Not all outputs of a backward operator is a gradient. Only gradient - // need to be sum. Skip variables are not gradient. 
- continue; - } - dup_out_ops[out_name].emplace_back(grad_desc_idx); - } - ++grad_desc_idx; - } - std::transform(op_grads.begin(), op_grads.end(), - std::back_inserter(backward_descs), - [](std::unique_ptr& ptr) { return std::move(ptr); }); - } - - VLOG(5) << "Appending Sums"; - // Check whether some variables are written more than once - std::list>> pending_sum_ops; - for (const auto& dup : dup_out_ops) { - const std::string& out_name = dup.first; - const std::vector dup_op = dup.second; - if (out_name != kEmptyVarName && dup_op.size() > 1) { - std::vector sum_op_inputs; - std::string next_g_name = out_name; - for (size_t i = 0; i < dup_op.size(); ++i) { - VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name - << " duplicated"; - std::string new_name = out_name + "@RENAME@" + std::to_string(i); - backward_descs[dup_op[i]]->RenameOutput(out_name, new_name); - backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name); - sum_op_inputs.emplace_back(new_name); - next_g_name = sum_op_inputs.back(); - } - std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, - {{"Out", {out_name}}}, - AttributeMap{})); - pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); - } - } - - pending_sum_ops.sort([](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); - for (auto& p : pending_sum_ops) { - backward_descs.insert(backward_descs.begin() + p.first + 1, - std::move(p.second)); - } - - VLOG(5) << "MakeBlockBackward Finished"; - - return backward_descs; -} - -static BlockDesc* CreateStepBlock( - ProgramDesc& program_desc, std::unordered_set* no_grad_vars, - std::unordered_map* grad_to_var, - int step_block_idx) { - auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, - no_grad_vars, grad_to_var); - BlockDesc* backward_block = - program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); - for (auto& ptr : backward_block_op_descs) { - backward_block->AppendAllocatedOp(move(ptr)); - } - return backward_block; -} - -ParamGradInfoMap AppendBackward( - ProgramDesc& program_desc, const VarDesc& target, - const std::unordered_set& no_grad_vars) { - std::unordered_set no_grad_var_names; - no_grad_var_names.reserve(no_grad_vars.size() + 1); - no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); - for (auto& name : no_grad_vars) { - no_grad_var_names.insert(GradVarName(name)); - } - - const int root_block_idx = 0; - auto root_block = program_desc.MutableBlock(root_block_idx); - - std::string fill_one_op_out = GradVarName(target.Name()); - bool is_scalar = target.GetShape() == std::vector{1}; - PADDLE_ENFORCE(is_scalar, "target should be scalar"); - VLOG(3) << "backward from loss=" << target.Name() - << " data_type=" << target.GetDataType(); - std::unique_ptr fill_one_op( - new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, - {{"shape", std::vector{1}}, - {"value", static_cast(1.0)}, - {"dtype", target.GetDataType()}})); - // infer var type of fill_one_op - fill_one_op->InferVarType(root_block); - - root_block->AppendAllocatedOp(std::move(fill_one_op)); - size_t forward_op_num = root_block->OpSize(); - size_t forward_block_num = program_desc.Size(); - - // Insert backward operators - std::unordered_map grad_to_var; - auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx, - &no_grad_var_names, &grad_to_var); - - for (auto& ptr : backward_op_descs) { - root_block->AppendAllocatedOp(std::move(ptr)); - } - // Create Variable - - // Create target gradient variable - 
std::unordered_map retv; - - auto var = root_block->Var(fill_one_op_out); - var->SetDataType(target.GetDataType()); - var->SetShape(target.GetShape()); - auto& target_grad = retv[target.Name()]; - target_grad.name_ = fill_one_op_out; - target_grad.block_idx_ = root_block_idx; - target_grad.op_idx_ = static_cast(forward_op_num); - - // create grad_var for all blocks in this program - CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv); - for (size_t block_index = forward_block_num; - block_index < program_desc.Size(); ++block_index) { - CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index), - &retv); - } - return retv; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h deleted file mode 100644 index 3a971090c2..0000000000 --- a/paddle/fluid/framework/backward.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace framework { - -// Create the backward operator from a forward operator. -// TODO(yuyang18): Add more API reference comment. 
-extern std::unique_ptr Backward( - const OperatorBase& forwardOp, - const std::unordered_set& no_grad_vars); - -struct GradVarInfo { - GradVarInfo() {} - GradVarInfo(const std::string& name, int block_idx, int op_idx) - : name_(name), block_idx_(block_idx), op_idx_(op_idx) {} - - bool operator==(const GradVarInfo& b) const { - return name_ == b.name_ && block_idx_ == b.block_idx_ && - op_idx_ == b.op_idx_; - } - - std::string name_; - int block_idx_; - int op_idx_; -}; - -using ParamGradInfoMap = std::unordered_map; - -ParamGradInfoMap AppendBackward( - ProgramDesc& program_desc, const VarDesc& target, - const std::unordered_set& no_grad_vars); - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 884289a7fd..4fef351c21 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,13 +2,13 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method + DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method parallel_executor ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method + DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method parallel_executor ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 2fe8290363..93533e5c9d 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/backward.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" @@ -125,23 +124,6 @@ void BindProgramDesc(pybind11::module *m) { }) .def("append_block", &pd::ProgramDesc::AppendBlock, pybind11::return_value_policy::reference) - .def("append_backward", - [](pd::ProgramDesc &program_desc, const pd::VarDesc &target, - const std::unordered_set &no_grad_vars) { - pd::ParamGradInfoMap param_grad_map = - AppendBackward(program_desc, target, no_grad_vars); - std::unordered_map< - std::string, std::tuple> - retv; - for (auto it = param_grad_map.begin(); it != param_grad_map.end(); - ++it) { - const auto &grad_info = it->second; - retv[it->first] = std::make_tuple( - grad_info.name_, grad_info.block_idx_, grad_info.op_idx_); - } - return retv; - }) .def("block", &pd::ProgramDesc::MutableBlock, pybind11::return_value_policy::reference) .def("num_blocks", &pd::ProgramDesc::Size) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e571da42cb..a1e8ff6399 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/backward.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" @@ -381,11 +380,6 @@ All parameter, weight, gradient are variables in Paddle. 
desc.InitializationErrorString()); return OpRegistry::CreateOp(desc); }) - .def("backward", - [](const OperatorBase &forwardOp, - const std::unordered_set &no_grad_vars) { - return Backward(forwardOp, no_grad_vars).release(); - }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { self.Run(scope, place); }) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 793421a22f..ea9abdcae2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1119,24 +1119,6 @@ class Program(object): def current_block(self): return self.blocks[self.current_block_idx] - def append_backward(self, target, no_grad_set=None): - """ - return map(param_name -> (grad_name, block_index, op_index)) - """ - assert isinstance(target, Variable) - if no_grad_set is None: - no_grad_set = set() - try: - param_to_grad_info = self.desc.append_backward(target.desc, - no_grad_set) - except Exception as e: - raise core.EnforceNotMet( - str(e) + "\nCurrent protobuf is\n{0}".format( - self.to_string(False))) - - self.sync_with_cpp() - return param_to_grad_info - def create_block(self, parent_idx=None): new_block_idx = len(self.blocks) parent = self.current_block() if parent_idx is None else self.block( diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2179826d81..f88a6f1ce6 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -32,7 +32,6 @@ class TestBook(unittest.TestCase): cost = layers.square_error_cost(input=y_predict, label=y) avg_cost = layers.mean(cost) self.assertIsNotNone(avg_cost) - program.append_backward(avg_cost) print(str(program)) @@ -94,8 +93,6 @@ class TestBook(unittest.TestCase): cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(cost) - program.append_backward(avg_cost) - print(str(program)) def test_word_embedding(self): diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index 87a2195f0d..c51a482393 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -87,57 +87,6 @@ class TestProgram(unittest.TestCase): print(prog) print(prog_restored) - def test_append_backward(self): - prog = Program() - block = prog.global_block() - - mul_x = block.create_var( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - mul_op = block.append_op( - type="mul", - inputs={"X": [mul_x], - "Y": mul_y}, - outputs={"Out": [mul_out]}, - attrs={"x_num_col_dims": 1}) - - add_y = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="add.y") - add_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="add.out") - add_op = block.append_op( - type="elementwise_add", - inputs={"X": mul_out, - "Y": add_y}, - outputs={"Out": add_out}, - attrs={"x_num_col_dims": 1}) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out") - block.append_op( - type="mean", inputs={"X": add_out}, outputs={"Out": mean_out}) - - self.assertEqual(mul_op.idx, 0) - self.assertEqual(add_op.idx, 1) - param_to_grad = prog.append_backward(mean_out, set()) - - for var_name in ("mul.x", "mul.y", "mul.out", "add.y", 
"add.out", - "mean.out"): - self.assertEqual(param_to_grad[var_name][0], - grad_var_name(var_name)) - self.assertEqual(param_to_grad[var_name][1], 0) - - expect_ops = [ - "mul", "elementwise_add", "mean", "fill_constant", "mean_grad", - "elementwise_add_grad", "mul_grad" - ] - actual_ops = [] - for op in block.ops: - actual_ops.append(op.type) - self.assertEqual(actual_ops, expect_ops) - def test_program_clone_with_parameter(self): main_program = Program() startup_program = Program() From 35b0ed369cf7796d58c1f9eb1c81bcb376194fd8 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Wed, 11 Apr 2018 15:00:33 -0700 Subject: [PATCH 53/62] make -j nproc when making inference_lib_dist --- paddle/scripts/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 4885b74e6c..be1565ab53 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -231,7 +231,7 @@ function gen_fluid_inference_lib() { Deploying fluid inference library ... ======================================== EOF - make inference_lib_dist + make -j `nproc` inference_lib_dist fi } From d798e3258d475140a532804d7cd4980aa38475cd Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 12 Apr 2018 09:57:38 +0800 Subject: [PATCH 54/62] update grpc version --- cmake/external/grpc.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 0853b98181..aa24915947 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin) ENDIF() ExternalProject_Add( extern_grpc DEPENDS protobuf zlib GIT_REPOSITORY "https://github.com/grpc/grpc.git" - GIT_TAG "v1.8.x" + GIT_TAG "v1.11.x" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" From e90e7ab237723ddab75be247d5f29780968924f4 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 12 Apr 2018 11:45:43 +0800 Subject: [PATCH 55/62] Remove the use of ARCHIVE_START/END (#9844) * Add USE_OP of all operators and kernels and remove ARCHIVE_START/END in CMakeLists.txt of inference unittests. * Remove ARCHIVE_START/END when linking inference shared library. * Disable some fluid related cmake operations for cross-compiling. 
--- cmake/cblas.cmake | 34 +++++++++++-------- cmake/external/snappy.cmake | 16 ++++----- cmake/external/snappystream.cmake | 14 ++++---- cmake/generic.cmake | 15 ++------ cmake/inference_lib.cmake | 32 +++++++++++++++++ paddle/CMakeLists.txt | 2 +- paddle/fluid/CMakeLists.txt | 3 +- paddle/fluid/inference/CMakeLists.txt | 4 +-- paddle/fluid/inference/io.cc | 6 ++++ paddle/fluid/inference/io.h | 3 ++ .../fluid/inference/tests/book/CMakeLists.txt | 2 +- 11 files changed, 83 insertions(+), 48 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6320b17520..52a22c1fbf 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -62,29 +62,33 @@ endif() ## Then find the reference-cblas. www.netlib.org/blas/ - - set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") -set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas -) - -set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ -) +if(NOT CMAKE_CROSSCOMPILING) + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas + ) + + set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ + ) +else() + # Diable the finding of reference cblas under host's system path + set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) + set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) +endif() find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) -if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) +if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_FOUND ON) set(CBLAS_PROVIDER REFERENCE) set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 71f54c425d..80282329c6 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -11,19 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -IF(MOBILE_INFERENCE) +if(MOBILE_INFERENCE OR RPI) return() -ENDIF() +endif() include (ExternalProject) # NOTE: snappy is needed when linking with recordio -SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) -SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) -SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE) +set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) +set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) +set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." 
FORCE) + +set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") ExternalProject_Add( extern_snappy @@ -51,8 +52,7 @@ ExternalProject_Add( ) add_library(snappy STATIC IMPORTED GLOBAL) -set_property(TARGET snappy PROPERTY IMPORTED_LOCATION - "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) include_directories(${SNAPPY_INCLUDE_DIR}) add_dependencies(snappy extern_snappy) diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 8f7a3bf8ee..20a9643082 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -11,9 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -IF(MOBILE_INFERENCE) +IF(MOBILE_INFERENCE OR RPI) return() ENDIF() @@ -21,9 +20,11 @@ include (ExternalProject) # NOTE: snappy is needed when linking with recordio -SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) -SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) -SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE) +set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) +set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) +set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE) + +set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") ExternalProject_Add( extern_snappystream @@ -51,8 +52,7 @@ ExternalProject_Add( ) add_library(snappystream STATIC IMPORTED GLOBAL) -set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION - "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") +set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers. include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers. diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c4c9f77df8..1d3e2ade6d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,14 +195,7 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() - if("${cc_library_DEPS}" MATCHES "ARCHIVE_START") - # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). - # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. 
- target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) - list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END) - else() - target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) - endif() + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -243,11 +236,7 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) - target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) - if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") - list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) - endif() + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 0323cd9698..cc75801982 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -1,7 +1,22 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library function(find_fluid_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) string(FIND "${__target_path}" "fluid" pos) if(pos GREATER 1) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) @@ -77,6 +92,23 @@ elseif (WITH_MKLML) ) endif() +if(NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) +endif() + # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid") diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c44f8a8a8e..8b1ca5e165 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY) endif() add_subdirectory(testing) -if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS) +if(NOT MOBILE_INFERENCE AND NOT RPI) add_subdirectory(fluid) endif() diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index d725763b01..d274d96c29 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(pybind) -add_subdirectory(inference) add_subdirectory(string) add_subdirectory(recordio) +# NOTE: please add subdirectory inference at last. +add_subdirectory(inference) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index f417f62f3f..e53bcf2384 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,4 +1,4 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init) +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) cc_library(paddle_fluid_api SRCS io.cc @@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules}) # Create shared library cc_library(paddle_fluid_shared SHARED SRCS io.cc - DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END) + DEPS ${fluid_modules}) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index a5b62ef322..a29d457b6f 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,10 +17,16 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/pybind/pybind.h" namespace paddle { namespace inference { +// Temporarilly add this function for exposing framework::InitDevices() when +// linking the inference shared library. 
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); } + void ReadBinaryFile(const std::string& filename, std::string& contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index d07d315b93..756c936b33 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -18,12 +18,15 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" namespace paddle { namespace inference { +void Init(bool init_p2p); + void LoadPersistables(framework::Executor& executor, framework::Scope& scope, const framework::ProgramDesc& main_program, const std::string& dirname, diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 86e36f3f65..97d9f03f88 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -17,7 +17,7 @@ function(inference_test TARGET_NAME) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc - DEPS ARCHIVE_START paddle_fluid ARCHIVE_END + DEPS paddle_fluid ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) From 1204d9f3d1b76de8d3fce594634134bcfb653c8e Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Thu, 12 Apr 2018 13:12:05 +0800 Subject: [PATCH 56/62] Refine batch_norm_op. --- paddle/fluid/operators/batch_norm_op.cu.cc | 27 ++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index eecb58e11e..cb1927bc0f 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -114,23 +114,11 @@ class BatchNormKernel const auto *bias = ctx.Input("Bias"); auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); // alloc memory y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant> - functor; - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); auto handle = dev_ctx.cudnn_handle(); @@ -159,6 +147,21 @@ class BatchNormKernel // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. 
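+      // The running mean/variance outputs and the saved statistics are only
+      // produced on the training path, so allocate them (and zero the saved
+      // statistics) inside this branch instead of unconditionally above.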
+ + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + math::SetConstant> + functor; + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + double this_factor = 1. - momentum; CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( From ad73b331c757a0a0d795d9aa99a86b077f144357 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 12 Apr 2018 13:30:04 +0800 Subject: [PATCH 57/62] Eagerly drop local scope in iteration (#9838) * Eagerly drop local scope in iteration * Correct create var * Fix typo * Debug --- .../details/computation_op_handle.cc | 4 +- .../framework/details/fetch_op_handle.cc | 8 +++- .../fluid/framework/details/op_handle_base.h | 2 + .../framework/details/ssa_graph_executor.h | 4 +- .../details/threaded_ssa_graph_executor.cc | 30 ------------ .../details/threaded_ssa_graph_executor.h | 3 -- paddle/fluid/framework/parallel_executor.cc | 47 +++++++++++++++---- 7 files changed, 54 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7a1b40c0b6..e3f8bbb72f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get(), place_); + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 9180903b86..e3e7c55d15 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include +#include + namespace paddle { namespace framework { namespace details { @@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() { for (size_t i = 0; i < scopes.size(); ++i) { auto &scope = scopes[i]; - auto &t = scope->FindVar(var_name)->Get(); + auto &t = scope->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(var_name) + ->Get(); if (platform::is_gpu_place(var->place_)) { #ifdef PADDLE_WITH_CUDA TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d7a541ac4b..fbdb54ba8d 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -24,6 +24,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; + class OpHandleBase { private: DISABLE_COPY_AND_ASSIGN(OpHandleBase); diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 3b818b1a45..a8833b7388 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ 
b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -15,13 +15,15 @@ #pragma once #include +#include +#include + #include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/feed_fetch_type.h" namespace paddle { namespace framework { namespace details { - class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 62af4c1d79..1ce69ab02b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ready_ops.clear(); }; - // Create local scopes. - for (auto &scope : local_scopes_) { - auto &local_scope = scope->NewScope(); - *scope->Var("@TMP_SCOPE@")->GetMutable() = &local_scope; - } - // Step 3. Execution while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) { // 1. Run All Ready ops @@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(delayed_ops.empty()); PADDLE_ENFORCE(blocked_by_delayed_ops.empty()); - ++computation_count_; - - auto sync_computation = [&] { - computation_count_ = 0; - // Wait All computational streams - for (auto p : this->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &scope : local_scopes_) { - scope->DropKids(); - } - }; // Wait FetchOps. if (!fetch_ops.empty()) { fetch_ops.clear(); - sync_computation(); - } - - if (computation_count_ == max_async_computation) { - sync_computation(); - } - - // NOTE: the temp scope can be dropped lazily if needed. - // Drop tmp scopes; - for (auto &scope : local_scopes_) { - auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); - kid = nullptr; } return fetch_data; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 79cfc26b46..bb5e837b13 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unique_ptr exception_; std::atomic running_ops_; bool allow_op_delay_; - - size_t computation_count_{0}; - size_t max_async_computation{100}; }; } // namespace details diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 20dcc080b6..c1486b527d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include +#include #include #ifdef PADDLE_WITH_CUDA @@ -41,6 +42,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif + + std::vector> var_types_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor( allow_op_delay)); // Step 3. 
Create vars in each scope; - for (auto *scope : member_->local_scopes_) { - for (auto *var : main_program.Block(0).AllVars()) { - if (scope->FindVar(var->Name()) != nullptr) { - continue; - } - - InitializeVariable(scope->Var(var->Name()), var->GetType()); - } + for (auto *var : main_program.Block(0).AllVars()) { + member_->var_types_.emplace_back(var->Name(), var->GetType(), + var->Persistable()); } } @@ -163,9 +161,42 @@ void ParallelExecutor::Run( const std::unordered_map &feed_tensors) { platform::RecordBlock b(0); SplitTensorToPlaces(feed_tensors); + + // Create local scopes. + for (auto &scope : member_->local_scopes_) { + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &name_type_pair : member_->var_types_) { + if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) { + continue; + } + + if (std::get<2>(name_type_pair)) { // Persistable + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } else { + InitializeVariable(scope->Var(std::get<0>(name_type_pair)), + std::get<1>(name_type_pair)); + } + } + } + auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; + + // Wait All computational streams + for (auto p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : member_->local_scopes_) { + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + local_scope = nullptr; + } } void ParallelExecutor::SplitTensorToPlaces( From 4c55a6022a0a758295177371fc67c6800658b286 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 12 Apr 2018 15:26:35 +0800 Subject: [PATCH 58/62] Dist transpiler support prefetch (#9714) * init * add some check * add dist transpile logic * add insert op for block * init change get_pserver_program * optimize code * fix a bug * can run now * start to do table split * start to process table gradient * complete pserver part * can send_vars now * revert cpplint * fix a bug * optimize code * move dist test to models * revert the interface of distribute_transpiler.transpile * fix prefetch_block * optimize trainspiler code * add comment to sum_op * add warning log * fix comment * fix test_send_recv * fix test_send_recv * fix train with no distributed table * optimize GetDims --- paddle/fluid/framework/block_desc.h | 2 +- paddle/fluid/framework/operator.cc | 13 +- paddle/fluid/operators/concat_op.cc | 6 +- paddle/fluid/operators/detail/grpc_server.cc | 1 + paddle/fluid/operators/listen_and_serv_op.cc | 45 ++- paddle/fluid/operators/listen_and_serv_op.h | 2 + paddle/fluid/operators/lookup_table_op.cc | 3 + paddle/fluid/operators/prefetch_op.cc | 8 +- paddle/fluid/operators/send_recv_op_test.cc | 26 +- paddle/fluid/operators/send_vars_op.cc | 4 +- paddle/fluid/operators/sgd_op.cc | 4 +- paddle/fluid/operators/split_ids_op.cc | 14 +- paddle/fluid/operators/split_ids_op.h | 70 ++-- paddle/fluid/operators/sum_op.cc | 7 +- python/paddle/fluid/distribute_transpiler.py | 343 +++++++++++++++++-- python/paddle/fluid/layers/nn.py | 8 +- 16 files changed, 450 insertions(+), 106 deletions(-) diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 873969b2a8..eef19c4f09 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -92,7 +92,7 @@ class BlockDesc { /* * Remove Op and its input/output variables. 
- * Note that for either input or ouput variable, if it is also an input or + * Note that for either input or output variable, if it is also an input or * output variable of other ops, we should remain it. */ void RemoveOp(size_t s, size_t e); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a3b4a8c082..f97bd08274 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) { } } -static DDim GetDims(const Scope& scope, const std::string& name) { +static DDim GetDims(const Scope& scope, const std::string& name, + bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { return DDim({-1}); @@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) { if (var->IsType()) { return var->Get().dims(); } else if (var->IsType()) { - return var->Get().GetCompleteDims(); + if (get_actual_dim) { + return var->Get().value().dims(); + } else { + return var->Get().GetCompleteDims(); + } } else { return DDim({-1}); } @@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; if (scope) { - ss << "[" << GetDims(*scope, input.second[i]) << "]"; + ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } if (i != input.second.size() - 1) { @@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; if (scope) { - ss << "[" << GetDims(*scope, output.second[i]) << "]"; + ss << "[" << GetDims(*scope, output.second[i], true) << "]"; ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } if (i != output.second.size() - 1) { diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index d65a7b3467..4a36b03cb6 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/concat_op.h" + #include #include @@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel { size_t axis = static_cast(ctx->Attrs().Get("axis")); const size_t n = ins.size(); - PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1."); + PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); + if (n == 1) { + VLOG(3) << "Warning: concat op have only one input, may waste memory"; + } auto out_dims = ins[0]; size_t in_zero_dims_size = out_dims.size(); diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index d5fc163bc2..0b582a08bc 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase { ::grpc::ByteBuffer reply; std::string var_name = request_->OutVarname(); + VLOG(3) << "prefetch var " << var_name; auto var_desc = program_->Block(0).FindVar(var_name); framework::Scope* local_scope = &scope_->NewScope(); auto* var = local_scope->FindVar(var_name); diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9188f2d989..5d293665f0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include +#include // NOLINT +#include #include "paddle/fluid/operators/listen_and_serv_op.h" @@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, auto ins = Inputs("X"); auto fan_in = Attr("Fanin"); - auto *block = Attr(kOptimizeBlock); - auto *program = block->Program(); + auto *optimize_block = Attr(kOptimizeBlock); + auto *prefetch_block = Attr(kPrefetchBlock); + auto *program = optimize_block->Program(); size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); @@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, framework::Executor executor(dev_place); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { - block_list.push_back(blkid); + if (blkid != prefetch_block->ID()) { + block_list.push_back(blkid); + } } - auto prepared = executor.Prepare(*program, block_list); + auto optimize_prepared = executor.Prepare(*program, block_list); // Insert placeholder for block0 which holds current op itself. - prepared.insert(prepared.begin(), - std::shared_ptr(nullptr)); + optimize_prepared.insert( + optimize_prepared.begin(), + std::shared_ptr(nullptr)); rpc_service_->SetScope(&recv_scope); rpc_service_->SetDevCtx(&dev_ctx); // TODO(qiao) set proper fields for table lookup and update rpc_service_->SetExecutor(&executor); - rpc_service_->SetPrefetchBlkdId(0); + VLOG(3) << "prefetch block id is " << prefetch_block->ID(); + auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID()); + rpc_service_->SetPrefetchBlkdId(prefetch_block->ID()); + rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get()); + prefetch_prepared.release(); rpc_service_->SetProgram(program); // start the server listening after all member initialized. 
server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, parallel_blkids.push_back(1); double ts = detail::GetTimestamp(); for (size_t blkid = 2; blkid < num_blocks; ++blkid) { - if (program->Block(blkid).Parent() != last_parent_blkid) { - ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, - &recv_scope); - parallel_blkids.clear(); - last_parent_blkid = program->Block(blkid).Parent(); + if (blkid != prefetch_block->ID()) { + if (program->Block(blkid).Parent() != last_parent_blkid) { + ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, + program, &recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); + } + parallel_blkids.push_back(blkid); } - parallel_blkids.push_back(blkid); } - ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program, - &recv_scope); + ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared, + program, &recv_scope); VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; // Reset the received sparse variables, the sum operator would not @@ -211,6 +222,8 @@ from send_op and send back variables to recv_op. .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); AddAttr(kOptimizeBlock, "BlockID to run on server side."); + AddAttr(kPrefetchBlock, + "prefetch block to run on server side."); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 0da87afc96..759b2a462b 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -27,6 +28,7 @@ namespace paddle { namespace operators { constexpr char kOptimizeBlock[] = "OptimizeBlock"; +constexpr char kPrefetchBlock[] = "PrefetchBlock"; void RunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bf33be3106..5e59bd1b17 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); AddAttr("padding_idx", "(int64, default -1) " "If the value is -1, it makes no effect to lookup. " diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 09ab7da663..f9ae01ab5d 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get " - << outs[i] << "back"; + VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " + << outs[i] << " back"; rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i], outs[i]); } else { @@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker { "(RPCClient) The RPC client object which will be" "initialized at most once."); AddOutput("Out", - "(SelectedRows) result " + "(LoDTensor) result " "to be fetched from parameter server") .AsDuplicable(); AddAttr>( diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index 542bc3fde2..3bf5d57809 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include #include -#include +#include // NOLINT #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" @@ -37,11 +37,11 @@ namespace m = paddle::operators::math; std::unique_ptr listen_and_serv_op; int selected_port; -void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { +void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) { p::CPUDeviceContext ctx(place); for (int i = 0; i < 2; ++i) { auto var_name = paddle::string::Sprintf("x%d", i); - auto var = scope.Var(var_name); + auto var = scope->Var(var_name); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); float *expect = tensor->mutable_data(place); @@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { } } - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); out_tensor->mutable_data(place); // allocate } -void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { +void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) { p::CPUDeviceContext ctx(place); int64_t height = 10; int64_t row_numel = 10; m::SetConstant set_one; // init x0 std::vector rows0{0, 4, 7}; - auto x0_var = scope.Var("x0"); + auto x0_var = scope->Var("x0"); auto x0 = x0_var->GetMutable(); x0->set_rows(rows0); x0->set_height(height); @@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { // init x1 std::vector rows1{2, 9}; - auto x1_var = scope.Var("x1"); + auto x1_var = scope->Var("x1"); auto x1 = x1_var->GetMutable(); x1->set_rows(rows1); x1->set_height(height); @@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) { f::make_ddim({static_cast(rows1.size()), row_numel}), place); set_one(ctx, x1_value, 1.0); - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out = out_var->GetMutable(); auto out_value = out->mutable_value(); out->set_height(height); @@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) { f::Scope scope; p::CPUPlace place; if (is_sparse) { - InitSelectedRowsInScope(scope, place); + InitSelectedRowsInScope(place, &scope); } else { - InitTensorsInScope(scope, place); + InitTensorsInScope(place, &scope); } // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); auto *optimize_block = program.AppendBlock(root_block); + auto 
*prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensers, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); @@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) { attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"PrefetchBlock", prefetch_block}); listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); LOG(INFO) << "selected port before run " << selected_port; @@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) { // local net f::Scope scope; p::CPUPlace place; - InitTensorsInScope(scope, place); + InitTensorsInScope(place, &scope); // create rpc client var scope.Var("RPC_CLIENT_VAR"); @@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) { f::Scope scope; p::CPUPlace place; p::CPUDeviceContext ctx(place); - InitSelectedRowsInScope(scope, place); + InitSelectedRowsInScope(place, &scope); scope.Var("RPC_CLIENT_VAR"); f::AttributeMap attrs; selected_port = static_cast( diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc index 2cbd9e2394..56b3713d6a 100644 --- a/paddle/fluid/operators/send_vars_op.cc +++ b/paddle/fluid/operators/send_vars_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase { auto ins = Inputs("X"); std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_sent"); + int sync_send = Attr("sync_send"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 074fa9e00f..06cb0550ad 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 element"); auto param_dim = ctx->GetInputDim("Param"); - // TODO(qijun): check dimensions of Param and Grad at complie - // and run time. + // TODO(qijun): check dimensions of Param and Grad at compile + // and runtime. 
ctx->SetOutputDim("ParamOut", param_dim); } diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc index a54f8a2878..a53cbc8ac5 100644 --- a/paddle/fluid/operators/split_ids_op.cc +++ b/paddle/fluid/operators/split_ids_op.cc @@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out."); auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR); - auto ids_dims = ctx->GetInputDim("Ids"); - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } } }; @@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR); + block->Var(out_var)->SetType(input_var->GetType()); } } }; @@ -73,4 +74,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker, ops::SplitIdsOpInferVarType); REGISTER_OP_CPU_KERNEL( - split_ids, ops::SplitIdsOpKernel); + split_ids, ops::SplitIdsOpKernel, + ops::SplitIdsOpKernel); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d36ed398eb..ba1e903dbb 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -24,35 +24,63 @@ namespace operators { template class SplitIdsOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext &ctx) const override { auto place = ctx.GetPlace(); if (!platform::is_cpu_place(place)) { PADDLE_THROW("SplitIds do not support GPU kernel"); } - auto& ids_dims = ctx.Input("Ids")->dims(); - const T* ids = ctx.Input("Ids")->data(); - auto outs = ctx.MultiOutput("Out"); - const size_t shard_num = outs.size(); + const auto *ids_var = ctx.InputVar("Ids"); + if (ids_var->IsType()) { + const auto &ids_dims = ctx.Input("Ids")->dims(); + const T *ids = ctx.Input("Ids")->data(); + auto outs = ctx.MultiOutput("Out"); + const size_t shard_num = outs.size(); - std::vector> out_ids; - out_ids.resize(outs.size()); + std::vector> out_ids; + out_ids.resize(outs.size()); - // split id by their shard_num. - for (int i = 0; i < ids_dims[0]; ++i) { - T id = ids[i]; - size_t shard_id = static_cast(id) % shard_num; - out_ids[shard_id].push_back(id); - } + // split id by their shard_num. 
+ for (int i = 0; i < ids_dims[0]; ++i) { + T id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + out_ids[shard_id].push_back(id); + } + + // create tensor for each shard and send to parameter server + for (size_t i = 0; i < out_ids.size(); ++i) { + auto *shard_t = outs[i]; + std::vector ids = out_ids[i]; + auto *shard_data = shard_t->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + for (size_t i = 0; i < ids.size(); ++i) { + shard_data[i] = ids[i]; + } + } + } else if (ids_var->IsType()) { + const auto *ids_selected_rows = ctx.Input("Ids"); + auto &ids_dims = ids_selected_rows->value().dims(); + PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), ""); + const T *ids = ids_selected_rows->value().data(); + const auto &ids_rows = ids_selected_rows->rows(); + auto outs = ctx.MultiOutput("Out"); + const size_t shard_num = outs.size(); + // get rows for outputs + for (auto &id : ids_rows) { + size_t shard_id = static_cast(id) % shard_num; + outs[shard_id]->mutable_rows()->push_back(id); + } - // create tensor for each shard and send to parameter server - for (size_t i = 0; i < out_ids.size(); ++i) { - auto* shard_t = outs[i]; - std::vector ids = out_ids[i]; - auto* shard_data = shard_t->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - for (size_t i = 0; i < ids.size(); ++i) { - shard_data[i] = ids[i]; + int64_t row_width = ids_dims[1]; + for (auto &out : outs) { + out->set_height(ids_selected_rows->height()); + framework::DDim ddim = framework::make_ddim( + {static_cast(out->rows().size()), row_width}); + T *output = out->mutable_value()->mutable_data(ddim, place); + for (size_t i = 0; i < ddim[0]; ++i) { + memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + row_width * sizeof(T)); + } } } } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 9061e137bd..108f26fafe 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -10,9 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" + #include #include #include + #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" @@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputsDim("X"); size_t N = x_dims.size(); - PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); + PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); + if (N == 1) { + VLOG(3) << "Warning: sum have only one input, may waste memory"; + } framework::DDim in_dim({0}); for (auto& x_dim : x_dims) { diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index e18ace844e..b0522b49f4 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -13,14 +13,17 @@ # limitations under the License. from __future__ import print_function -import framework -from framework import Program, default_main_program, default_startup_program, Parameter, Variable -import optimizer -from layer_helper import LayerHelper -import distributed_splitter as splitter + import math + +import distributed_splitter as splitter +import framework +from framework import Program, default_main_program, Variable from . 
import core -import debuger + +LOOKUP_TABLE_TYPE = "lookup_table" +LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" +RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR" class VarBlock: @@ -35,9 +38,9 @@ class VarBlock: class UnionFind(object): - """ Union-find data struct. + """ Union-find data structure. - Union-find is a data struct that keeps track of a set of elements partitioned + Union-find is a data structure that keeps track of a set of elements partitioned into a number of disjoint (non-overlapping) subsets. Reference: @@ -185,19 +188,66 @@ class DistributeTranspiler: assert (callable(split_method)) if program is None: program = default_main_program() - self.program = program - self.trainers = trainers + self.origin_program = program + self.trainer_num = trainers self.optimize_ops = optimize_ops # TODO(typhoonzero): currently trainer_id is fetched from cluster system # like Kubernetes, we should port this to use etcd later when developing # fluid distributed training with fault-tolerance. self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") + self.pserver_endpoints = pserver_endpoints + + # process lookup_table_op + # 1. check all lookup_table_op is distributed + # 2. check all lookup_table_op share the same table. + distributed_lookup_table_ops = [] + # support only one distributed_lookup_table now + self.table_name = None + for op in program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if op.attrs['is_distributed'] is True: + if self.table_name is None: + self.table_name = op.input("W")[0] + if self.table_name != op.input("W")[0]: + raise RuntimeError("all distributed lookup_table_ops" + " should have only one table") + distributed_lookup_table_ops.append(op) + else: + if self.table_name is not None: + assert op.input("W")[0] != self.table_name + + self.has_distributed_lookup_table = len( + distributed_lookup_table_ops) > 0 # step1: For large parameters and gradients, split them into smaller # blocks. 
param_list = [pg[0] for pg in params_grads] grad_list = [pg[1] for pg in params_grads] + + if self.has_distributed_lookup_table: + param_list = [ + param for param in param_list if param.name != self.table_name + ] + grad_list = [ + grad for grad in grad_list + if grad.name != framework.grad_var_name(self.table_name) + ] + self.table_param_grad = [ + param_grad for param_grad in params_grads + if param_grad[0].name == self.table_name + ][0] + table_grad_var = self.table_param_grad[1] + self.table_grad_list = [ + program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, trainer_id, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) # step2: Create new vars for the parameters and gradients blocks and @@ -229,7 +279,7 @@ class DistributeTranspiler: self.param_grad_ep_mapping[ep]["grads"].append(grad) rpc_client_var = program.global_block().create_var( - name="RPC_CLIENT_VAR", + name=RPC_CLIENT_VAR_NAME, persistable=True, type=core.VarDesc.VarType.RAW) @@ -252,13 +302,19 @@ class DistributeTranspiler: outputs={"Out": [orig_param]}, attrs={"axis": 0}) + if self.has_distributed_lookup_table: + self._replace_lookup_table_op_with_prefetch(program, rpc_client_var, + eplist) + self._split_table_grad_and_add_send_vars(program, rpc_client_var, + pserver_endpoints) + def get_trainer_program(self): # remove optimize ops and add a send op to main_program - self.program.global_block().delete_ops(self.optimize_ops) - self.program.sync_with_cpp() + self.origin_program.global_block().delete_ops(self.optimize_ops) + self.origin_program.sync_with_cpp() # FIXME(typhoonzero): serialize once will fix error occurs when clone. - self.program.__str__() - return self.program + self.origin_program.__str__() + return self.origin_program def get_pserver_program(self, endpoint): """ @@ -294,8 +350,8 @@ class DistributeTranspiler: type=v.type, dtype=v.dtype, shape=v.shape) - if self.trainers > 1: - for trainer_id in xrange(self.trainers): + if self.trainer_num > 1: + for trainer_id in xrange(self.trainer_num): var = pserver_program.global_block().create_var( name="%s.trainer_%d" % (orig_var_name, trainer_id), persistable=False, @@ -309,7 +365,7 @@ class DistributeTranspiler: # step3 optimize_block = pserver_program.create_block(0) # step 4 - # Create a union-find data struct from optimize ops, + # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops # into one set. 
ufind = self._create_ufind(self.optimize_ops) @@ -384,6 +440,23 @@ class DistributeTranspiler: # __append_optimize_op__(glb_op, optimize_block) # break + # process distributed lookup_table + prefetch_block = None + if self.has_distributed_lookup_table: + pserver_index = self.pserver_endpoints.index(endpoint) + self._create_table_optimize_block(pserver_index, pserver_program, + append_block) + prefetch_block = self._create_prefetch_block( + pserver_index, pserver_program, optimize_block) + + # NOTE: if has_distributed_lookup_table is False, then prefetch_block will + # not be executed, so it's safe to use optimize_block to hold the place + if self.has_distributed_lookup_table: + assert prefetch_block is not None + else: + assert prefetch_block is None + prefetch_block = pserver_program.global_block() + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -392,8 +465,10 @@ class DistributeTranspiler: attrs={ "OptimizeBlock": optimize_block, "endpoint": endpoint, - "Fanin": self.trainers + "Fanin": self.trainer_num, + "PrefetchBlock": prefetch_block }) + pserver_program.sync_with_cpp() return pserver_program @@ -451,6 +526,197 @@ class DistributeTranspiler: attrs=op.attrs) return s_prog + # transpiler function for dis lookup_table + def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var, + eplist): + # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op + self.prefetch_input_vars = None + self.prefetch_output_vars = None + + continue_search_lookup_table_op = True + while continue_search_lookup_table_op: + continue_search_lookup_table_op = False + all_ops = program.global_block().ops + for op in all_ops: + if op.type == LOOKUP_TABLE_TYPE: + continue_search_lookup_table_op = True + + op_index = list(all_ops).index(op) + ids_name = op.input("Ids") + out_name = op.output("Out") + + if self.prefetch_input_vars is None: + ids_var = program.global_block().vars[ids_name[0]] + self.prefetch_input_vars = self.create_splited_vars( + source_var=ids_var, + block=program.global_block(), + tag="_prefetch_in_") + if self.prefetch_output_vars is None: + out_var = program.global_block().vars[out_name[0]] + self.prefetch_output_vars = self.create_splited_vars( + source_var=out_var, + block=program.global_block(), + tag="_prefetch_out_") + + # insert split_ids_op + program.global_block().insert_op( + index=op_index, + type="split_ids", + inputs={ + 'Ids': [ + program.global_block().vars[varname] + for varname in ids_name + ] + }, + outputs={"Out": self.prefetch_input_vars}) + + # insert prefetch_op + program.global_block().insert_op( + index=op_index + 1, + type="prefetch", + inputs={'X': self.prefetch_input_vars}, + outputs={ + "Out": self.prefetch_output_vars, + "RPCClient": rpc_client_var + }, + attrs={"epmap": eplist}) + + # insert concat_op + program.global_block().insert_op( + index=op_index + 2, + type="concat", + inputs={'X': self.prefetch_output_vars}, + outputs={ + "Out": [ + program.global_block().vars[varname] + for varname in out_name + ] + }, + attrs={"axis": 0}) + + # delete lookup_table_op + program.global_block().delete_ops([op]) + program.sync_with_cpp() + # break for loop + break + + def _split_table_grad_and_add_send_vars(self, program, rpc_client_var, + pserver_endpoints): + # 2. 
add split_ids_op and send_vars_op to send gradient to pservers + # there should only be one table_name + all_ops = program.global_block().ops + table_grad_name = framework.grad_var_name(self.table_name) + for op in all_ops: + if table_grad_name in op.output_arg_names: + op_index = list(all_ops).index(op) + # insert split_ids_op + program.global_block().insert_op( + index=op_index + 1, + type="split_ids", + inputs={ + 'Ids': [program.global_block().vars[table_grad_name]] + }, + outputs={"Out": self.table_grad_list}) + program.global_block().insert_op( + index=op_index + 2, + type="send_vars", + inputs={'X': self.table_grad_list}, + outputs={"RPCClient": rpc_client_var}, + attrs={"sync_send": True, + "epmap": pserver_endpoints}) + break + + def _create_prefetch_block(self, pserver_index, pserver_program, + optimize_block): + # STEP: create prefetch block + table_var = pserver_program.global_block().vars[self.table_name] + prefetch_block = pserver_program.create_block(optimize_block.idx) + trainer_ids = self.prefetch_input_vars[pserver_index] + pserver_ids = pserver_program.global_block().create_var( + name=trainer_ids.name, + type=trainer_ids.type, + shape=trainer_ids.shape, + dtype=trainer_ids.dtype) + trainer_out = self.prefetch_output_vars[pserver_index] + pserver_out = pserver_program.global_block().create_var( + name=trainer_out.name, + type=trainer_out.type, + shape=trainer_out.shape, + dtype=trainer_out.dtype) + prefetch_block.append_op( + type=LOOKUP_TABLE_TYPE, + inputs={'Ids': pserver_ids, + "W": table_var}, + outputs={"Out": pserver_out}, + attrs={ + "is_sparse": True, # has no effect on lookup_table op + "is_distributed": True, + "padding_idx": -1 + }) + return prefetch_block + + def _create_table_optimize_block(self, pserver_index, pserver_program, + append_block): + def _clone_var(block, var, persistable=True): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + persistable=persistable) + + # STEP: create table optimize block + # create table param and grad var in pserver program + param_var = _clone_var( + pserver_program.global_block(), + self.origin_program.global_block().vars[self.table_name]) + grad_var = _clone_var( + pserver_program.global_block(), + self.origin_program.global_block().vars[framework.grad_var_name( + self.table_name)], + persistable=False) + + # create grad vars in pserver program + table_grad_var = self.table_param_grad[1] + table_grad_list = [ + pserver_program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, index, pserver_index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) for index in range(self.trainer_num) + ] + + # create table optimize block in pserver program + table_opt_op = [ + op for op in self.optimize_ops + if op.input("Param")[0] == self.table_name + ][0] + table_opt_block = pserver_program.create_block(append_block.idx) + # only support sgd now + assert table_opt_op.type == "sgd" + + # append sum op for table_grad_list + table_opt_block.append_op( + type="sum", + inputs={"X": table_grad_list}, + outputs={"Out": [grad_var]}) + + lr_var = pserver_program.global_block().vars[table_opt_op.input( + "LearningRate")[0]] + inputs = { + "Param": [param_var], + "Grad": [grad_var], + "LearningRate": [lr_var] + } + outputs = {"ParamOut": [param_var]} + table_opt_block.append_op( + type=table_opt_op.type, + inputs=inputs, + outputs=outputs, + attrs=table_opt_op.attrs) + # 
====================== private transpiler functions ===================== def _create_vars_from_blocklist(self, program, @@ -512,7 +778,17 @@ class DistributeTranspiler: program.global_block().sync_with_cpp() return var_mapping - def _clone_var(self, block, var): + def create_splited_vars(self, source_var, block, tag): + return [ + block.create_var( + name=str(source_var.name + tag + str(index)), + type=source_var.type, + shape=source_var.shape, + dtype=source_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + + def _clone_var(self, block, var, persistable=True): assert isinstance(var, Variable) return block.create_var( name=var.name, @@ -520,12 +796,12 @@ class DistributeTranspiler: dtype=var.dtype, type=var.type, lod_level=var.lod_level, - persistable=True) + persistable=persistable) def _append_split_op(self, program, gradblocks): # Split variables that need to be split and append respective ops add_suffix = False - if self.trainers > 1: + if self.trainer_num > 1: add_suffix = True var_mapping = self._create_vars_from_blocklist( program, gradblocks, add_trainer_suffix=add_suffix) @@ -616,9 +892,9 @@ class DistributeTranspiler: return merged_var = \ pserver_block.vars[self._orig_varname(grad_block.name)] - if self.trainers > 1: + if self.trainer_num > 1: vars2merge = [] - for i in xrange(self.trainers): + for i in xrange(self.trainer_num): per_trainer_name = "%s.trainer_%d" % \ (self._orig_varname(grad_block.name), i) vars2merge.append(pserver_block.vars[per_trainer_name]) @@ -633,7 +909,7 @@ class DistributeTranspiler: type="scale", inputs={"X": merged_var}, outputs={"Out": merged_var}, - attrs={"scale": 1.0 / float(self.trainers)}) + attrs={"scale": 1.0 / float(self.trainer_num)}) new_inputs[key] = merged_var elif key == "Param": # param is already created on global program @@ -669,7 +945,7 @@ class DistributeTranspiler: new_shape = None if key in ["Param", "Grad", "LearningRate"]: continue - var = self.program.global_block().vars[opt_op.input(key)[0]] + var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape param_shape = new_inputs["Param"].shape new_shape = self._get_optimizer_input_shape(opt_op.type, key, @@ -682,8 +958,8 @@ class DistributeTranspiler: new_inputs[key] = tmpvar # change output's ParamOut variable - outputs = self._get_output_map_from_op(self.program.global_block().vars, - opt_op) + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, opt_op) outputs["ParamOut"] = new_inputs["Param"] optimize_block.append_op( @@ -695,8 +971,8 @@ class DistributeTranspiler: def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated - inputs = self._get_input_map_from_op(self.program.global_block().vars, - opt_op) + inputs = self._get_input_map_from_op( + self.origin_program.global_block().vars, opt_op) for varlist in inputs.itervalues(): if not isinstance(varlist, list): varlist = [varlist] @@ -709,8 +985,8 @@ class DistributeTranspiler: dtype=var.dtype, shape=var.shape) - outputs = self._get_output_map_from_op(self.program.global_block().vars, - opt_op) + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, opt_op) for varlist in outputs.itervalues(): if not isinstance(varlist, list): @@ -783,7 +1059,6 @@ class DistributeTranspiler: if same_or_split_var(n, param) and n != param: return True return False - return False def _get_input_map_from_op(self, 
varmap, op): """Returns a dict from op input name to the vars in varmap.""" @@ -821,7 +1096,7 @@ class DistributeTranspiler: find_ops = [] # find ops which output is lr var - block = self.program.global_block() + block = self.origin_program.global_block() for op in block.ops: if set(op.output_arg_names) & lr_vars: find_ops.append(op) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 7ca4ed9a7b..5c2c2dd7ab 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -218,6 +218,7 @@ def fc(input, def embedding(input, size, is_sparse=False, + is_distributed=False, padding_idx=None, param_attr=None, dtype='float32'): @@ -268,8 +269,11 @@ def embedding(input, inputs={'Ids': input, 'W': w}, outputs={'Out': tmp}, - attrs={'is_sparse': is_sparse, - 'padding_idx': padding_idx}) + attrs={ + 'is_sparse': is_sparse, + 'is_distributed': is_distributed, + 'padding_idx': padding_idx + }) return tmp From d24b5e060f738139feab99b1c4a97042bce1982f Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Thu, 12 Apr 2018 14:33:38 +0200 Subject: [PATCH 59/62] The fully connected: the operator is removed when the MKLDNN flag is OFF --- paddle/fluid/operators/CMakeLists.txt | 8 ++++++++ python/paddle/fluid/tests/unittests/CMakeLists.txt | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 3c8696b508..7d6781c2c3 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -245,9 +245,17 @@ op_library(channel_send_op DEPS concurrency) op_library(channel_recv_op DEPS concurrency) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) + +# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one MKLDNN's operator +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM GENERAL_OPS fc_op) +endif(NOT WITH_MKLDNN) + foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() + file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") add_subdirectory(reader) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f10ef9b634..3bd24c98a2 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,6 +1,12 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +# The fully connected test is removed whe the WITH_MKLDNN flag is OFF +# Because the fully connected layer has only one kernel (MKLDNN) +if(NOT WITH_MKLDNN) + list(REMOVE_ITEM TEST_OPS test_fc_op) +endif(NOT WITH_MKLDNN) + if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) endif(NOT WITH_DISTRIBUTE) From 617e790a596ccd3f2eb940fcfe76803c01ee6cc8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 12 Apr 2018 11:48:17 -0700 Subject: [PATCH 60/62] fix cuda 7.5 compile error (#9885) --- paddle/fluid/operators/math/math_function.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index e53183603f..c28047e6e9 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -288,9 +288,14 @@ void batched_gemm( // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, "cublas Hgemm 
requires GPU compute capability >= 53"); + +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -310,9 +315,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> @@ -332,9 +341,13 @@ void batched_gemm( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int strideC = M * N; +#if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); +#else + PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5"); +#endif } template <> From 59234b7287980ef0fec0a064f524e6c25697b7c7 Mon Sep 17 00:00:00 2001 From: redrayqll Date: Fri, 13 Apr 2018 03:25:44 +0800 Subject: [PATCH 61/62] =?UTF-8?q?modify=20=E2=80=9Cif-then-else=E2=80=9D?= =?UTF-8?q?=20md=20path=20(#9876)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/fluid/design/motivation/fluid.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md index 5e147f8263..4b7696cc1b 100644 --- a/doc/fluid/design/motivation/fluid.md +++ b/doc/fluid/design/motivation/fluid.md @@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop. -We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid. +We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid. 
## Turing Completeness From c241959e489053259274edb2614381d7058463a4 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 12 Apr 2018 16:45:40 -0700 Subject: [PATCH 62/62] Fix CPPLint errors in operators (#9828) * Fix CPPLint errors in operators * Fix prior box op * Fix Prior Box op * Fix top_k_op.cu * Fix pool mkmldnn * Fix pool mkmldnn --- paddle/fluid/operators/pad_op.h | 2 + paddle/fluid/operators/pool_mkldnn_op.cc | 12 ++- paddle/fluid/operators/pool_op.h | 2 + paddle/fluid/operators/pool_with_index_op.h | 1 + paddle/fluid/operators/prelu_op.cc | 1 - paddle/fluid/operators/prior_box_op.cc | 2 +- paddle/fluid/operators/prior_box_op.cu | 2 +- paddle/fluid/operators/prior_box_op.h | 18 +++-- paddle/fluid/operators/rank_loss_op.cc | 1 + paddle/fluid/operators/recv_op.cc | 2 +- paddle/fluid/operators/roi_pool_op.h | 2 + paddle/fluid/operators/strided_memcpy.h | 4 +- paddle/fluid/operators/top_k_op.cu | 83 +++++++++++---------- 13 files changed, 73 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h index a36abe3789..c93c096575 100644 --- a/paddle/fluid/operators/pad_op.h +++ b/paddle/fluid/operators/pad_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index c88578570c..63eaaedcd5 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + mkldnn::memory({src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + mkldnn::memory({dst_md, mkldnn_engine}, + static_cast(const_cast(output_data))); auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, *workspace_memory); @@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { pool_bwd_desc, mkldnn_engine, *pool_pd); auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + mkldnn::memory({diff_src_md, mkldnn_engine}, + static_cast(const_cast(in_x_grad_data))); auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + mkldnn::memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))); auto bwd_prim = mkldnn::pooling_backward( pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 2fec50ef25..a48127ea69 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 83e7bd138a..b55fa76eae 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 7fb45bd19d..8eaa12a4a6 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/prelu_op.h" - #include namespace paddle { diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index 82e54139c8..058b13eeb8 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { bool flip = ctx->Attrs().Get("flip"); std::vector aspect_ratios_vec; - ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); if (max_sizes.size() > 0) { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu index 76bf2b3b7d..0ea8909296 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/prior_box_op.cu @@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h index 1e4a12aac1..1c62fd8d2c 100644 --- a/paddle/fluid/operators/prior_box_op.h +++ b/paddle/fluid/operators/prior_box_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" @@ -22,23 +24,23 @@ namespace operators { inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, bool flip, - std::vector& output_aspect_ratior) { + std::vector* output_aspect_ratior) { constexpr float epsilon = 1e-6; - output_aspect_ratior.clear(); - output_aspect_ratior.push_back(1.0f); + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { float ar = input_aspect_ratior[i]; bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { - if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { already_exist = true; break; } } if (!already_exist) { - output_aspect_ratior.push_back(ar); + output_aspect_ratior->push_back(ar); if (flip) { - output_aspect_ratior.push_back(1.0f / ar); + output_aspect_ratior->push_back(1.0f / ar); } } } @@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel { auto clip = ctx.Attr("clip"); std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); T step_w = static_cast(ctx.Attr("step_w")); T step_h = static_cast(ctx.Attr("step_h")); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 767eef5686..a1127f11a7 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 083c1fae5e..a4dcf704a6 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include // NOLINT #include #include "paddle/fluid/framework/data_type.h" @@ -19,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include #include "paddle/fluid/operators/detail/grpc_client.h" namespace paddle { diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index f38c5a3c0c..54e0749031 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 22c1db82e9..7a10218e15 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_dim, const framework::DDim& dst_stride, T* dst) { - using namespace detail; - StridedCopyDimVisitor func(dev_ctx, src, src_stride, dst_stride, dst); + paddle::operators::detail::StridedCopyDimVisitor func( + dev_ctx, src, src_stride, dst_stride, dst); boost::apply_visitor(func, dst_dim); } diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index bfd26c2f22..d7f4d383ce 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/assert.h" namespace paddle { @@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* src, - bool& firstStep, bool& is_empty, - Pair& max, int dim, + bool* firstStep, bool* is_empty, + Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, src, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, src, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int& beam, +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, int beam_size, const T* val, - int* col, bool& firstStep, - bool& is_empty, Pair& max, + int* col, bool* firstStep, + bool* is_empty, Pair* max, int dim, const int tid) { - if (beam > 0) { - int length = beam < beam_size ? beam : beam_size; - if (firstStep) { - firstStep = false; + if (*beam > 0) { + int length = (*beam) < beam_size ? 
*beam : beam_size; + if (*firstStep) { + *firstStep = false; GetTopK(topk, val, col, tid, dim, length); } else { for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - beam) { - topk[k] = topk[k + beam]; + if (k < MaxLength - *beam) { + topk[k] = topk[k + *beam]; } else { topk[k].set(-INFINITY, -1); } } - if (!is_empty) { - GetTopK(topk + MaxLength - beam, val, col, tid, dim, max, + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, length); } } - max = topk[MaxLength - 1]; - if (max.v == -1) is_empty = true; - beam = 0; + *max = topk[MaxLength - 1]; + if ((*max).v == -1) *is_empty = true; + *beam = 0; } } template __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, Pair topk[], T** topVal, - int64_t** topIds, int& beam, int& k, + int64_t** topIds, int* beam, int* k, const int tid, const int warp) { while (true) { __syncthreads(); @@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, (*topVal)++; (*topIds)++; } - if (tid == maxid[0]) beam++; - if (--k == 0) break; + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; __syncthreads(); if (tid == maxid[0]) { - if (beam < MaxLength) { - sh_topk[tid] = topk[beam]; + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; } } if (maxid[0] / 32 == warp) { - if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break; + if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break; } } } @@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, topk[k].set(-INFINITY, -1); } while (k) { - ThreadGetTopK(topk, beam, k, - src + blockIdx.x * lds, firststep, - is_empty, max, dim, tid); + ThreadGetTopK(topk, &beam, k, + src + blockIdx.x * lds, &firststep, + &is_empty, &max, dim, tid); sh_topk[tid] = topk[0]; BlockReduce(sh_topk, maxid, topk, &output, - &indices, beam, k, tid, warp); + &indices, &beam, &k, tid, warp); } } @@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel { KeMatrixTopK<<< grid, threads, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(output_data, output->dims()[1], - indices_data, input_data, - input_width, input_width, int(k)); + .stream()>>>( + output_data, output->dims()[1], indices_data, input_data, input_width, + input_width, static_cast(k)); } };
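
Editor's note: the top_k_op.cu hunks above follow the same cpplint-driven pattern applied throughout PATCH 62/62 — non-const reference parameters become pointer parameters, C-style casts become static_cast, and thread/chrono includes get // NOLINT. The following is a minimal illustrative sketch of that style rule, not part of the patch; the helper name Accumulate is hypothetical.

    // Before (cpplint complaints): non-const reference output parameter
    // and a C-style cast.
    //   void Accumulate(int& counter, double value) { counter += (int)value; }

    #include <iostream>

    // After: the output is passed by pointer (callers must write &total,
    // making mutation visible at the call site) and the cast is explicit.
    void Accumulate(int* counter, double value) {
      *counter += static_cast<int>(value);
    }

    int main() {
      int total = 0;
      Accumulate(&total, 3.7);
      std::cout << total << std::endl;  // prints 3
      return 0;
    }

This mirrors how ThreadGetTopK and BlockReduce were converted above (int& beam -> int* beam, bool& is_empty -> bool* is_empty) and how int(k) became static_cast<int>(k) in TopkOpCUDAKernel.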