From 29254ebe0d7deb5c5091853970d3bbc34579c5f6 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Mon, 23 Apr 2018 14:51:41 +0800
Subject: [PATCH 01/25] Update stacked_dynamic_lstm.py

---
 benchmark/fluid/stacked_dynamic_lstm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
index 5fcbdd64af..73bcc47b4d 100644
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -23,10 +23,10 @@ import random
 import time
 
 import numpy
-import paddle.v2 as paddle
-import paddle.v2.dataset.imdb as imdb
+import paddle
+import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-from paddle.v2 import batch
+import paddle.batch as batch
 import paddle.fluid.profiler as profiler
 
 

From 44b460919c1794a70c2d6b4fa175512b91711464 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Mon, 23 Apr 2018 14:52:26 +0800
Subject: [PATCH 02/25] Update machine_translation.py

---
 benchmark/fluid/machine_translation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
index d7a421c109..adde5f21ac 100644
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
@@ -21,7 +21,7 @@ import argparse
 import time
 import distutils.util
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework

From dd680c69e70e75e099471eeab405d29130961d44 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Mon, 23 Apr 2018 14:52:51 +0800
Subject: [PATCH 03/25] Update mnist.py

---
 benchmark/fluid/mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index dc10ac2ec1..1e2185dfac 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -20,7 +20,7 @@ import numpy as np
 import argparse
 import time
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 

From e964164e36010657aeea67a64ebb416d5ea87c59 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Mon, 23 Apr 2018 14:53:11 +0800
Subject: [PATCH 04/25] Update resnet.py

---
 benchmark/fluid/resnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
index 1af5eaf6b4..831fa2c019 100644
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -23,7 +23,7 @@ import time
 
 import cProfile, pstats, StringIO
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler

From d7762b65de4f17dc702ac765b345911121093a67 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Mon, 23 Apr 2018 14:53:48 +0800
Subject: [PATCH 05/25] Update vgg.py

---
 benchmark/fluid/vgg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
index 9d990eff62..53e34e0cbd 100644
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import sys
 import time
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse

From a41a94f2ee9af8abcbfcc17b87641e4af9efcac8 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Wed, 16 May 2018 13:39:02 +0800
Subject: [PATCH 06/25] support nccl2 dist train in trainer

---
 python/paddle/fluid/trainer.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index c24662ac21..a47af7ccb2 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -131,7 +131,40 @@ class Trainer(object):
             # load params from param_path into scope
             io.load_persistables(exe, dirname=param_path)
 
+    def _transpile_nccl2_dist(self):
+        # Opt in to NCCL2 mode via PADDLE_TRAINER_IPS; other env vars unchecked.
+        if "PADDLE_TRAINER_IPS" not in os.environ:
+            self.nccl_id_var = None
+        else:
+            self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+            port = os.getenv("PADDLE_PSERVER_PORT")
+            worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+            worker_endpoints = []
+            for ip in worker_ips.split(","):
+                worker_endpoints.append(':'.join([ip, port]))
+            self.num_trainers = len(worker_endpoints)  # includes this trainer
+            current_endpoint = os.getenv("POD_IP") + ":" + port
+            worker_endpoints.remove(current_endpoint)  # ValueError if absent
+            # TODO(wuyi): use self.nccl_id_var, self.num_trainers and
+            # self.trainer_id in ParallelExecutor to start distributed training
+            # using NCCL2.
+            self.nccl_id_var = self.startup_program.global_block().create_var(
+                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
+            self.startup_program.global_block().append_op(
+                type="gen_nccl_id",
+                inputs={},
+                outputs={"NCCLID": self.nccl_id_var},
+                attrs={
+                    "endpoint": current_endpoint,
+                    "endpoint_list": worker_endpoints,
+                    "trainer_id": self.trainer_id
+                })
+
     def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
+        self._transpile_nccl2_dist()
+        if self.nccl_id_var is not None:
+            return  # NCCL2 op was added; skip the PADDLE_TRAINING_ROLE path
+
         if "PADDLE_TRAINING_ROLE" not in os.environ:
             return
 

From 11a88147588152d98c8c6f2c01fbbf209579c2bb Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 16 May 2018 14:42:41 +0800
Subject: [PATCH 07/25] add pybind.h to inference lib for static link

---
 cmake/inference_lib.cmake | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 06a7ae5682..4d67be20a7 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -148,4 +148,10 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
 
+set(module "pybind")
+copy(pybind_lib
+  SRCS ${src_dir}/${module}/pybind.h
+  DSTS ${dst_dir}/${module}
+)
+
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 

From ed812bdbe87ae3ccd93847c6cb6fbd6fa232bd30 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Wed, 16 May 2018 15:22:56 +0800
Subject: [PATCH 08/25] benchmark script support multi card train

---
 benchmark/fluid/mnist.py  | 12 ++++++------
 benchmark/fluid/resnet.py | 10 ++++++----
 benchmark/fluid/vgg.py    | 10 ++++++----
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 1e2185dfac..3574c1c0e9 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -159,6 +159,7 @@ def run_benchmark(model, args):
         paddle.dataset.mnist.train(), batch_size=args.batch_size)
 
     accuracy = fluid.metrics.Accuracy()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
     iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
@@ -175,17 +176,16 @@ def run_benchmark(model, args):
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([len(y_data), 1])
 
-            outs = exe.run(
-                fluid.default_main_program(),
+            outs = train_exe.run(
                 feed={"pixel": img_data,
                       "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name]
             )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(value=outs[1], weight=outs[2])
+            accuracy.update(value=np.array(np.mean(outs[1])), weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)
-            loss = np.array(outs[0])
-            acc = np.array(outs[1])
+            loss = np.mean(np.array(outs[0]))
+            acc = np.mean(np.array(outs[1]))
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
index 831fa2c019..5f60f806f7 100644
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -241,6 +241,7 @@ def run_benchmark(model, args):
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     accuracy = fluid.average.WeightedAverage()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
     if args.use_fake_data:
         data = train_reader().next()
         image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
@@ -264,14 +265,15 @@ def run_benchmark(model, args):
                                      data)).astype('float32')
                 label = np.array(map(lambda x: x[1], data)).astype('int64')
                 label = label.reshape([-1, 1])
-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
+            loss, acc, weight = train_exe.run(
                 feed={'data': image,
                       'label': label},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
+                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name])
             iters += 1
             num_samples += len(label)
-            accuracy.add(value=acc, weight=weight)
+            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
+            loss = np.mean(np.array(loss))
+            acc = np.mean(np.array(acc))
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
index 53e34e0cbd..261446e7e9 100644
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -169,6 +169,7 @@ def main():
 
     iters, num_samples, start_time = 0, 0, time.time()
     accuracy = fluid.average.WeightedAverage()
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
     for pass_id in range(args.pass_num):
         accuracy.reset()
         train_accs = []
@@ -184,14 +185,15 @@ def main():
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
 
-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
+            loss, acc, weight = train_exe.run(
                 feed={"pixel": img_data,
                       "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
-            accuracy.add(value=acc, weight=weight)
+                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name])
+            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
             iters += 1
             num_samples += len(y_data)
+            loss = np.mean(np.array(loss))
+            acc = np.mean(np.array(acc))
             print(
                 "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                 (pass_id, iters, loss, acc)

From ff24f789cbef9a54c90a55827a69091371cddec6 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 16 May 2018 15:35:05 +0800
Subject: [PATCH 09/25] update pybind.h source dir

---
 CMakeLists.txt            | 2 ++
 cmake/inference_lib.cmake | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 030bd19b3f..6d87031314 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,6 +216,8 @@ endif(WITH_GOLANG)
 
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
+message(STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 4d67be20a7..807a48a41f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -150,7 +150,7 @@ copy(string_lib
 
 set(module "pybind")
 copy(pybind_lib
-  SRCS ${src_dir}/${module}/pybind.h
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
   DSTS ${dst_dir}/${module}
 )
 

From 1d69b1ffe37c2356710c63e611931afa026ecfd2 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Wed, 16 May 2018 16:06:33 +0800
Subject: [PATCH 10/25] change some code style

---
 benchmark/fluid/mnist.py  | 8 ++++++--
 benchmark/fluid/resnet.py | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 3574c1c0e9..1cb4314fb2 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -179,9 +179,13 @@ def run_benchmark(model, args):
             outs = train_exe.run(
                 feed={"pixel": img_data,
                       "label": y_data},
-                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name]
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ]
             )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(value=np.array(np.mean(outs[1])), weight=np.mean(np.array(outs[2])))
+            accuracy.update(
+                    value=np.array(np.mean(outs[1])), 
+                    weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)
             loss = np.mean(np.array(outs[0]))
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
index 5f60f806f7..0fd7258a80 100644
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -268,7 +268,9 @@ def run_benchmark(model, args):
             loss, acc, weight = train_exe.run(
                 feed={'data': image,
                       'label': label},
-                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name])
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ])
             iters += 1
             num_samples += len(label)
             accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))

From 3c2fe94c694664055b6122e3f7ef78276b761ad3 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 16 May 2018 16:54:26 +0800
Subject: [PATCH 11/25] clean code

---
 CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d87031314..030bd19b3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,8 +216,6 @@ endif(WITH_GOLANG)
 
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
-message(STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
-
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 

From 113548801f05c01e1e16962fa09a9446a52c73f5 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Wed, 16 May 2018 16:55:29 +0800
Subject: [PATCH 12/25] change some code style

---
 benchmark/fluid/mnist.py | 4 ++--
 benchmark/fluid/vgg.py   | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 1cb4314fb2..8d51d4aa60 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -184,8 +184,8 @@ def run_benchmark(model, args):
                 ]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.update(
-                    value=np.array(np.mean(outs[1])), 
-                    weight=np.mean(np.array(outs[2])))
+                value=np.array(np.mean(outs[1])), 
+                weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)
             loss = np.mean(np.array(outs[0]))
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
index 261446e7e9..2a9566a45c 100644
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -188,7 +188,9 @@ def main():
             loss, acc, weight = train_exe.run(
                 feed={"pixel": img_data,
                       "label": y_data},
-                fetch_list=[avg_cost.name, batch_acc.name, batch_size_tensor.name])
+                fetch_list=[
+                    avg_cost.name, batch_acc.name, batch_size_tensor.name
+                ])
             accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
             iters += 1
             num_samples += len(y_data)

From a85d79cdcb1463c57afd5a80304dce005918eb23 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Wed, 16 May 2018 18:39:18 +0800
Subject: [PATCH 13/25] change some code style

---
 benchmark/fluid/mnist.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 8d51d4aa60..1cb4314fb2 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -184,8 +184,8 @@ def run_benchmark(model, args):
                 ]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.update(
-                value=np.array(np.mean(outs[1])), 
-                weight=np.mean(np.array(outs[2])))
+                    value=np.array(np.mean(outs[1])), 
+                    weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)
             loss = np.mean(np.array(outs[0]))

From 07536903d08dc0b01cc4f871d03b55ae82012f9a Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Wed, 16 May 2018 19:08:04 +0800
Subject: [PATCH 14/25] change some code style

---
 benchmark/fluid/mnist.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 1cb4314fb2..8d51d4aa60 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -184,8 +184,8 @@ def run_benchmark(model, args):
                 ]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.update(
-                    value=np.array(np.mean(outs[1])), 
-                    weight=np.mean(np.array(outs[2])))
+                value=np.array(np.mean(outs[1])), 
+                weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)
             loss = np.mean(np.array(outs[0]))

From 1ee9fea3518d75ca9e668df2d8a1af6dd18d3827 Mon Sep 17 00:00:00 2001
From: kolinwei <331911734@qq.com>
Date: Wed, 16 May 2018 20:14:06 +0800
Subject: [PATCH 15/25] change some code style

---
 benchmark/fluid/mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 8d51d4aa60..400200c474 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -184,7 +184,7 @@ def run_benchmark(model, args):
                 ]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.update(
-                value=np.array(np.mean(outs[1])), 
+                value=np.array(np.mean(outs[1])),
                 weight=np.mean(np.array(outs[2])))
             iters += 1
             num_samples += len(y_data)

From d73f2bd6bd37282c3b8e281a9343870dfc23e05f Mon Sep 17 00:00:00 2001
From: Kexin Zhao <zhaokexin01@baidu.com>
Date: Tue, 15 May 2018 15:58:40 -0700
Subject: [PATCH 16/25] fix data_feeder lod bug

---
 python/paddle/fluid/data_feeder.py            |  4 +-
 python/paddle/fluid/tests/test_data_feeder.py | 61 ++++++++++++++++---
 2 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 0051b69847..a44e078d0c 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -54,9 +54,9 @@ class DataToLoDTensorConverter(object):
             self.data.append(data)
         else:
             cur_lod_len = len(data)
-            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            lod[0].append(lod[0][-1] + cur_lod_len)
             for each_data in data:
-                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+                self._feed_impl_(each_data, lod[1:], lod_level - 1)
 
     def done(self):
         arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index 861dd3174a..ce3ba3ebc5 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -13,15 +13,62 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+import unittest
 
 
-def test_converter():
-    img = fluid.layers.data(name='image', shape=[1, 28, 28])
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
-    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
-    print(result)
+class TestDataFeeder(unittest.TestCase):
+    def test_lod_level_0_converter(self):
+        img = fluid.layers.data(name='image', shape=[1, 28, 28])
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
+        print(result)
+
+        self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['image'].lod(), [])
+        self.assertEqual(result['label'].lod(), [])
+
+    def test_lod_level_1_converter(self):
+        # lod_level = 1: one level of offsets over a flat list of words
+        # each sentence has a different number of words
+        sentences = fluid.layers.data(
+            name='sentences', shape=[1], dtype='int64', lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([sentences, label], fluid.CPUPlace())
+
+        # expected lod = [[0, 3, 5, 9]] (cumulative sentence lengths 3, 2, 4)
+        # data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
+        print(result)
+
+        self.assertEqual(result['sentences'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [3, 1])
+        self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]])
+        self.assertEqual(result['label'].lod(), [])
+
+    def test_lod_level_2_converter(self):
+        # lod_level = 2
+        # paragraphs -> sentences -> words
+        paragraphs = fluid.layers.data(
+            name='paragraphs', shape=[1], dtype='int64', lod_level=2)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([paragraphs, label], fluid.CPUPlace())
+
+        # expected lod = [[0, 2, 3], [0, 3, 5, 9]]: outermost level comes first
+        # data = [[[1, 2, 3], [4, 5]], [[6, 7, 8, 9]]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
+        print(result)
+
+        self.assertEqual(result['paragraphs'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]])
+        self.assertEqual(result['label'].lod(), [])
 
 
 if __name__ == '__main__':
-    test_converter()
+    unittest.main()

From 14248a64d7016850ae0bd51752b95aaf1dac295c Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 17 May 2018 02:37:20 +0800
Subject: [PATCH 17/25] Fix hang when input is duplicated (#10709)

---
 paddle/fluid/framework/details/op_handle_base.h           | 8 ++++++++
 .../framework/details/threaded_ssa_graph_executor.cc      | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index fe1735d05d..8f94206a87 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -70,6 +70,14 @@ class OpHandleBase {
 
   const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
 
+  // Returns the number of distinct input vars; a var that appears more than
+  // once in inputs_ is counted a single time.
+  size_t NoDupInputSize() const {
+    std::unordered_set<VarHandleBase *> distinct(inputs_.begin(),
+                                                 inputs_.end());
+    return distinct.size();
+  }
+
   const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 
  protected:
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index ef263d82c5..815f739371 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -174,7 +174,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
 void ThreadedSSAGraphExecutor::InsertPendingOp(
     std::unordered_map<OpHandleBase *, size_t> *pending_ops,
     OpHandleBase *op_instance) const {
-  pending_ops->insert({op_instance, op_instance->Inputs().size()});
+  pending_ops->insert({op_instance, op_instance->NoDupInputSize()});
 }
 
 void ThreadedSSAGraphExecutor::InsertPendingVar(

From b67ce353fae31a352bfe0642925139f87aa2d858 Mon Sep 17 00:00:00 2001
From: "Yang Yang(Tony)" <yangyang62@baidu.com>
Date: Wed, 16 May 2018 14:02:29 -0700
Subject: [PATCH 18/25] speed up test label semantic roles (#10718)

---
 .../tests/book/test_label_semantic_roles.py   | 27 +++++--------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 09793760e5..f1ee5dfd99 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -182,12 +182,6 @@ def train(use_cuda, save_dirname=None, is_local=True):
     crf_decode = fluid.layers.crf_decoding(
         input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
-        input=crf_decode,
-        label=target,
-        chunk_scheme="IOB",
-        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
-
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
@@ -203,7 +197,6 @@ def train(use_cuda, save_dirname=None, is_local=True):
 
     def train_loop(main_program):
         exe.run(fluid.default_startup_program())
-
         embedding_param = fluid.global_scope().find_var(
             embedding_name).get_tensor()
         embedding_param.set(
@@ -213,27 +206,19 @@ def train(use_cuda, save_dirname=None, is_local=True):
         start_time = time.time()
         batch_id = 0
         for pass_id in xrange(PASS_NUM):
-            chunk_evaluator.reset(exe)
             for data in train_data():
-                cost, precision, recall, f1_score = exe.run(
-                    main_program,
-                    feed=feeder.feed(data),
-                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
-                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
-                    exe)
+                cost = exe.run(main_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_cost])
+                cost = cost[0]
 
                 if batch_id % 10 == 0:
-                    print("avg_cost:" + str(cost) + " precision:" + str(
-                        precision) + " recall:" + str(recall) + " f1_score:" +
-                          str(f1_score) + " pass_precision:" + str(
-                              pass_precision) + " pass_recall:" + str(
-                                  pass_recall) + " pass_f1_score:" + str(
-                                      pass_f1_score))
+                    print("avg_cost:" + str(cost))
                     if batch_id != 0:
                         print("second per batch: " + str((time.time(
                         ) - start_time) / batch_id))
                     # Set the threshold low to speed up the CI test
-                    if float(pass_precision) > 0.01:
+                    if float(cost) < 60.0:
                         if save_dirname is not None:
                             # TODO(liuyiqun): Change the target to crf_decode
                             fluid.io.save_inference_model(save_dirname, [

From eec1ac8638e4f5d447cb99276df6ec4c816d88c6 Mon Sep 17 00:00:00 2001
From: Kexin Zhao <zhaokexin01@baidu.com>
Date: Wed, 16 May 2018 16:46:30 -0700
Subject: [PATCH 19/25] fix warning

---
 paddle/fluid/inference/tensorrt/convert/op_converter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index abc9ebf472..1cd3ed9a00 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -49,7 +49,7 @@ class OpConverter {
   // convert fluid block to tensorrt network
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     TensorRTEngine* engine) {
-    for (size_t i = 0; i < block.ops_size(); i++) {
+    for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
       OpConverter::Run(op, engine);
     }

From dd6742ff85498611b148a567f9ee52ed75802270 Mon Sep 17 00:00:00 2001
From: Lei Wang <bestwanglei@gmail.com>
Date: Wed, 16 May 2018 17:17:51 -0700
Subject: [PATCH 20/25] Scripts: generate dockerfile after each build. (#10719)

---
 paddle/scripts/paddle_build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index cb0624eac4..928b95b4f5 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -480,6 +480,7 @@ function main() {
       build)
         cmake_gen ${PYTHON_ABI:-""}
         build
+        gen_dockerfile
         ;;
       build_android)
         build_android

From 32a15ed90c17a4865cf0d0e070b8ba09f307ebd6 Mon Sep 17 00:00:00 2001
From: Lei Wang <bestwanglei@gmail.com>
Date: Wed, 16 May 2018 17:38:33 -0700
Subject: [PATCH 21/25] Doc: fix missing parenthesis. (#10726)

---
 doc/fluid/design/concepts/functions_operators_layers.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 30bc488a18..1f86b99e51 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -40,7 +40,7 @@ template <typename T>
 class FCOp : public OperatorBase {
  public:
   void Run(...) {
-    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
   }
 };
 REGISTER_OP(FCOp, "fc");

From 3e34c9798a201df38c335d201442554f2c405a2d Mon Sep 17 00:00:00 2001
From: zhanghaichao <zhanghaichao@baidu.com>
Date: Wed, 16 May 2018 16:58:37 -0700
Subject: [PATCH 22/25] improved the documentation for the sequence_pool
 function

---
 python/paddle/fluid/layers/nn.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1786be22fd..914434ab1b 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1329,6 +1329,8 @@ def sequence_pool(input, pool_type):
          sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
                     6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
          max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+         last   : out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+         first  : out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
 
     Args:
         input(variable): The input variable which is a LoDTensor.
@@ -1348,6 +1350,8 @@ def sequence_pool(input, pool_type):
              sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
              sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
              max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
+             last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
+             first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
     """
     helper = LayerHelper('sequence_pool', **locals())
     dtype = helper.input_dtype()
@@ -3769,13 +3773,13 @@ def label_smooth(label,
 
 def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
     """
-    Region of interest pooling (also known as RoI pooling) is to perform 
+    Region of interest pooling (also known as RoI pooling)
         is to perform max pooling on inputs of nonuniform sizes to obtain
         fixed-size feature maps (e.g. 7*7).
-    The operator has three steps: 
-        1. Dividing each region proposal into equal-sized sections with 
-           the pooled_width and pooled_height 
-        2. Finding the largest value in each section 
+    The operator has three steps:
+        1. Dividing each region proposal into equal-sized sections with
+           the pooled_width and pooled_height
+        2. Finding the largest value in each section
         3. Copying these max values to the output buffer
 
     Args:
@@ -3783,8 +3787,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
         rois (Variable): ROIs (Regions of Interest) to pool over. It should
                          be a 2-D one level LoTensor of shape [num_rois, 4].
                          The layout is [x1, y1, x2, y2], where (x1, y1)
-                         is the top left coordinates, and (x2, y2) is the 
-                         bottom right coordinates. The num_rois is the 
+                         is the top left coordinates, and (x2, y2) is the
+                         bottom right coordinates. The num_rois is the
                          total number of ROIs in this batch data.
         pooled_height (integer): The pooled output height. Default: 1
         pooled_width (integer): The pooled output width. Default: 1
@@ -3793,11 +3797,11 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
                                to the scale used when pooling. Default: 1.0
 
     Returns:
-        pool_out (Variable): The output is a 4-D tensor of the shape 
+        pool_out (Variable): The output is a 4-D tensor of the shape
                              (num_rois, channels, pooled_h, pooled_w).
 
     Examples:
-            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) 
+            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
     """
     helper = LayerHelper('roi_pool', **locals())
     dtype = helper.input_dtype()

From 1f8243b7f8dd2bd3a90d70a51d222ff83663aa47 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Thu, 17 May 2018 09:14:09 +0800
Subject: [PATCH 23/25] Refine smooth L1 loss. (#10713)

---
 paddle/fluid/operators/smooth_l1_loss_op.cc | 25 +++++++++++++++++++--
 python/paddle/fluid/layers/nn.py            | 16 ++++++-------
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index c44c5f164b..622420c1c3 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -105,7 +105,7 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("X");
+    auto in_dims = ctx->GetInputDim("Diff");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
@@ -127,12 +127,33 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   }
 };
 
+// Describes smooth_l1_loss_grad: fed by the forward Diff output, not X/Y.
+class SmoothL1LossGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Owned from the start so the desc is freed if anything below throws.
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("smooth_l1_loss_grad");
+    op->SetInput("InsideWeight", Input("InsideWeight"));
+    op->SetInput("OutsideWeight", Input("OutsideWeight"));
+    op->SetInput("Diff", Output("Diff"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    // Gradients are produced for both forward inputs.
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SmoothL1LossGradMaker);
 REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1786be22fd..93c5e6ba96 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -3263,35 +3263,35 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
     **Smooth L1 Loss Operator. **
 
-    This operator computes the smooth l1 loss for X and Y.
+    This operator computes the smooth L1 loss for X and Y.
     The operator takes the first dimension of X and Y as batch size.
-    For each instance, it computes the smooth l1 loss element by element first
+    For each instance, it computes the smooth L1 loss element by element first
     and then sums all the losses. So the shape of Out is [batch_size, 1].
 
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
-            l1 loss op with shape [batch_size, dim1, ..., dimN].
+            L1 loss op with shape [batch_size, dim1, ..., dimN].
         y (Variable): A tensor with rank at least 2. The target value of smooth
-            l1 loss op with same shape as x.
+            L1 loss op with same shape as x.
         inside_weight (Variable|None):  A tensor with rank at least 2. This
             input is optional and should have same shape with x. If provided,
             the result of (x - y) will be multiplied by this tensor element by
             element.
         outside_weight (Variable|None): A tensor with rank at least 2. This
             input is optional and should have same shape with x. If provided,
-            the out smooth l1 loss will be multiplied by this tensor element
+            the out smooth L1 loss will be multiplied by this tensor element
             by element.
-        sigma (float|None): Hyper parameter of smooth l1 loss op. A float scalar
+        sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar
             with default value 1.0.
     Returns:
-        Variable: A tensor with rank be 2. The output smooth l1 loss with
+        Variable: A tensor with rank 2. The output smooth L1 loss with
             shape [batch_size, 1].
 
     Examples:
         .. code-block:: python
 
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[100], dtype='int64')
+            label = fluid.layers.data(name='label', shape=[100], dtype='float32')
             fc = fluid.layers.fc(input=data, size=100)
             out = fluid.layers.smooth_l1(x=fc, y=label)
     """

From 63012df4970814062ad3be6a4139a4d5f6a50bc0 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 17 May 2018 09:59:57 +0800
Subject: [PATCH 24/25] Switch scope/program for every test (#10702)

---
 .../fluid/tests/unittests/CMakeLists.txt      |  4 +-
 tools/test_runner.py                          | 48 +++++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 tools/test_runner.py

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 496acc5791..2ae9653953 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -28,11 +28,11 @@ function(py_test_modules TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ARGS ENVS)
+    set(multiValueArgs MODULES DEPS ENVS)
     cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-             ${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
+             ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 endfunction()
diff --git a/tools/test_runner.py b/tools/test_runner.py
new file mode 100644
index 0000000000..9dc750b890
--- /dev/null
+++ b/tools/test_runner.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import sys
+import paddle.fluid as fluid
+import importlib
+import cStringIO
+
+
+def main():
+    """Run each module in sys.argv[1:] under fresh fluid programs/scope."""
+    sys.path.append(os.getcwd())
+    some_test_failed = False
+    for module_name in sys.argv[1:]:
+        output = cStringIO.StringIO()
+        # New programs, scope and name generator for every test module.
+        main_prog, startup_prog = fluid.Program(), fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.scope_guard(scope):
+                with fluid.unique_name.guard():
+                    module = importlib.import_module(module_name)
+                    tests = unittest.TestLoader().loadTestsFromModule(module)
+                    res = unittest.TextTestRunner(stream=output).run(tests)
+                    if not res.wasSuccessful():
+                        some_test_failed = True
+                        print >> sys.stderr, module_name, 'failed\n', \
+                            output.getvalue()
+
+    if some_test_failed:
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()

From dbbeccc466bac75556399a89251c8142e8dbb377 Mon Sep 17 00:00:00 2001
From: Lei Wang <bestwanglei@gmail.com>
Date: Wed, 16 May 2018 19:59:58 -0700
Subject: [PATCH 25/25] Scripts: add a feature to run a single test. (#10678)

---
 paddle/scripts/paddle_build.sh        | 38 ++++++++++++++++++++++-----
 paddle/scripts/paddle_docker_build.sh |  1 +
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 928b95b4f5..58a30ab3e5 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -20,19 +20,15 @@
 #=================================================
 
 function print_usage() {
-    RED='\033[0;31m'
-    BLUE='\033[0;34m'
-    BOLD='\033[1m'
-    NONE='\033[0m'
-    
     echo -e "\n${RED}Usage${NONE}:
-    ${BOLD}$0${NONE} [OPTION]"
+    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
     
     echo -e "\n${RED}Options${NONE}:
     ${BLUE}build${NONE}: run build for x86 platform
     ${BLUE}build_android${NONE}: run build for android platform
     ${BLUE}build_ios${NONE}: run build for ios platform
     ${BLUE}test${NONE}: run all unit tests
+    ${BLUE}single_test${NONE}: run a single unit test
     ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
     ${BLUE}doc${NONE}: generate paddle documents
     ${BLUE}html${NONE}: convert C++ source code into HTML
@@ -45,7 +41,15 @@ function print_usage() {
 }
 
 function init() {
+    RED='\033[0;31m'
+    BLUE='\033[0;34m'
+    BOLD='\033[1m'
+    NONE='\033[0m'
+
     PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+    if [ -z "${SCRIPT_NAME}" ]; then  # keep a name already set by the caller
+        SCRIPT_NAME=$0
+    fi
 }
 
 function cmake_gen() {
@@ -309,6 +313,25 @@ EOF
     fi
 }
 
+function single_test() {
+    TEST_NAME=$1  # passed to ctest -R, which treats it as a regex
+    if [ -z "${TEST_NAME}" ]; then
+        echo -e "${RED}Usage:${NONE}"
+        echo -e "${BOLD}${SCRIPT_NAME}${NONE} ${BLUE}single_test${NONE} [test_name]"
+        exit 1
+    fi
+    mkdir -p "${PADDLE_ROOT}/build"
+    cd "${PADDLE_ROOT}/build" || exit 1  # never run ctest from another dir
+    if [ "${WITH_TESTING:-ON}" == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running ${TEST_NAME} ...
+    ========================================
+EOF
+        ctest --output-on-failure -R "${TEST_NAME}"
+    fi
+}
+
 function bind_test() {
     # the number of process to run tests
     NUM_PROC=6
@@ -491,6 +514,9 @@ function main() {
       test)
         run_test
         ;;
+      single_test)
+        single_test $2
+        ;;
       bind_test)
         bind_test
         ;;
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index ac32bf0292..77588b8872 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -63,6 +63,7 @@ EOL
     ${DOCKER_CMD} run -it \
         --name $CONTAINER_ID \
         ${DOCKER_ENV} \
+        -e SCRIPT_NAME=$0 \
         -v $PADDLE_ROOT:/paddle \
         -v ${HOME}/.ccache:/root/.ccache \
         -w /paddle \