From b044724db710dabf5bea23195b599eab3ba46bb3 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 22 May 2018 14:35:02 +0800 Subject: [PATCH 01/93] update fluid Train API param_path to checkpoint_config --- python/paddle/fluid/trainer.py | 50 +++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 7da123dd92..01c40bb90e 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -27,11 +27,8 @@ import parallel_executor from transpiler import distribute_transpiler __all__ = [ - 'Trainer', - 'BeginEpochEvent', - 'EndEpochEvent', - 'BeginStepEvent', - 'EndStepEvent', + 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', + 'EndStepEvent', 'CheckpointConfig' ] @@ -59,6 +56,17 @@ class EndStepEvent(object): self.metrics = metrics +class CheckpointConfig(object): + def __init__(self, + checkpoint_dir=None, + max_num_checkpoints=3, + save_interval_secs=600): + if checkpoint_dir is None: + self.checkpoint_dir = os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.save_interval_secs = save_interval_secs + + def check_and_get_place(place): """ Check the type of place or get the default place @@ -97,9 +105,9 @@ class Trainer(object): def __init__(self, train_func, optimizer, - param_path=None, place=None, - parallel=False): + parallel=False, + checkpoint_config=None): self.__stop = False self.parallel = parallel # 1. 
we need to generate a framework.Program by calling @@ -108,6 +116,16 @@ class Trainer(object): if not isinstance(optimizer, opt_module.Optimizer): raise TypeError("The optimizer should be an instance of Optimizer") + # config for checkpoint + # only chief worker will save variables + self.chief = True + self.checkpoint = checkpoint_config + if self.checkpoint and not isinstance(self.checkpoint, + CheckpointConfig): + raise TypeError( + "The checkpoint_config shoule be an instance of CheckpointConfig" + ) + self.scope = core.Scope() self.startup_program = framework.Program() @@ -136,9 +154,10 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if param_path: - # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) + if self.checkpoint: + exe = executor.Executor(place) + io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -146,6 +165,7 @@ class Trainer(object): self.nccl_id_var = None else: self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + self.chief = self.trainer_id == 0 port = os.getenv("PADDLE_PSERVER_PORT") worker_ips = os.getenv("PADDLE_TRAINER_IPS") worker_endpoints = [] @@ -194,6 +214,7 @@ class Trainer(object): # the unique trainer id, starting from 0, needed by trainer # only trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self.chief = self.trainer_id == 0 # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): @@ -263,6 +284,14 @@ class Trainer(object): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) + def _save_checkpoint(self): + if self.checkpoint and self.chief: + exe = executor.Executor(self.place) + io.save_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.checkpoint.max_num_checkpoints, + self.checkpoint.save_interval_secs, + self.train_program) + 
@contextlib.contextmanager def _prog_and_scope_guard(self): with framework.program_guard( @@ -309,6 +338,7 @@ class Trainer(object): else: metrics = exe.run(feed=data, fetch_list=[]) event_handler(EndStepEvent(epoch_id, step_id, metrics)) + self._save_checkpoint() event_handler(EndEpochEvent(epoch_id)) def _test_by_executor(self, reader, feed_order, fetch_list): From dca0b6d9ccc5b770e78a0903839f2ed89d79be58 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 23 May 2018 19:50:25 +0800 Subject: [PATCH 02/93] restore param_path --- python/paddle/fluid/trainer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 01c40bb90e..24254b4980 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -105,6 +105,7 @@ class Trainer(object): def __init__(self, train_func, optimizer, + param_path=None, place=None, parallel=False, checkpoint_config=None): @@ -120,8 +121,8 @@ class Trainer(object): # only chief worker will save variables self.chief = True self.checkpoint = checkpoint_config - if self.checkpoint and not isinstance(self.checkpoint, - CheckpointConfig): + if self.checkpoint and \ + not isinstance(self.checkpoint, CheckpointConfig): raise TypeError( "The checkpoint_config shoule be an instance of CheckpointConfig" ) @@ -159,6 +160,10 @@ class Trainer(object): io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, self.startup_program) + if param_path: + # load params from param_path into scope + io.load_persistables(exe, dirname=param_path) + def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS if "PADDLE_TRAINER_IPS" not in os.environ: From 514b2427edbd30013ca1783769af18fb96ffb626 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 28 May 2018 20:08:23 +0800 Subject: [PATCH 03/93] add save/load persist_vars_without_grad --- python/paddle/fluid/io.py | 46 +++++++++++++++++++++++----------- python/paddle/fluid/trainer.py | 3 ++- 2 files changed, 33 
insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8e58e5eb79..f626039363 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -24,7 +24,8 @@ __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', - 'clean_checkpoint' + 'clean_checkpoint', 'load_persist_vars_without_grad', + 'save_persist_vars_without_grad' ] @@ -455,6 +456,33 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) +def load_persist_vars_without_grad(executor, dirname, program): + """ + load_persist_vars_without_grad will load variables from a directory by an executor, + the variable named end with "@GRAD" will not be loaded. + """ + load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def save_persist_vars_without_grad(executor, dirname, program): + """ + save_persist_vars_without_grad will save variables to a directory by an executor, + the variable named end with "@GRAD" will not be saved. 
+ """ + save_vars( + executor, + dirname=dirname, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + + SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" CHECKPOINT_SEPARATOR = "_" @@ -491,13 +519,7 @@ def save_checkpoint(executor, serial += 1 cur_dir = _get_serial_dir(serial, checkpoint_dir) - save_vars( - executor, - dirname=cur_dir, - main_program=main_program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) + load_persist_vars_without_grad(executor, cur_dir, main_program) _write_success(cur_dir) _lru_delete(checkpoint_dir, max_num_checkpoints) @@ -521,13 +543,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): return cur_dir = _get_serial_dir(serial, checkpoint_dir) - - load_vars( - executor, - dirname=cur_dir, - main_program=main_program, - predicate=_is_checkpoint_var, - filename=None) + load_persist_vars_without_grad(executor, cur_dir, main_program) def clean_checkpoint(checkpoint_dir, delete_dir=False): diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 24254b4980..b4b7b75b96 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -162,7 +162,8 @@ class Trainer(object): if param_path: # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) + io.load_persist_vars_without_grad( + exe, dirname=param_path, program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS From 5eea5db95fb6eaca2db9a0af63e871a9fc29c6bf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 14:37:59 +0800 Subject: [PATCH 04/93] optimized checkpoint and save_model --- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/io.py | 61 +++++++++++++++------------------ python/paddle/fluid/trainer.py | 40 +++++++++++++++------ 3 files changed, 58 insertions(+), 44 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 
859605d005..aece8fc149 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -26,6 +26,7 @@ from trainer import BeginEpochEvent from trainer import EndEpochEvent from trainer import BeginStepEvent from trainer import EndStepEvent +from trainer import CheckpointConfig import inferencer from inferencer import Inferencer diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f626039363..aa039bdfaa 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -491,7 +491,6 @@ CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, checkpoint_dir=None, max_num_checkpoints=3, - save_interval_secs=600, main_program=None): """ Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, @@ -511,15 +510,10 @@ def save_checkpoint(executor, if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_lastest_checkpoint_dir(checkpoint_dir) - if serial >= 0 and not _interval_secs_exceed( - _get_serial_dir(serial, checkpoint_dir), save_interval_secs): - return - - serial += 1 - cur_dir = _get_serial_dir(serial, checkpoint_dir) + serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 + cur_dir = _get_serial_dir(checkpoint_dir, serial) - load_persist_vars_without_grad(executor, cur_dir, main_program) + save_persist_vars_without_grad(executor, cur_dir, main_program) _write_success(cur_dir) _lru_delete(checkpoint_dir, max_num_checkpoints) @@ -542,7 +536,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): if serial < 0: return - cur_dir = _get_serial_dir(serial, checkpoint_dir) + cur_dir = _get_serial_dir(checkpoint_dir, serial) load_persist_vars_without_grad(executor, cur_dir, main_program) @@ -559,11 +553,6 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): os.rmdir(checkpoint_dir) -def _get_serial_dir(serial, checkpoint_dir): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - return 
os.path.join(checkpoint_dir, serial_folder) - - def _is_checkpoint_var(var): """ the checkpoint will not save or load all the variables. @@ -582,29 +571,37 @@ def _is_checkpoint_var(var): return var.persistable -def _interval_secs_exceed(dirname, save_interval_secs): - dir_time = os.path.getmtime(dirname) - if save_interval_secs > (time.time() - dir_time): - return False - return True +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + serial_num = -1 + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + return os.path.join(dirname, serial_folder) def _lru_delete(dirname, max_num_checkpoints=3): dirs = os.listdir(dirname) - serials = [] + serial_map = {} for serial in dirs: - try: - serials.append(int(serial)) - except ValueError: - continue + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial - if len(serials) <= max_num_checkpoints: + if len(serial_map.keys()) <= max_num_checkpoints: return + serials = serial_map.keys() serials.sort(reverse=True) serials = serials[max_num_checkpoints:] for serial in serials: - cur_dir = os.path.join(dirname, str(serial)) + cur_dir = _get_serial_dir(dirname, serial) shutil.rmtree(cur_dir) @@ -633,20 +630,18 @@ def _get_lastest_checkpoint_dir(checkpoint_dir): """ is _SUCCESS in this dir """ - _, serial = cur_dir.split(CHECKPOINT_SEPARATOR) - try: - int(serial) - except ValueError: + serial = _get_dir_serial(cur_dir) + if serial == -1: return -1 if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): return -1 success_path = os.path.join( - _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME) + _get_serial_dir(checkpoint_dir, serial), SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): - return int(serial) + return serial if not os.path.isdir(checkpoint_dir): return -1 diff --git 
a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4b7b75b96..3cf96ac251 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -60,11 +60,24 @@ class CheckpointConfig(object): def __init__(self, checkpoint_dir=None, max_num_checkpoints=3, - save_interval_secs=600): + epoch_interval=1, + step_interval=10): if checkpoint_dir is None: self.checkpoint_dir = os.getcwd() + else: + self.checkpoint_dir = checkpoint_dir + self.max_num_checkpoints = max_num_checkpoints - self.save_interval_secs = save_interval_secs + + if epoch_interval < 1: + self.epoch_interval = 1 + else: + self.epoch_interval = epoch_interval + + if step_interval < 1: + self.step_interval = 10 + else: + self.step_interval = step_interval def check_and_get_place(place): @@ -290,14 +303,6 @@ class Trainer(object): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) - def _save_checkpoint(self): - if self.checkpoint and self.chief: - exe = executor.Executor(self.place) - io.save_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint.max_num_checkpoints, - self.checkpoint.save_interval_secs, - self.train_program) - @contextlib.contextmanager def _prog_and_scope_guard(self): with framework.program_guard( @@ -343,8 +348,9 @@ class Trainer(object): ]) else: metrics = exe.run(feed=data, fetch_list=[]) + event_handler(EndStepEvent(epoch_id, step_id, metrics)) - self._save_checkpoint() + self._save_checkpoint(epoch_id, step_id) event_handler(EndEpochEvent(epoch_id)) def _test_by_executor(self, reader, feed_order, fetch_list): @@ -384,6 +390,18 @@ class Trainer(object): loss_name=self.train_func_outputs[0].name) return self._get_parallel_executor() + def _save_checkpoint(self, epoch_id, step_id): + if not self.checkpoint or not self.chief: + return + + if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: + exe = executor.Executor(self.place) + io.save_checkpoint( + 
executor=exe, + checkpoint_dir=self.checkpoint.checkpoint_dir, + max_num_checkpoints=self.checkpoint.max_num_checkpoints, + main_program=self.train_program) + def build_feed_var_list(program, feed_order): if not isinstance(program, framework.Program): From 5f5d6a9dc7eaf2e1c5b069454497d11a28701ddb Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 16:01:26 +0800 Subject: [PATCH 05/93] optimized checkpoint and save_model --- python/paddle/fluid/io.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index aa039bdfaa..bd3c2e3d9a 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -489,9 +489,9 @@ CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, - checkpoint_dir=None, - max_num_checkpoints=3, - main_program=None): + checkpoint_dir, + main_program=None, + max_num_checkpoints=3): """ Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy @@ -500,12 +500,11 @@ def save_checkpoint(executor, :param executor :param checkpoint_dir - :param max_num_checkpoints - :param save_interval_secs :param main_program + :param max_num_checkpoints """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("The values of 'checkpoint_dir' should not be None") if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) @@ -518,7 +517,7 @@ def save_checkpoint(executor, _lru_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, checkpoint_dir=None, main_program=None): +def load_checkpoint(executor, checkpoint_dir, main_program=None): """ Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. 
@@ -529,7 +528,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None): """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("The values of 'checkpoint_dir' should not be None") serial = _get_lastest_checkpoint_dir(checkpoint_dir) @@ -546,7 +545,7 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): delete_dir only works when the directory is empty, otherwise, OSError is raised. """ if checkpoint_dir is None: - checkpoint_dir = os.getcwd() + raise ValueError("The values of 'checkpoint_dir' should not be None") _lru_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): From ad9dfeb0180b40905d245354e733e750009cc173 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 20:28:40 +0800 Subject: [PATCH 06/93] bug fix and optimize --- python/paddle/fluid/io.py | 153 +++++++++++++++++++++++++-------- python/paddle/fluid/trainer.py | 52 +++++++++-- 2 files changed, 162 insertions(+), 43 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index bd3c2e3d9a..ed560304e2 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -456,40 +456,18 @@ def get_parameter_value_by_name(name, executor, program=None): return get_parameter_value(var, executor) -def load_persist_vars_without_grad(executor, dirname, program): - """ - load_persist_vars_without_grad will load variables from a directory by an executor, - the variable named end with "@GRAD" will not be loaded. - """ - load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var, - filename=None) - - -def save_persist_vars_without_grad(executor, dirname, program): - """ - save_persist_vars_without_grad will save variables to a directory by an executor, - the variable named end with "@GRAD" will not be saved. 
- """ - save_vars( - executor, - dirname=dirname, - main_program=program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - - SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" +MODEL_DIR = "__model__" +TRAINER_PREFIX = "trainer" CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, checkpoint_dir, + trainer_id, + is_chief=False, + trainer_args=None, main_program=None, max_num_checkpoints=3): """ @@ -502,22 +480,35 @@ def save_checkpoint(executor, :param checkpoint_dir :param main_program :param max_num_checkpoints + :param is_chief """ if checkpoint_dir is None: raise ValueError("The values of 'checkpoint_dir' should not be None") + if trainer_args and not isinstance(trainer_args, dict): + raise TypeError("The type of 'trainer_args' should be dict") + if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) - save_persist_vars_without_grad(executor, cur_dir, main_program) - _write_success(cur_dir) + if is_chief: + save_persist_vars_without_grad(executor, cur_dir, main_program) + + save_trainer_args(cur_dir, trainer_id, trainer_args) _lru_delete(checkpoint_dir, max_num_checkpoints) -def load_checkpoint(executor, checkpoint_dir, main_program=None): +def need_load_checkpoint(checkpoint_dir): + serial = _get_lastest_checkpoint_dir(checkpoint_dir) + if serial < 0: + return None + return serial + + +def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. 
@@ -528,14 +519,17 @@ def load_checkpoint(executor, checkpoint_dir, main_program=None): """ if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + raise ValueError( + "The values of 'checkpoint_dir' or 'serial' should not be None") - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + if serial is None or serial < 0: + raise ValueError("The values of 'serial' should not be None or <0 ") - if serial < 0: - return + if main_program is None: + raise ValueError("The values of 'main_program'should not be None") cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_model_dir(cur_dir) load_persist_vars_without_grad(executor, cur_dir, main_program) @@ -552,6 +546,68 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): os.rmdir(checkpoint_dir) +def load_persist_vars_without_grad(executor, dirname, program, nest=True): + """ + load_persist_vars_without_grad will load variables from a directory by an executor, + the variable named end with "@GRAD" will not be loaded. + """ + + if nest: + dirname = _get_model_dir(dirname) + + load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def save_persist_vars_without_grad(executor, dirname, program): + """ + save_persist_vars_without_grad will save variables to a directory by an executor, + the variable named end with "@GRAD" will not be saved. 
+ """ + cur_dir = _get_model_dir(dirname) + save_vars( + executor, + dirname=cur_dir, + main_program=program, + vars=None, + predicate=_is_checkpoint_var, + filename=None) + _write_success(cur_dir) + + +def save_trainer_args(dirname, trainer_id, trainer_args): + if not isinstance(trainer_args, dict): + raise TypeError("The type of 'trainer_args' should be dict") + cur_dir = _get_trainer_dir(dirname, trainer_id) + + for name, value in trainer_args.iteritems(): + args_file = os.path.join(cur_dir, name) + with open(args_file, 'w') as f: + f.write(str(value)) + _write_success(cur_dir) + + +def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + cur_dir = _get_serial_dir(checkpoint_dir, serial) + cur_dir = _get_trainer_dir(cur_dir, trainer_id) + + if not isinstance(trainer_args, list): + raise TypeError("The type of 'trainer_args' should be list") + + ret_values = [] + + for arg in trainer_args: + cur_file = os.path.join(cur_dir, arg) + with open(cur_file, 'r') as f: + contents = f.read() + ret_values.append(contents.strip()) + return ret_values + + def _is_checkpoint_var(var): """ the checkpoint will not save or load all the variables. 
@@ -583,7 +639,31 @@ def _get_dir_serial(dirname): def _get_serial_dir(dirname, serial): serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - return os.path.join(dirname, serial_folder) + serial_dir = os.path.join(dirname, serial_folder) + + if not os.path.isdir(serial_dir): + os.makedirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + + if not os.path.isdir(model_dir): + os.makedirs(model_dir) + + return model_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + + if not os.path.isdir(trainer_dir): + os.makedirs(trainer_dir) + + return trainer_dir def _lru_delete(dirname, max_num_checkpoints=3): @@ -638,7 +718,8 @@ def _get_lastest_checkpoint_dir(checkpoint_dir): return -1 success_path = os.path.join( - _get_serial_dir(checkpoint_dir, serial), SUCCESS_MARK_FILENAME) + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) if os.path.isfile(success_path): return serial diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 3cf96ac251..206d582cdc 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -79,6 +79,9 @@ class CheckpointConfig(object): else: self.step_interval = step_interval + self.epoch_id = 0 + self.step_id = 0 + def check_and_get_place(place): """ @@ -132,6 +135,7 @@ class Trainer(object): # config for checkpoint # only chief worker will save variables + self.trainer_id = 0 self.chief = True self.checkpoint = checkpoint_config if self.checkpoint and \ @@ -139,6 +143,8 @@ class Trainer(object): raise TypeError( "The checkpoint_config shoule be an instance of CheckpointConfig" ) + self.load_checkpoint_serial = io.need_load_checkpoint( + self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -168,15 +174,25 @@ class Trainer(object): exe = 
executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint: + if self.load_checkpoint_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.load_checkpoint_serial, self.startup_program) - if param_path: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint.checkpoint_dir, self.load_checkpoint_serial, + self.trainer_id, ["epoch_id", "step_id"]) + self.checkpoint.epoch_id = int(epoch_id) + self.checkpoint.step_id = int(step_id) + + if param_path and os.path.isdir(param_path): # load params from param_path into scope io.load_persist_vars_without_grad( - exe, dirname=param_path, program=self.startup_program) + exe, + dirname=param_path, + program=self.startup_program, + nest=False) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS @@ -333,11 +349,20 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - for epoch_id in range(num_epochs): + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint.epoch_id + ] + for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): if self.__stop: + self._clean_checkpoint() return + + if self.checkpoint and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: + continue + begin_event = BeginStepEvent(epoch_id, step_id) event_handler(begin_event) if begin_event.fetch_metrics: @@ -352,6 +377,7 @@ class Trainer(object): event_handler(EndStepEvent(epoch_id, step_id, metrics)) self._save_checkpoint(epoch_id, step_id) event_handler(EndEpochEvent(epoch_id)) + self._clean_checkpoint() def _test_by_executor(self, reader, feed_order, fetch_list): with executor.scope_guard(self.scope): @@ -390,17 +416,29 @@ class Trainer(object): loss_name=self.train_func_outputs[0].name) return self._get_parallel_executor() + def _clean_checkpoint(self): + if not 
self.checkpoint: + return + io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) + def _save_checkpoint(self, epoch_id, step_id): - if not self.checkpoint or not self.chief: + if not self.checkpoint: return if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, checkpoint_dir=self.checkpoint.checkpoint_dir, - max_num_checkpoints=self.checkpoint.max_num_checkpoints, - main_program=self.train_program) + trainer_id=self.trainer_id, + is_chief=self.chief, + trainer_args=trainer_args, + main_program=self.train_program, + max_num_checkpoints=self.checkpoint.max_num_checkpoints) def build_feed_var_list(program, feed_order): From 486e1e337d05679a22b389840136b9f07714646b Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 20:36:45 +0800 Subject: [PATCH 07/93] bug fix and optimize --- python/paddle/fluid/trainer.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 206d582cdc..35bb8ded5d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -79,8 +79,9 @@ class CheckpointConfig(object): else: self.step_interval = step_interval - self.epoch_id = 0 - self.step_id = 0 + self._epoch_id = 0 + self._step_id = 0 + self._load_serial = None def check_and_get_place(place): @@ -174,17 +175,17 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.load_checkpoint_serial: + if self.checkpoint._load_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.load_checkpoint_serial, + self.checkpoint._load_serial, self.startup_program) epoch_id, step_id = io.load_trainer_args( self.checkpoint.checkpoint_dir, self.load_checkpoint_serial, 
self.trainer_id, ["epoch_id", "step_id"]) - self.checkpoint.epoch_id = int(epoch_id) - self.checkpoint.step_id = int(step_id) + self.checkpoint._epoch_id = int(epoch_id) + self.checkpoint._step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -351,7 +352,7 @@ class Trainer(object): def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): epochs = [ epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint.epoch_id + if epoch_id >= self.checkpoint._epoch_id ] for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) @@ -360,7 +361,8 @@ class Trainer(object): self._clean_checkpoint() return - if self.checkpoint and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: + if self.checkpoint and self.checkpoint._load_serial \ + and self.checkpoint._step_id >= step_id and self.checkpoint._epoch_id == epoch_id: continue begin_event = BeginStepEvent(epoch_id, step_id) From 9086043090f80ee7695d043e84fbe8068b2f76e7 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 29 May 2018 20:52:01 +0800 Subject: [PATCH 08/93] bug fix and optimize --- python/paddle/fluid/io.py | 1 - python/paddle/fluid/trainer.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index ed560304e2..2925e8eb28 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -529,7 +529,6 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): raise ValueError("The values of 'main_program'should not be None") cur_dir = _get_serial_dir(checkpoint_dir, serial) - cur_dir = _get_model_dir(cur_dir) load_persist_vars_without_grad(executor, cur_dir, main_program) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 35bb8ded5d..5ca93821e2 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -144,7 +144,7 @@ class Trainer(object): raise 
TypeError( "The checkpoint_config shoule be an instance of CheckpointConfig" ) - self.load_checkpoint_serial = io.need_load_checkpoint( + self.checkpoint._load_serial = io.need_load_checkpoint( self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -182,7 +182,7 @@ class Trainer(object): self.startup_program) epoch_id, step_id = io.load_trainer_args( - self.checkpoint.checkpoint_dir, self.load_checkpoint_serial, + self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, self.trainer_id, ["epoch_id", "step_id"]) self.checkpoint._epoch_id = int(epoch_id) self.checkpoint._step_id = int(step_id) From 0211c5df0a12de2647b339dc0a8c36d35209a1a3 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 11:45:25 +0800 Subject: [PATCH 09/93] bug fix --- python/paddle/fluid/trainer.py | 17 +++++++++-------- tools/codestyle/docstring_checker.pyc | Bin 0 -> 12561 bytes 2 files changed, 9 insertions(+), 8 deletions(-) create mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 5ca93821e2..34db9b39b7 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -139,13 +139,14 @@ class Trainer(object): self.trainer_id = 0 self.chief = True self.checkpoint = checkpoint_config - if self.checkpoint and \ - not isinstance(self.checkpoint, CheckpointConfig): - raise TypeError( - "The checkpoint_config shoule be an instance of CheckpointConfig" - ) - self.checkpoint._load_serial = io.need_load_checkpoint( - self.checkpoint.checkpoint_dir) + if self.checkpoint: + if not isinstance(self.checkpoint, CheckpointConfig): + raise TypeError( + "The checkpoint_config shoule be an instance of CheckpointConfig" + ) + else: + self.checkpoint._load_serial = io.need_load_checkpoint( + self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -175,7 +176,7 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint._load_serial: + if 
self.checkpoint and self.checkpoint._load_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f0255b763c8d154b7cc17d2d525148dfea3b42d GIT binary patch literal 12561 zcmdT~Npl;=7488*kbt;or54N9SSu6>B(+$!8Cn)4%CZuRQVr#(M2iK3m?1gT00T}B zEYg-Lm69vVF}LI&q$*W8rE*K<5Z`=Een4`}DTh=}i7WZO*Mk8_(T-GhiZsb-cE9fF z<@MX&WdAWx{QTCy2Q`)aWbywB9{o!cu2QooE#+oZV5wPDOg*D!Gg8m0Agk&*HJelQ z9yQyeERM*jAg}7ZYPQ$3dsNV;>IF4hFm23MRI^3p=GCWIVux~jmDjHp@M<3+PnEhq zJD}Wx@^S(#rqDs|Wu~-)Tbw{K71ned`HjVEOJ41v7umkHTiSE&IJ9d7+0C_hDQwto zShLN@QyP`Ez#H-ZgP-YA2Ghix@M=|i$uw~f+O60Rbjf1(5Q>|Vct7;rWQghDd#<-o zZ3VIG*WwyS^C2Fu;nDk1JXUHMlUS@gwfILg&B)4nHwHM~qOW7@UbgH-dL-G_PIN{| zXC>*33EX&Yvzt760>6QoN~kF*^a^UD?`zCD8dn>Ok34_$uK9oN{FV8t@3uy2VSRMb zk4G27Q61G-U24q>^iI`v1J9JB$-g)Z13g*`T~F{h3e+0Rws$xJocCeX_RbHXu8C%q zi3=#x&DRLy4?C~o(f3h!pb3xyN+B|mNS4>mN=~_`=hUYl#~$VO2r%EKq+U`IhxH|t z1d0O{5mHpnOsU8v#!;j@uxmk8YkMgSTy4iop53fQBp;k9As5>Vyt>zj^+?giPkO`L zvfS{?sKj!;xLDk9B_kJ}B^g_;!ab89rb{`LPL_jN3vl6}2vCpES}pLZk-0?nYA;yW z7FSrQ_zgdM)5vr;vfYiGn(*bFirn^0iA*;w z*qMwLwUE&D)oQ&NfY5r;=Xh}|YUr3`?o@s4X<;FU*D9}7@tPpf=?7@55$I4joWi5I zL1hgjzk)}hCE|~VPHBmF1p1z2Nl}k86ctZ$YPF;ugS2wW2Iyf%MH5N;$P<8tH(tjp zEBn+vv}Q`2b{eQv$|r>xvLZ0>8iHg&kD$>ou2lVo3o;jK47`uLKv=cK+t>cb-c}=1 zcC+Zk71`Se4x@R;%F0{))*h>94Q6UYBk6>6Dr9*c$2Y?FDMkW^%aFcGJ%L;qAd**m zEMRnM?h9fW_+19Z(cPosta{u->_gf-v{ZCUtsGMgGntI!#pF3=8B!?6k3+Jfpw@%uNM1ZL)!~cSf_XtX-F!bY1NkO zsTrtUs?oQ01Dd8wAR9T%x01Ei zPjxqHdA7eGFJPiXp6gbv=4rjq3WBxH1VZ;A){3+k1Hv_KCHP0N)UrNL|1KI%hkQcAqYsJ#c+ow1^x`)DN2ZFw|eT8<`MRC)1}k;k^> z$;f9snU@$Y^Bc}a79?d~Bb?Um>g_I?oSM6$pwq#IP(?6#p|YMrouC4(KgeBo6eSTv*7Q;$I8;7hp|KN3eJ8(`Do^qhi0$Z57&JGv{_J>+@ipC00yTXumQjZ z1vUiOPJ!(LY`4Jn0Jc|P`v4mj*nYqc2<#wWhXnQ#V21_vGGMO=>`TCW6OezQ^9G42B2PdR5q}-EI*{R%9$}NdMxl6gH1-M(e-;v55<(`qsUge&Z 
z%0A_mr7}!^a+Jusj$$g*#(GJD12kcpYjnliES4hA39*MPP7f! z?S^O52H^(V{x5kYhGC_m2-Fnkp)4+6Y@!gMgQ!_4#$E z;33e(>CJti2CK9oc5T;Q(;Z#VXE4DnUu(p&h{1$B!lM&+a{ zVTzmHnT6>DxHpmUTr9Y@C{>r9a zmXVs)A#Qz>!Rbnu^l=QjAGWyJR^ZwQBiSD&R`iEN1VT*TlbkCA5{JP-rnT5s7_^|E7biXNHq`AvTMhT8KTmH-ENLPPDT+l4`&*vH* z{Ua2{=A{I%(g2XluNNLJ-cuaGGf0)j2 z<^;ViJT|;VJZG#1aNFDV(?qB_7qOslQ<5WScqQ)8p}FEPanX@qGGZDRAVDG#Pf)Uz zI-*=vIG{B2MQ(04)L+HMev2mwic=UJw~kmoNv%dbMc<2$+`!LJF*g9019*=1q^O5> z1WvQOgQut?Og_sP?$n$k5nNzef}p^Y3B+O$Zsq{Pw)o4aVO~;JMWbJGNxgDOP$n}g zqv{?nyd?hb-?61`+HPym)M!J3(exYR{u|Vt40b4eNdb8W%@a z&TA|v>YaB`OqUq-G^+FlYob}1>N5mMTm@9fP3K(#g?ewI)`>@&$@a0=X7MVEV<<$N z)LIdgF>TNiKwGI=9J+YF$CIegAyJDZH(s zknceuUs!tpC2|50{80nVsh_}Bf%M3zALET3Y6?R*Dlb`|_YLVoR3qmA*Sn7kFmC{f z!X!h+qBs#kr7qNeM9~(|Z6Y;+@x-<(As=K<*cZo6(_e+7hSB1qb*!})u=m6eY#V)@ z8D5C`HZ}Kn{WY2$L@<$i0gq;E2Wo3>F9T-4rV-M>)(@-|k;?77E8i4hxd3VsX$TZ4 zH8Lju8i5HT+5mAOwKZm&xVICFUc*~$MwkD}eHe1m+Xlsz;T*IFJgeT#@0^rB}dT=mSrI5kXLTSu_MV4s<07oQXa#9dmbJ zI;6~m>8P~yA5e4*p)R&-^Ol51+Hnl~)W|8j!z<6pZ3tBuHw|3-|6n-b7#1TA!y4UG zMsLap>-E&+h~R$-z2bf0d?ruzF2{F7|7OI65nIG`&J>pZ8<%HVPxVr+zl7%iWIAYy z97QvVGMyBte~2O$vtxBk2*851ae}4??jNQHh>K*EVYX^zC-cQH!(gP`L4%ojeT{$_ zgwg5L!@fuf5Yhclgvby}-C!#g43WVBC=_7I#rduFY^w7rCBf z_qT;yC$QOJsFD9ouGE}-M9S{QD~m?TA~gy%3snp=7@2~(%bUpBe#E2s=?-jS_;V1X zQ)*>czSV&=hNP7)XmJbBdyof^!)QUuGVt1vug=9Zx6NdU2JZQd`jt} z`tsH=KByZ)rhG4LY=nxBTLrO*_}vYC-oA`ch5WYJ8DYVcjWfoA7M}SUsp9j?O-b?~ zqfRgX&h43-cW>UFo_w!z>-x;h|9;Nfr!$I#*fHeHR}>3_gcy)S+;4c=C=`m)aJ2 zIJ7fEJo#ZM!5`O6_cMHX{Us#ev|_;P?ADOdgWvC1*8PExmkWfRSSb4`5I8Z xXLe5WP2PmumofcJdWFZ@76=ydnJ|fg%Y)ce@_|IZ)sK7#^C11%LeKD>{{o7gqwN3y literal 0 HcmV?d00001 From 0deb6f90baa5dab02b5ff1cbc98dcaf7fae9b80b Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 14:20:51 +0800 Subject: [PATCH 10/93] annotation optimized and code style optimized --- python/paddle/fluid/io.py | 22 +++++++++++++++++++++- python/paddle/fluid/trainer.py | 12 ++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git 
a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 2925e8eb28..d52c9a8823 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -478,9 +478,10 @@ def save_checkpoint(executor, :param executor :param checkpoint_dir + :param trainer_id + :param is_chief :param main_program :param max_num_checkpoints - :param is_chief """ if checkpoint_dir is None: raise ValueError("The values of 'checkpoint_dir' should not be None") @@ -502,6 +503,11 @@ def save_checkpoint(executor, def need_load_checkpoint(checkpoint_dir): + """ + If the directory have checkpoint files, it will return lastest checkpoint directory serial number + + :param checkpoint_dir + """ serial = _get_lastest_checkpoint_dir(checkpoint_dir) if serial < 0: return None @@ -515,6 +521,7 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): :param executor :param checkpoint_dir + :param serial :param main_program """ @@ -536,7 +543,11 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): """ clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. delete_dir only works when the directory is empty, otherwise, OSError is raised. + + :param checkpoint_dir + :param delete_dir """ + if checkpoint_dir is None: raise ValueError("The values of 'checkpoint_dir' should not be None") _lru_delete(checkpoint_dir, max_num_checkpoints=0) @@ -549,6 +560,11 @@ def load_persist_vars_without_grad(executor, dirname, program, nest=True): """ load_persist_vars_without_grad will load variables from a directory by an executor, the variable named end with "@GRAD" will not be loaded. + + :param executor + :param dirname + :param program + :param nest """ if nest: @@ -566,6 +582,10 @@ def save_persist_vars_without_grad(executor, dirname, program): """ save_persist_vars_without_grad will save variables to a directory by an executor, the variable named end with "@GRAD" will not be saved. 
+ + :param executor + :param dirname + :param program """ cur_dir = _get_model_dir(dirname) save_vars( diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 34db9b39b7..6d8d4a3e43 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -79,8 +79,8 @@ class CheckpointConfig(object): else: self.step_interval = step_interval - self._epoch_id = 0 - self._step_id = 0 + self.epoch_id = 0 + self.step_id = 0 self._load_serial = None @@ -185,8 +185,8 @@ class Trainer(object): epoch_id, step_id = io.load_trainer_args( self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, self.trainer_id, ["epoch_id", "step_id"]) - self.checkpoint._epoch_id = int(epoch_id) - self.checkpoint._step_id = int(step_id) + self.checkpoint.epoch_id = int(epoch_id) + self.checkpoint.step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -353,7 +353,7 @@ class Trainer(object): def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): epochs = [ epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint._epoch_id + if epoch_id >= self.checkpoint.epoch_id ] for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) @@ -363,7 +363,7 @@ class Trainer(object): return if self.checkpoint and self.checkpoint._load_serial \ - and self.checkpoint._step_id >= step_id and self.checkpoint._epoch_id == epoch_id: + and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: continue begin_event = BeginStepEvent(epoch_id, step_id) From d712af25dcee298a1bd1fda1bba6a1f0ed001ab0 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 16:29:05 +0800 Subject: [PATCH 11/93] add distribute config --- python/paddle/fluid/trainer.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 6d8d4a3e43..e98672f318 100644 --- 
a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -81,7 +81,8 @@ class CheckpointConfig(object): self.epoch_id = 0 self.step_id = 0 - self._load_serial = None + self.load_serial = None + self.is_pserver = False def check_and_get_place(place): @@ -145,7 +146,7 @@ class Trainer(object): "The checkpoint_config shoule be an instance of CheckpointConfig" ) else: - self.checkpoint._load_serial = io.need_load_checkpoint( + self.checkpoint.load_serial = io.need_load_checkpoint( self.checkpoint.checkpoint_dir) self.scope = core.Scope() @@ -176,17 +177,18 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint and self.checkpoint._load_serial: + if self.checkpoint and self.checkpoint.load_serial: exe = executor.Executor(place) io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint._load_serial, + self.checkpoint.load_serial, self.startup_program) - epoch_id, step_id = io.load_trainer_args( - self.checkpoint.checkpoint_dir, self.checkpoint._load_serial, - self.trainer_id, ["epoch_id", "step_id"]) - self.checkpoint.epoch_id = int(epoch_id) - self.checkpoint.step_id = int(step_id) + if not self.checkpoint.is_pserver: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint.checkpoint_dir, self.checkpoint.load_serial, + self.trainer_id, ["epoch_id", "step_id"]) + self.checkpoint.epoch_id = int(epoch_id) + self.checkpoint.step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -259,6 +261,9 @@ class Trainer(object): t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": + if self.checkpoint: + self.is_pserver = True + self.train_program = t.get_pserver_program(current_endpoint) self.startup_program = t.get_startup_program(current_endpoint, self.train_program) @@ -362,7 +367,7 @@ class Trainer(object): self._clean_checkpoint() return - if self.checkpoint and 
self.checkpoint._load_serial \ + if self.checkpoint and self.checkpoint.load_serial \ and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: continue From b44ede803387c0e292322ba140468599a9136352 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 19:26:12 +0800 Subject: [PATCH 12/93] bug fix --- python/paddle/fluid/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index e98672f318..b4f719855f 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -252,14 +252,14 @@ class Trainer(object): current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port # the unique trainer id, starting from 0, needed by trainer # only - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) self.chief = self.trainer_id == 0 # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): t = distribute_transpiler.DistributeTranspiler() t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) + self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": if self.checkpoint: self.is_pserver = True From 94eaf94cf57ec2cc951d046e847b69c348b8f9c9 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 21:51:16 +0800 Subject: [PATCH 13/93] bug fix about lru and save --- python/paddle/fluid/io.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index d52c9a8823..8e10b01a4a 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -495,11 +495,11 @@ def save_checkpoint(executor, serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) + save_trainer_args(cur_dir, trainer_id, trainer_args) + if 
is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) - - save_trainer_args(cur_dir, trainer_id, trainer_args) - _lru_delete(checkpoint_dir, max_num_checkpoints) + _lru_delete(checkpoint_dir, max_num_checkpoints) def need_load_checkpoint(checkpoint_dir): @@ -639,7 +639,13 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.RAW: return False - if var.name.endswith("@GRAD"): + if "@GRAD" in var.name: + return False + + if ".trainer_" in var.name: + return False + + if ".block" in var.name: return False return var.persistable From e44c278e60603c37640a0a352f4bbb7f8363bebc Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 21:55:12 +0800 Subject: [PATCH 14/93] bug fix about clean --- python/paddle/fluid/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4f719855f..69577a98fb 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -425,7 +425,7 @@ class Trainer(object): return self._get_parallel_executor() def _clean_checkpoint(self): - if not self.checkpoint: + if not self.checkpoint and not self.chief: return io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) From bca4da422582990b4308932d2c20274cdb6c5a60 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 30 May 2018 21:56:54 +0800 Subject: [PATCH 15/93] cancle only chief delete files --- python/paddle/fluid/io.py | 3 ++- python/paddle/fluid/trainer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8e10b01a4a..62e3046db6 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -499,7 +499,8 @@ def save_checkpoint(executor, if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) - _lru_delete(checkpoint_dir, max_num_checkpoints) + + _lru_delete(checkpoint_dir, max_num_checkpoints) def 
need_load_checkpoint(checkpoint_dir): diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 69577a98fb..b4f719855f 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -425,7 +425,7 @@ class Trainer(object): return self._get_parallel_executor() def _clean_checkpoint(self): - if not self.checkpoint and not self.chief: + if not self.checkpoint: return io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) From 46f2688f3051b0bbeb070d05159922e8b689720e Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 31 May 2018 09:53:41 +0800 Subject: [PATCH 16/93] bug fix --- python/paddle/fluid/trainer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index b4f719855f..3354d77ace 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -356,10 +356,14 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - epochs = [ - epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint.epoch_id - ] + if self.checkpoint: + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint.epoch_id + ] + else: + epochs = [epoch_id for epoch_id in range(num_epochs)] + for epoch_id in epochs: event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): From f9d93bfde1bd69d84a10cb676f0aba52b1596edd Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 31 May 2018 16:42:00 +0800 Subject: [PATCH 17/93] Add document to random crop operator --- paddle/fluid/operators/random_crop_op.cc | 6 +-- .../fluid/layers/layer_function_generator.py | 53 ++++++++++++++++--- python/paddle/fluid/layers/nn.py | 29 +++++++++- 3 files changed, 75 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/random_crop_op.cc 
b/paddle/fluid/operators/random_crop_op.cc index b14b559e31..371cdb5b85 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -36,11 +36,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Seed", "The random seed."); AddOutput("Out", "The cropped instance batch."); AddOutput("SeedOut", "The random seed after random cropping.") - .AsDispensable(); + .AsIntermediate(); AddAttr>("shape", "The shape of a cropped instance."); AddComment(R"DOC( - This operator takes a batch of instance, and do random cropping on each instance. - It means that cropping positions differs on each instance, which is determined + This operator takes a batch of instance, and do random cropping on each instance. + It means that cropping positions differs on each instance, which is determined by an uniform random generator. All cropped instances have the same shape, which is determined by the operator's attribute 'shape'. )DOC"); diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 295d1b7190..6026237d0b 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -15,16 +15,13 @@ import re import cStringIO import functools import warnings +import string from ..proto import framework_pb2 from ..framework import OpProtoHolder, Variable from ..layer_helper import LayerHelper -__all__ = [ - 'deprecated', - 'generate_layer_fn', - 'autodoc', -] +__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc'] def _convert_(name): @@ -43,6 +40,10 @@ def _convert_(name): return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() +def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + def _generate_doc_string_(op_proto): """ Generate docstring by OpProto @@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto): str: the document string """ - def 
_type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - if not isinstance(op_proto, framework_pb2.OpProto): raise TypeError("OpProto should be `framework_pb2.OpProto`") @@ -220,3 +218,42 @@ def autodoc(comment=""): return func return __impl__ + + +def templatedoc(): + """ + Decorator of layer function. It will use the docstring from the layer + function as the template. The template arguments are: + + * ${comment}: The operator comment written in CPP. + * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, + and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. + * ${{name}_type}: The type of ${name}. + + + Returns: + Decorated funciton. + """ + + def __impl__(func): + op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + tmpl = string.Template(func.__doc__) + args = {"comment": " ".join(op_proto.comment.split())} + for each_input in op_proto.inputs: + input_name = _convert_(each_input.name) + args["{0}_comment".format(input_name)] = each_input.comment + args["{0}_type".format(input_name)] = "Variable" + for each_attr in op_proto.attrs: + input_name = _convert_(each_attr.name) + args["{0}_comment".format(input_name)] = each_attr.comment + args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) + + for each_opt in op_proto.outputs: + output_name = _convert_(each_opt.name) + args["{0}_comment".format(output_name)] = each_opt.comment + args["{0}_type".format(output_name)] = "Variable" + + func.__doc__ = tmpl.substitute(args) + return func + + return __impl__ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 63ec831514..acebeaebbb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -19,9 +19,10 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr -from layer_function_generator import autodoc +from layer_function_generator import autodoc, 
templatedoc from tensor import concat import utils +import random __all__ = [ 'fc', @@ -3992,10 +3993,34 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): return out -def random_crop(input, shape, seed=1): +@templatedoc() +def random_crop(x, shape, seed=None): + """ + **Random crop operator** + + ${comment} + + Examples: + >>> img = fluid.layers.data("img", [3, 256, 256]) + >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) + + Args: + x(${x_type}): ${x_comment} + shape(${shape_type}): ${shape_comment} + seed(int|${seed_type}|None): ${seed_comment} By default, the seed will + get from `random.randint(-65536, 65535)`. + + Returns: + ${out_comment} + + """ + helper = LayerHelper("random_crop", **locals()) dtype = helper.input_dtype() out = helper.create_tmp_variable(dtype) + if seed is None: + seed = random.randint(-65536, 65535) + if isinstance(seed, int): seed_value = seed seed = helper.create_tmp_variable(dtype="int64") From 7c00e164e5886bb430ff945f2de091a2d45ff811 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 31 May 2018 17:07:42 +0800 Subject: [PATCH 18/93] Add More comments --- .../fluid/layers/layer_function_generator.py | 13 ++++++--- python/paddle/fluid/layers/nn.py | 27 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 6026237d0b..fb5e454e94 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -230,15 +230,22 @@ def templatedoc(): and AddInput. The ${name} is Python snake style. i.e., xxx_xxx. * ${{name}_type}: The type of ${name}. - Returns: - Decorated funciton. + Decorated function. 
""" def __impl__(func): op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) tmpl = string.Template(func.__doc__) - args = {"comment": " ".join(op_proto.comment.split())} + + comment_lines = op_proto.comment.split("\n") + comment = "" + for line in comment_lines: + line = line.lstrip() + comment += line + comment += "\n" + + args = {"comment": comment} for each_input in op_proto.inputs: input_name = _convert_(each_input.name) args["{0}_comment".format(input_name)] = each_input.comment diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index acebeaebbb..970a186fac 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -799,7 +799,22 @@ def gru_unit(input, return updated_hidden, reset_hidden_pre, gate +@templatedoc() def linear_chain_crf(input, label, param_attr=None): + """ + Linear Chain CRF. + + ${comment} + + Args: + input(${emission_type}): ${emission_comment} + label(${label_type}): ${label_comment} + param_attr(ParamAttr): The attribute of the learnable parameter. + + Returns: + ${log_likelihood_comment} + + """ helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[1] transition = helper.create_parameter( @@ -825,7 +840,19 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood +@templatedoc() def crf_decoding(input, param_attr, label=None): + """ + ${comment} + + Args: + input(${emission_type}): ${emission_comment} + param_attr(ParamAttr): The parameter attribute for training. 
+ label(${label_type}): ${label_comment} + + Returns: + ${viterbi_path_comment} + """ helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) From 018d411075d40070b1efbae0f86185a819d35586 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 31 May 2018 17:09:26 +0800 Subject: [PATCH 19/93] Remove unnecessary func name --- python/paddle/fluid/layers/nn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 970a186fac..5e139a2653 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4023,8 +4023,6 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): @templatedoc() def random_crop(x, shape, seed=None): """ - **Random crop operator** - ${comment} Examples: From 7973d9b4b5b3ef032c13410401b8c368220cd21d Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Fri, 1 Jun 2018 10:09:31 +0800 Subject: [PATCH 20/93] bug fix --- python/paddle/fluid/trainer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 3354d77ace..72168886fd 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -178,10 +178,11 @@ class Trainer(object): exe.run(self.startup_program) if self.checkpoint and self.checkpoint.load_serial: - exe = executor.Executor(place) - io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint.load_serial, - self.startup_program) + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, + self.checkpoint.load_serial, + self.startup_program) if not self.checkpoint.is_pserver: epoch_id, step_id = io.load_trainer_args( From 47630a4a9e8b54dc27184586d22559b427525785 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 1 Jun 2018 17:28:24 +0800 
Subject: [PATCH 21/93] fluid benchmark support recordio reader --- benchmark/fluid/Dockerfile | 2 +- benchmark/fluid/README.md | 9 ++ benchmark/fluid/fluid_benchmark.py | 55 +++++--- benchmark/fluid/models/machine_translation.py | 2 + benchmark/fluid/models/mnist.py | 21 ++- benchmark/fluid/models/resnet.py | 20 ++- .../fluid/models/stacked_dynamic_lstm.py | 3 + benchmark/fluid/models/vgg.py | 21 ++- benchmark/fluid/recordio_converter.py | 133 ++++++++++++++++++ python/paddle/fluid/layers/io.py | 2 +- 10 files changed, 241 insertions(+), 27 deletions(-) create mode 100644 benchmark/fluid/recordio_converter.py diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index 46140a9d1b..5d98a9b3c4 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -19,4 +19,4 @@ ADD *.whl / RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py dataset.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index 1b0c7dce8b..c17cab999b 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -42,6 +42,15 @@ Currently supported `--model` argument include: PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 ``` +## Prepare the RecordIO file to Achieve Better Performance + +Run the following command will generate RecordIO files like "mnist.recordio" under the path +and batch_size you choose: + +```bash +python -c 'from recordio_converter import *; prepare_mnist("data", 32)' +``` + ## Run Distributed Benchmark on Kubernetes Cluster You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 
c1d458970a..9dce7d5647 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -44,7 +44,6 @@ def parse_args(): type=float, default=0.001, help='The minibatch size.') - # TODO(wuyi): add "--use_fake_data" option back. parser.add_argument( '--skip_batch_num', type=int, @@ -106,6 +105,16 @@ def parse_args(): default='local', choices=['local', 'pserver', 'nccl2'], help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') args = parser.parse_args() return args @@ -208,11 +217,13 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) exe.run(startup_prog) - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) + + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): @@ -223,9 +234,12 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, num_samples = 0 if iters == args.iterations: break - loss = exe.run(train_prog, - feed=feeder.feed(data), - fetch_list=[avg_loss]) + if args.use_reader_op: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + else: + loss = exe.run(train_prog, + feed=feeder.feed(data), + fetch_list=[avg_loss]) iters += 1 num_samples += len(data) train_losses.append(loss) @@ -251,10 +265,14 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, 
batch_acc, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + if not args.use_reader_op: + feed_var_list = [ + var for var in train_prog.global_block().vars.itervalues() + if var.is_data + ] + feeder = fluid.DataFeeder(feed_var_list, place) + # generate fake: if args.use_fake_data: for var in feed_var_list: @@ -271,7 +289,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, "value": 1.0, "dtype": var.dtype}) - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) @@ -288,7 +305,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_trainers=num_trainers, trainer_id=trainer_id) - feeder = fluid.DataFeeder(feed_var_list, place) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 @@ -304,7 +320,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 if iters == args.iterations: break - if args.use_fake_data: + # NOTE: if use reader ops, the input data is not splited to multiple cards + if args.use_reader_op and iters >= args.iterations / args.gpus: + break + if args.use_fake_data or args.use_reader_op: loss, = exe.run([avg_loss.name]) else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) @@ -316,6 +335,8 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) train_elapsed = time.time() - start_time + if args.use_reader_op: + num_samples = num_samples * args.gpus examples_per_sec = num_samples / train_elapsed print('\nTotal examples: %d, total 
time: %.5f, %.5f examples/sed\n' % (num_samples, train_elapsed, examples_per_sec)) @@ -342,7 +363,7 @@ def main(): # the unique trainer id, starting from 0, needed by trainer # only nccl_id_var, num_trainers, trainer_id = ( - None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1"))) + None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0"))) if args.use_cprof: pr = cProfile.Profile() diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 635b3373dd..3024882725 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -197,6 +197,8 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): + if args.use_reader_op: + raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 decoder_size = 512 diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d264bfc12b..5d3da68daf 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -20,6 +20,7 @@ import numpy as np import argparse import time import cProfile +import os import paddle import paddle.fluid as fluid @@ -65,9 +66,23 @@ def cnn_model(data): def get_model(args): - # Input data - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1, 1, 28, 28], (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') 
# Train program predict = cnn_model(images) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9dec8911ed..35daad6664 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -19,6 +19,7 @@ from __future__ import print_function import functools import numpy as np import time +import os import cProfile, pstats, StringIO @@ -129,9 +130,24 @@ def get_model(args): else: dshape = [224, 224, 3] model = resnet_imagenet + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + dshape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + input, label = fluid.layers.read_file(data_file) + else: + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') - input = fluid.layers.data(name='data', shape=dshape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') predict = model(input, class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 81a28b5f3a..e2a8cf45ac 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size): def get_model(args): + if args.use_reader_op: + raise Exception( + "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 emb_dim = 512 crop_size = 1500 diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 53856c5f7a..b84e118a88 100644 --- a/benchmark/fluid/models/vgg.py +++ 
b/benchmark/fluid/models/vgg.py @@ -22,6 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import argparse import functools +import os def vgg16_bn_drop(input): @@ -65,9 +66,23 @@ def get_model(args): else: data_shape = [224, 224, 3] - # Input data - images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') + if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) for f in os.listdir(args.data_path) + ] + data_file = fluid.layers.open_files( + filenames=filelist, + shapes=[[-1] + data_shape, (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"], + thread_num=args.gpus) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) + else: + images = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program net = vgg16_bn_drop(images) diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py new file mode 100644 index 0000000000..c69062c4a1 --- /dev/null +++ b/benchmark/fluid/recordio_converter.py @@ -0,0 +1,133 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.dataset import mnist, cifar, flowers, image + + +def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data, + shape_label): + num_batches = 0 + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(py_reader(), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=shape_data), + fluid.layers.data( + name='label', shape=shape_label, dtype='int64'), + ], + place=fluid.CPUPlace()) + num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + outfilepath, reader, feeder) + return num_batches + + +def prepare_mnist(outpath, batch_size): + outfilepath = os.path.join(outpath, "mnist.recordio") + convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1]) + + +def prepare_cifar10(outpath, batch_size): + outfilepath = os.path.join(outpath, "cifar.recordio") + convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1]) + + +def prepare_flowers(outpath, batch_size): + outfilepath = os.path.join(outpath, "flowers.recordio") + convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224], + [1]) + + +def imagenet_train(data_dir): + contents = os.listdir(data_dir) + if set(contents) != set( + ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): + raise Exception("Imagenet data contents error!") + img2label = dict() + imgfilelist = [] + with open(os.path.join(data_dir, "train.txt")) as fn: + while 1: + l = fn.readline() + if not l: + break + img, lbl = l[:-1].split(" ") + img2label[img] = int(lbl) + imgfilelist.append(img) + + def train_reader(): + for idx, imgfile in enumerate(imgfilelist): + data = image.load_image( + os.path.join(data_dir, "train", imgfile.lower())) + label = [img2label[imgfile], ] + yield [data, label] + + def default_mapper(sample): + img, label = sample + img = 
image.simple_transform( + img, 256, 224, True, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + return paddle.reader.map_readers(default_mapper, train_reader) + + +# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with fluid.recordio_writer.create_recordio_writer( + filename, compressor, max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + print("written file: ", filename) + return counter + + +def prepare_imagenet(inpath, outpath, batch_size): + r = paddle.batch(imagenet_train(inpath), batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name="image", shape=[3, 224, 224]), fluid.layers.data( + name="label", shape=[1], dtype='int64') + ], + place=fluid.CPUPlace()) + outpath = os.path.join(outpath, "imagenet.recordio") + convert_reader_to_recordio_files(outpath, 10000, r, feeder) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8758ac9f94..a56f3ea9db 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -434,7 +434,7 @@ def open_files(filenames, shapes, lod_levels, dtypes, - thread_num, + thread_num=1, buffer_size=None, pass_num=1, for_parallel=True): From c06f43bbb6aec4ae12d514ca92a77aed0d473882 
Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 4 Jun 2018 15:20:06 +0800 Subject: [PATCH 22/93] add annotation about _is_checkpoint_var --- python/paddle/fluid/io.py | 5 +++-- tools/codestyle/docstring_checker.pyc | Bin 12561 -> 12561 bytes 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 62e3046db6..75146fe326 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -639,13 +639,14 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: return False - + # @GRAD are named for gradient varibales, checkpoint will not save it. if "@GRAD" in var.name: return False - + # .trainer_ are named for distribute trian variables, checkpoint will not save it. if ".trainer_" in var.name: return False + # .block is named for distribute trian variables, checkpoint will not save it. if ".block" in var.name: return False diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc index 1f0255b763c8d154b7cc17d2d525148dfea3b42d..a27d3c9a8cccab8552d510578debb2df04eb53bb 100644 GIT binary patch delta 16 XcmbQ3G%<;t`7|BNbF9rm> delta 16 XcmbQ3G%<;t`7|BNbF*pSi From 08e5f0ae482c1e70dc74c4677e5cb699b38c433e Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 4 Jun 2018 16:10:11 +0800 Subject: [PATCH 23/93] rename need_load_checkpoint to get_latest_checkpoint_serial --- python/paddle/fluid/io.py | 4 ++-- python/paddle/fluid/trainer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 75146fe326..111907b575 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -25,7 +25,7 @@ __all__ = [ 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'clean_checkpoint', 'load_persist_vars_without_grad', - 
'save_persist_vars_without_grad' + 'save_persist_vars_without_grad', 'get_latest_checkpoint_serial' ] @@ -503,7 +503,7 @@ def save_checkpoint(executor, _lru_delete(checkpoint_dir, max_num_checkpoints) -def need_load_checkpoint(checkpoint_dir): +def get_latest_checkpoint_serial(checkpoint_dir): """ If the directory have checkpoint files, it will return lastest checkpoint directory serial number diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 72168886fd..3c32ec1de8 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -146,7 +146,7 @@ class Trainer(object): "The checkpoint_config shoule be an instance of CheckpointConfig" ) else: - self.checkpoint.load_serial = io.need_load_checkpoint( + self.checkpoint.load_serial = io.get_latest_checkpoint_serial( self.checkpoint.checkpoint_dir) self.scope = core.Scope() From bfdcf18707c79f2cc29b0903cb9f4fab2e907490 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 4 Jun 2018 21:10:38 +0800 Subject: [PATCH 24/93] grammar optimized. 
--- python/paddle/fluid/io.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 111907b575..b5d96441bc 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -492,7 +492,7 @@ def save_checkpoint(executor, if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1 + serial = _get_latest_checkpoint_dir(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) save_trainer_args(cur_dir, trainer_id, trainer_args) @@ -505,11 +505,11 @@ def save_checkpoint(executor, def get_latest_checkpoint_serial(checkpoint_dir): """ - If the directory have checkpoint files, it will return lastest checkpoint directory serial number + If the directory have checkpoint files, it will return latest checkpoint directory serial number :param checkpoint_dir """ - serial = _get_lastest_checkpoint_dir(checkpoint_dir) + serial = _get_latest_checkpoint_dir(checkpoint_dir) if serial < 0: return None return serial @@ -639,14 +639,14 @@ def _is_checkpoint_var(var): var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ var.desc.type() == core.VarDesc.VarType.RAW: return False - # @GRAD are named for gradient varibales, checkpoint will not save it. + # @GRAD are named for gradient variables, checkpoint will not save it. if "@GRAD" in var.name: return False - # .trainer_ are named for distribute trian variables, checkpoint will not save it. + # .trainer_ are named for distribute train variables, checkpoint will not save it. if ".trainer_" in var.name: return False - # .block is named for distribute trian variables, checkpoint will not save it. + # .block is named for distribute train variables, checkpoint will not save it. 
if ".block" in var.name: return False @@ -656,7 +656,6 @@ def _is_checkpoint_var(var): def _get_dir_serial(dirname): _, serial = dirname.split(CHECKPOINT_SEPARATOR) - serial_num = -1 try: serial_num = int(serial) except ValueError: @@ -723,7 +722,7 @@ def _write_success(dirname): f.write(now) -def _get_lastest_checkpoint_dir(checkpoint_dir): +def _get_latest_checkpoint_dir(checkpoint_dir): """ get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory From 9735f25011b04116d271861fde8df05def81c3ce Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 14:47:13 +0800 Subject: [PATCH 25/93] optimized --- python/paddle/fluid/io.py | 44 +++++++++++++--------------------- python/paddle/fluid/trainer.py | 8 +++---- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index b5d96441bc..5abadc73f7 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -492,7 +492,7 @@ def save_checkpoint(executor, if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) - serial = _get_latest_checkpoint_dir(checkpoint_dir) + 1 + serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) save_trainer_args(cur_dir, trainer_id, trainer_args) @@ -503,18 +503,6 @@ def save_checkpoint(executor, _lru_delete(checkpoint_dir, max_num_checkpoints) -def get_latest_checkpoint_serial(checkpoint_dir): - """ - If the directory have checkpoint files, it will return latest checkpoint directory serial number - - :param checkpoint_dir - """ - serial = _get_latest_checkpoint_dir(checkpoint_dir) - if serial < 0: - return None - return serial - - def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ Load checkpoint from a directory by executor, @@ -527,17 +515,16 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ if checkpoint_dir is None: - raise ValueError( - "The values of 
'checkpoint_dir' or 'serial' should not be None") + raise ValueError("The values of 'checkpoint_dir' should not be None") if serial is None or serial < 0: raise ValueError("The values of 'serial' should not be None or <0 ") if main_program is None: - raise ValueError("The values of 'main_program'should not be None") + raise ValueError('main_program should not be None.') cur_dir = _get_serial_dir(checkpoint_dir, serial) - load_persist_vars_without_grad(executor, cur_dir, main_program) + load_persist_vars_without_grad(executor, cur_dir, main_program, True) def clean_checkpoint(checkpoint_dir, delete_dir=False): @@ -557,18 +544,21 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): os.rmdir(checkpoint_dir) -def load_persist_vars_without_grad(executor, dirname, program, nest=True): +def load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): """ load_persist_vars_without_grad will load variables from a directory by an executor, the variable named end with "@GRAD" will not be loaded. - :param executor - :param dirname - :param program - :param nest + :param executor executor for load the value + :param dirname the checkpoint directory + :param program will load all variables in program + :param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__ """ - if nest: + if has_model_dir: dirname = _get_model_dir(dirname) load_vars( @@ -584,9 +574,9 @@ def save_persist_vars_without_grad(executor, dirname, program): save_persist_vars_without_grad will save variables to a directory by an executor, the variable named end with "@GRAD" will not be saved. 
- :param executor - :param dirname - :param program + :param executor executor for load the value + :param dirname the checkpoint directory + :param program will load all variables in program """ cur_dir = _get_model_dir(dirname) save_vars( @@ -722,7 +712,7 @@ def _write_success(dirname): f.write(now) -def _get_latest_checkpoint_dir(checkpoint_dir): +def get_latest_checkpoint_serial(checkpoint_dir): """ get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 3c32ec1de8..fbdd28f53e 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -146,8 +146,9 @@ class Trainer(object): "The checkpoint_config shoule be an instance of CheckpointConfig" ) else: - self.checkpoint.load_serial = io.get_latest_checkpoint_serial( + serial = io.get_latest_checkpoint_serial( self.checkpoint.checkpoint_dir) + self.checkpoint.load_serial = serial if serial >= 0 else None self.scope = core.Scope() @@ -194,10 +195,7 @@ class Trainer(object): if param_path and os.path.isdir(param_path): # load params from param_path into scope io.load_persist_vars_without_grad( - exe, - dirname=param_path, - program=self.startup_program, - nest=False) + exe, dirname=param_path, program=self.startup_program) def _transpile_nccl2_dist(self): # PADDLE_TRAINER_IPS From be16af3b04b3052e35e6d9157cec302274a629a4 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 14:48:15 +0800 Subject: [PATCH 26/93] delete pyc --- tools/codestyle/docstring_checker.pyc | Bin 12561 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc deleted file mode 100644 index a27d3c9a8cccab8552d510578debb2df04eb53bb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12561 
zcmdT~%WoUU8J{I7N~9&rPx%oi-uOw^rX)Y&B&yDAw|-TX1?xj-N!7Warm=5bYj{N50^(C+V(2Hze+imT+b{yJug6!5>yc9NV zH>}%c|-bj7cn3o>=@Nnkurg-h%;-x9IB_yO%9{kseL9wF{k5 z(pgP9V*)px+w3Nfp1^NnrV?sO3cZ5b*n1kYj>Wa+;zyo8cGvtrf8pwU&3D_Q^{_Fv z=*MG=;h2uz%QP)JX z%ESef>E`Q%@rRw)@aX#}JkSJ40i_TbNhHf_XClMO0$BUR*40xSEj*&ytKSSK*#X5HqD5N+-*~tOdAmPz0z)XssT2wa8o|d$kuV zY>O+bR{f?QSF6XlDeg>J!&Zs|p5{7xrcf+)2pzON*==Nc8`<7QPEGpqPE~HZQXjap;KBS9)Z5cSyI%a3`ND`oLViZM`P=n*s;#+9nybV24qje+-}7YM7idHdS`*xPDk z%5E0DxGH-a#bGq>SXp^%(AsMit(}=V(MUQWoeEiA!10aneS(p|;WDJJQjZ~5hKS_V zJ_{I~p8K3w27Z@;adh{oIIAA@5&Muf4=okFt5%MyrkPAe@?!EFvkWPe^VE(QB;IM&)FRgwsrajM^2|QyuX|cAw1Z%+JAu%Bh_xau#(?mQCP&H^ot;DkHCbmRhq_mb zERy3*IJ*g;QtO;^!Xf$$_%o>A869#(2w<90a%*v?Tr;#NWuW(v7N|6rkTvXJGk!z-=hkssPg*YN0zxHRA? z^}4()-EKNt#?6Ai4<9Q_haScby(>8T1?8ieS00-E>Hu8#1<+p#2{K?(QJtM$9%Kes9_A2+RRQ4(N zoK*HJw=9(r`jcZs-VGGfp*Ge_5*(lj(^{h|-f`&>kPwSUoOl{1W_%KgkSKlA)@2n>*M3+H92UyyOJAv8rEKH!=kZi$k@k5kJ&9R@Fw6Yfd9!*1yhYKp(I zsh4G>rgey0-(+yQ(q(-DgYJiIZnhn`Ho{2u2Z>N$5x(l1! z>%5GcsP(*^xFWr?qzk_)czkMMF*82sI5E9%iWh0_GLlh(qs^B8;yKdQUlSKJi~aMt zjz|9hg|RuQ%MtlAUMS0qItcmt5U%5Cn&uV4a!Oy42IhPso+>pwd^_or_u;F-!Gi$+ zXElTQ#CKcYhA)}+OT3iZ(VwFD(1%E=F0`BC%^`AW~Tm51wQ0p`mCnuP&qm_3#v-d<`Ufnx6*&oh<9({{~(t+4=N+Wb%5^3MR&`%tKXq0ESt%M|3<1&3*uB}7XLkB<;7=U7LyZa0ic z`_6d{g>h$zC+7;Ac(yd+%$=Bn8#C@#Xm|Xj&gs6{C#2djCzPl)xcApzc1B*(x?5xe zav_mJXcz~BLJ19oh)hqZR?D22G7NBp6MPv;_j^2v3LO^ZIhYx;jzE1Lu?9@)fy#xa&T`}$3PZRc;h(_U zDhl}?67q$$2T&p>5Wyca(46`)Y!yh4jQSDY*rBE|groA3^?A>bK14Ng4sg8(xB&A8 zkSI(tWGspkAyn!@{YMlX0o@@|6BtiyyAtw2_N0Aj{0#k7IBFOzK3dmWdk%X~48e}k z*PY?HsBcqqkJn$P*+B#oxfk(h#&)2#=JqjQ25cH34QzeSS`n$-$-DAR5ta*}CXt3f zky4}M@~;_~Frp0*7gAeewuyT?x#%^$)mC)nuiS?rC(Z2_r5qAaamfE1$pnWUa4$Cs z)jd7f=ntENyIi^#O-Tmmc4H%Va#@Mw$>f5PE{3)SInH(L=}&kP{wpBHOqh;JOaB2y*AVJqyAE$jc%&1@uuqSkw!6IYjNFD$g>loswf_%>6OLgq>M*R) zOJ($?jIdr$O^yiu7tkx-7tUw$RBv;9SM+a2To|!MOy^8v>A!J#ru9@m<@yVF4nn4b zrpQq=qbSo!f%^L>Vlg{b$AthaSQ{s4df@(HdVsh{RvBiiR(3I83^NQy${jSAnb+3| zs343^ryll2N`Q#&e 
zZo;9w`8H!F_`DEFGO-3=q;s?UVh1+C4pmvb$+f2E-jpwe&tE+{{)gYL@Ilj2=U9FY{SOn#q;{1Uv6bQ6Z51?er5457O9WZD^0oWdh_kY ziR9ZJ99vJ=&PBkxj3uTb!pBG=m6+5y1B)q!i)@$Z8%feyqEyV)*O&ajjl5J6$P@8n!7>8nbH&Hg6BL@@;HxoP0{@ zq6YHT2tKGAMy7lpZES>!Pgn)9i1^(NecpkLQHA`r*%@WQl#Mgaf)<|n8ma2@%S}o0 zAfrw%|IY2ot-H5w&rH2jefLJCGIjGt_13h67r%Ss{=M7IbPw3M&xvSy>9xju7>HTf zLMAZs<2AXDhQ>Fdlp0}6zD#r8MBTZ{Vu}UBpUyQFbl=5?5QER58g(d~9G<+Z)upyY z9uDoy5Rd*2ihMS26`<0o+WDKs=Z!^t(NsjOU&`I6@iDD_YlxZt1Cry<7Yc(Tg^|ME z!r{VU`IJvC#<+X_s!=Y(SjFV0M>{EDsvqTm43wl0l3w|kd;uvzZElvIP}RdANWKP? y@0p!5e3LgJ_hn2!lV0JmwgrNPd?rm|;L1+yD)~TS&>BQOgn5v`Y@u)D&VK>kVxyJ- From 2e0d851685402e4e9d1835824f956326e9f2b0fd Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 5 Jun 2018 15:39:49 +0800 Subject: [PATCH 27/93] update readme --- benchmark/fluid/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index c17cab999b..48280a5621 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -45,10 +45,11 @@ Currently supported `--model` argument include: ## Prepare the RecordIO file to Achieve Better Performance Run the following command will generate RecordIO files like "mnist.recordio" under the path -and batch_size you choose: +and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size +at any time using `fluid.batch`. 
```bash -python -c 'from recordio_converter import *; prepare_mnist("data", 32)' +python -c 'from recordio_converter import *; prepare_mnist("data", 1)' ``` ## Run Distributed Benchmark on Kubernetes Cluster From 725ea3f17d38edeb4648694f6115c0ff9cd2efe4 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 5 Jun 2018 16:01:27 +0800 Subject: [PATCH 28/93] update --- benchmark/fluid/models/resnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9ccd41c9e9..47d8d026ed 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -142,11 +142,11 @@ def get_model(args): else: dshape = [224, 224, 3] model = resnet_imagenet - if not args.data_dir: + if not args.data_path: raise Exception( - "Must specify --data_dir when training with imagenet") - train_reader = imagenet_train(args.data_dir) - test_reader = imagenet_test(args.data_dir) + "Must specify --data_path when training with imagenet") + train_reader = imagenet_train(args.data_path) + test_reader = imagenet_test(args.data_path) if args.use_reader_op: filelist = [ From eea5762e26a9a6ae2d9642830031028e5952af45 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 17:04:17 +0800 Subject: [PATCH 29/93] add checkpoint unittest --- .../fluid/tests/unittests/test_checkpoint.py | 72 ++++++++++++++++++ tools/codestyle/docstring_checker.pyc | Bin 0 -> 12561 bytes 2 files changed, 72 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_checkpoint.py create mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py new file mode 100644 index 0000000000..b8d82c59b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -0,0 +1,72 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import unittest + + +class TestCheckpoint(unittest.TestCase): + def setUp(self): + self.dirname = "/tmp/ckpt" + self.max_num_checkpoints = 3 + self.epoch_interval = 1 + self.step_interval = 1 + self.trainer_id = 0 + self.chief = self.trainer_id == 0 + self.place = fluid.CPUPlace() + self.epoch_id = 100 + self.step_id = 20 + + def test_checkpoint(self): + self.save_checkpoint() + serial = fluid.io.get_latest_checkpoint_serial(self.dirname) + self.assertTrue(serial >= 0) + trainer_args = ["epoch_id", "step_id"] + epoch_id, step_id = fluid.io.load_trainer_args( + self.dirname, serial, self.trainer_id, trainer_args) + self.assertEqual(self.step_id, step_id) + self.assertEqual(self.epoch_id, epoch_id) + + program = fluid.Program() + with fluid.program_guard(program): + exe = fluid.Executor(self.place) + fluid.io.load_checkpoint(exe, self.dirname, serial, program) + + fluid.io.clean_checkpoint(self.dirname, delete_dir=True) + + def save_checkpoint(self): + config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints, + self.epoch_interval, self.step_interval) + + trainer_args = {} + trainer_args["epoch_id"] = self.epoch_id + trainer_args["step_id"] = self.step_id + + program = fluid.Program() + with fluid.program_guard(program): + program.global_block().create_var( + name="scale_0", + psersistable=True, + dtype="float32", + shape=[32, 32]) + + exe = fluid.Executor(self.place) + for i 
in xrange(10): + fluid.io.save_checkpoint( + exe, config.checkpoint_dir, self.trainer_id, self.chief, + trainer_args, program, config.max_num_checkpoints) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a27d3c9a8cccab8552d510578debb2df04eb53bb GIT binary patch literal 12561 zcmdT~%WoUU8J{I7N~9&rPx%oi-uOw^rX)Y&B&yDAw|-TX1?xj-N!7Warm=5bYj{N50^(C+V(2Hze+imT+b{yJug6!5>yc9NV zH>}%c|-bj7cn3o>=@Nnkurg-h%;-x9IB_yO%9{kseL9wF{k5 z(pgP9V*)px+w3Nfp1^NnrV?sO3cZ5b*n1kYj>Wa+;zyo8cGvtrf8pwU&3D_Q^{_Fv z=*MG=;h2uz%QP)JX z%ESef>E`Q%@rRw)@aX#}JkSJ40i_TbNhHf_XClMO0$BUR*40xSEj*&ytKSSK*#X5HqD5N+-*~tOdAmPz0z)XssT2wa8o|d$kuV zY>O+bR{f?QSF6XlDeg>J!&Zs|p5{7xrcf+)2pzON*==Nc8`<7QPEGpqPE~HZQXjap;KBS9)Z5cSyI%a3`ND`oLViZM`P=n*s;#+9nybV24qje+-}7YM7idHdS`*xPDk z%5E0DxGH-a#bGq>SXp^%(AsMit(}=V(MUQWoeEiA!10aneS(p|;WDJJQjZ~5hKS_V zJ_{I~p8K3w27Z@;adh{oIIAA@5&Muf4=okFt5%MyrkPAe@?!EFvkWPe^VE(QB;IM&)FRgwsrajM^2|QyuX|cAw1Z%+JAu%Bh_xau#(?mQCP&H^ot;DkHCbmRhq_mb zERy3*IJ*g;QtO;^!Xf$$_%o>A869#(2w<90a%*v?Tr;#NWuW(v7N|6rkTvXJGk!z-=hkssPg*YN0zxHRA? z^}4()-EKNt#?6Ai4<9Q_haScby(>8T1?8ieS00-E>Hu8#1<+p#2{K?(QJtM$9%Kes9_A2+RRQ4(N zoK*HJw=9(r`jcZs-VGGfp*Ge_5*(lj(^{h|-f`&>kPwSUoOl{1W_%KgkSKlA)@2n>*M3+H92UyyOJAv8rEKH!=kZi$k@k5kJ&9R@Fw6Yfd9!*1yhYKp(I zsh4G>rgey0-(+yQ(q(-DgYJiIZnhn`Ho{2u2Z>N$5x(l1! 
z>%5GcsP(*^xFWr?qzk_)czkMMF*82sI5E9%iWh0_GLlh(qs^B8;yKdQUlSKJi~aMt zjz|9hg|RuQ%MtlAUMS0qItcmt5U%5Cn&uV4a!Oy42IhPso+>pwd^_or_u;F-!Gi$+ zXElTQ#CKcYhA)}+OT3iZ(VwFD(1%E=F0`BC%^`AW~Tm51wQ0p`mCnuP&qm_3#v-d<`Ufnx6*&oh<9({{~(t+4=N+Wb%5^3MR&`%tKXq0ESt%M|3<1&3*uB}7XLkB<;7=U7LyZa0ic z`_6d{g>h$zC+7;Ac(yd+%$=Bn8#C@#Xm|Xj&gs6{C#2djCzPl)xcApzc1B*(x?5xe zav_mJXcz~BLJ19oh)hqZR?D22G7NBp6MPv;_j^2v3LO^ZIhYx;jzE1Lu?9@)fy#xa&T`}$3PZRc;h(_U zDhl}?67q$$2T&p>5Wyca(46`)Y!yh4jQSDY*rBE|groA3^?A>bK14Ng4sg8(xB&A8 zkSI(tWGspkAyn!@{YMlX0o@@|6BtiyyAtw2_N0Aj{0#k7IBFOzK3dmWdk%X~48e}k z*PY?HsBcqqkJn$P*+B#oxfk(h#&)2#=JqjQ25cH34QzeSS`n$-$-DAR5ta*}CXt3f zky4}M@~;_~Frp0*7gAeewuyT?x#%^$)mC)nuiS?rC(Z2_r5qAaamfE1$pnWUa4$Cs z)jd7f=ntENyIi^#O-Tmmc4H%Va#@Mw$>f5PE{3)SInH(L=}&kP{wpBHOqh;JOaB2y*AVJqyAE$jc%&1@uuqSkw!6IYjNFD$g>loswf_%>6OLgq>M*R) zOJ($?jIdr$O^yiu7tkx-7tUw$RBv;9SM+a2To|!MOy^8v>A!J#ru9@m<@yVF4nn4b zrpQq=qbSo!f%^L>Vlg{b$AthaSQ{s4df@(HdVsh{RvBiiR(3I83^NQy${jSAnb+3| zs343^ryll2N`Q#&e zZo;9w`8H!F_`DEFGO-3=q;s?UVh1+C4pmvb$+f2E-jpwe&tE+{{)gYL@Ilj2=U9FY{SOn#q;{1Uv6bQ6Z51?er5457O9WZD^0oWdh_kY ziR9ZJ99vJ=&PBkxj3uTb!pBG=m6+5y1B)q!i)@$Z8%feyqEyV)*O&ajjl5J6$P@8n!7>8nbH&Hg6BL@@;HxoP0{@ zq6YHT2tKGAMy7lpZES>!Pgn)9i1^(NecpkLQHA`r*%@WQl#Mgaf)<|n8ma2@%S}o0 zAfrw%|IY2ot-H5w&rH2jefLJCGIjGt_13h67r%Ss{=M7IbPw3M&xvSy>9xju7>HTf zLMAZs<2AXDhQ>Fdlp0}6zD#r8MBTZ{Vu}UBpUyQFbl=5?5QER58g(d~9G<+Z)upyY z9uDoy5Rd*2ihMS26`<0o+WDKs=Z!^t(NsjOU&`I6@iDD_YlxZt1Cry<7Yc(Tg^|ME z!r{VU`IJvC#<+X_s!=Y(SjFV0M>{EDsvqTm43wl0l3w|kd;uvzZElvIP}RdANWKP? 
y@0p!5e3LgJ_hn2!lV0JmwgrNPd?rm|;L1+yD)~TS&>BQOgn5v`Y@u)D&VK>kVxyJ- literal 0 HcmV?d00001 From 951fa7441c7eff3596735ac55dda01288870aab6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 17:04:45 +0800 Subject: [PATCH 30/93] add checkpoint unittest --- tools/codestyle/docstring_checker.pyc | Bin 12561 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc deleted file mode 100644 index a27d3c9a8cccab8552d510578debb2df04eb53bb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12561 zcmdT~%WoUU8J{I7N~9&rPx%oi-uOw^rX)Y&B&yDAw|-TX1?xj-N!7Warm=5bYj{N50^(C+V(2Hze+imT+b{yJug6!5>yc9NV zH>}%c|-bj7cn3o>=@Nnkurg-h%;-x9IB_yO%9{kseL9wF{k5 z(pgP9V*)px+w3Nfp1^NnrV?sO3cZ5b*n1kYj>Wa+;zyo8cGvtrf8pwU&3D_Q^{_Fv z=*MG=;h2uz%QP)JX z%ESef>E`Q%@rRw)@aX#}JkSJ40i_TbNhHf_XClMO0$BUR*40xSEj*&ytKSSK*#X5HqD5N+-*~tOdAmPz0z)XssT2wa8o|d$kuV zY>O+bR{f?QSF6XlDeg>J!&Zs|p5{7xrcf+)2pzON*==Nc8`<7QPEGpqPE~HZQXjap;KBS9)Z5cSyI%a3`ND`oLViZM`P=n*s;#+9nybV24qje+-}7YM7idHdS`*xPDk z%5E0DxGH-a#bGq>SXp^%(AsMit(}=V(MUQWoeEiA!10aneS(p|;WDJJQjZ~5hKS_V zJ_{I~p8K3w27Z@;adh{oIIAA@5&Muf4=okFt5%MyrkPAe@?!EFvkWPe^VE(QB;IM&)FRgwsrajM^2|QyuX|cAw1Z%+JAu%Bh_xau#(?mQCP&H^ot;DkHCbmRhq_mb zERy3*IJ*g;QtO;^!Xf$$_%o>A869#(2w<90a%*v?Tr;#NWuW(v7N|6rkTvXJGk!z-=hkssPg*YN0zxHRA? z^}4()-EKNt#?6Ai4<9Q_haScby(>8T1?8ieS00-E>Hu8#1<+p#2{K?(QJtM$9%Kes9_A2+RRQ4(N zoK*HJw=9(r`jcZs-VGGfp*Ge_5*(lj(^{h|-f`&>kPwSUoOl{1W_%KgkSKlA)@2n>*M3+H92UyyOJAv8rEKH!=kZi$k@k5kJ&9R@Fw6Yfd9!*1yhYKp(I zsh4G>rgey0-(+yQ(q(-DgYJiIZnhn`Ho{2u2Z>N$5x(l1! 
z>%5GcsP(*^xFWr?qzk_)czkMMF*82sI5E9%iWh0_GLlh(qs^B8;yKdQUlSKJi~aMt zjz|9hg|RuQ%MtlAUMS0qItcmt5U%5Cn&uV4a!Oy42IhPso+>pwd^_or_u;F-!Gi$+ zXElTQ#CKcYhA)}+OT3iZ(VwFD(1%E=F0`BC%^`AW~Tm51wQ0p`mCnuP&qm_3#v-d<`Ufnx6*&oh<9({{~(t+4=N+Wb%5^3MR&`%tKXq0ESt%M|3<1&3*uB}7XLkB<;7=U7LyZa0ic z`_6d{g>h$zC+7;Ac(yd+%$=Bn8#C@#Xm|Xj&gs6{C#2djCzPl)xcApzc1B*(x?5xe zav_mJXcz~BLJ19oh)hqZR?D22G7NBp6MPv;_j^2v3LO^ZIhYx;jzE1Lu?9@)fy#xa&T`}$3PZRc;h(_U zDhl}?67q$$2T&p>5Wyca(46`)Y!yh4jQSDY*rBE|groA3^?A>bK14Ng4sg8(xB&A8 zkSI(tWGspkAyn!@{YMlX0o@@|6BtiyyAtw2_N0Aj{0#k7IBFOzK3dmWdk%X~48e}k z*PY?HsBcqqkJn$P*+B#oxfk(h#&)2#=JqjQ25cH34QzeSS`n$-$-DAR5ta*}CXt3f zky4}M@~;_~Frp0*7gAeewuyT?x#%^$)mC)nuiS?rC(Z2_r5qAaamfE1$pnWUa4$Cs z)jd7f=ntENyIi^#O-Tmmc4H%Va#@Mw$>f5PE{3)SInH(L=}&kP{wpBHOqh;JOaB2y*AVJqyAE$jc%&1@uuqSkw!6IYjNFD$g>loswf_%>6OLgq>M*R) zOJ($?jIdr$O^yiu7tkx-7tUw$RBv;9SM+a2To|!MOy^8v>A!J#ru9@m<@yVF4nn4b zrpQq=qbSo!f%^L>Vlg{b$AthaSQ{s4df@(HdVsh{RvBiiR(3I83^NQy${jSAnb+3| zs343^ryll2N`Q#&e zZo;9w`8H!F_`DEFGO-3=q;s?UVh1+C4pmvb$+f2E-jpwe&tE+{{)gYL@Ilj2=U9FY{SOn#q;{1Uv6bQ6Z51?er5457O9WZD^0oWdh_kY ziR9ZJ99vJ=&PBkxj3uTb!pBG=m6+5y1B)q!i)@$Z8%feyqEyV)*O&ajjl5J6$P@8n!7>8nbH&Hg6BL@@;HxoP0{@ zq6YHT2tKGAMy7lpZES>!Pgn)9i1^(NecpkLQHA`r*%@WQl#Mgaf)<|n8ma2@%S}o0 zAfrw%|IY2ot-H5w&rH2jefLJCGIjGt_13h67r%Ss{=M7IbPw3M&xvSy>9xju7>HTf zLMAZs<2AXDhQ>Fdlp0}6zD#r8MBTZ{Vu}UBpUyQFbl=5?5QER58g(d~9G<+Z)upyY z9uDoy5Rd*2ihMS26`<0o+WDKs=Z!^t(NsjOU&`I6@iDD_YlxZt1Cry<7Yc(Tg^|ME z!r{VU`IJvC#<+X_s!=Y(SjFV0M>{EDsvqTm43wl0l3w|kd;uvzZElvIP}RdANWKP? 
y@0p!5e3LgJ_hn2!lV0JmwgrNPd?rm|;L1+yD)~TS&>BQOgn5v`Y@u)D&VK>kVxyJ- From 3b5e3f9be4b97f15aac809b851cb328bbf424437 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 18:05:06 +0800 Subject: [PATCH 31/93] update checkpoint unittest --- python/paddle/fluid/tests/unittests/test_checkpoint.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py index b8d82c59b4..150e8822d5 100644 --- a/python/paddle/fluid/tests/unittests/test_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -14,6 +14,7 @@ import paddle.fluid as fluid import unittest +import os class TestCheckpoint(unittest.TestCase): @@ -35,8 +36,8 @@ class TestCheckpoint(unittest.TestCase): trainer_args = ["epoch_id", "step_id"] epoch_id, step_id = fluid.io.load_trainer_args( self.dirname, serial, self.trainer_id, trainer_args) - self.assertEqual(self.step_id, step_id) - self.assertEqual(self.epoch_id, epoch_id) + self.assertEqual(self.step_id, int(step_id)) + self.assertEqual(self.epoch_id, int(epoch_id)) program = fluid.Program() with fluid.program_guard(program): @@ -44,6 +45,7 @@ class TestCheckpoint(unittest.TestCase): fluid.io.load_checkpoint(exe, self.dirname, serial, program) fluid.io.clean_checkpoint(self.dirname, delete_dir=True) + self.assertFalse(os.path.isdir(self.dirname)) def save_checkpoint(self): config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints, From 6db240d78b3b515a1b2d885e8cc6d8e0b2ffd638 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 19:25:55 +0800 Subject: [PATCH 32/93] update trainer about epoch_id and step id --- python/paddle/fluid/trainer.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index fbdd28f53e..4ffc206458 100644 --- a/python/paddle/fluid/trainer.py +++ 
b/python/paddle/fluid/trainer.py @@ -188,7 +188,7 @@ class Trainer(object): if not self.checkpoint.is_pserver: epoch_id, step_id = io.load_trainer_args( self.checkpoint.checkpoint_dir, self.checkpoint.load_serial, - self.trainer_id, ["epoch_id", "step_id"]) + self.trainer_id, self._get_checkpoint_load_args()) self.checkpoint.epoch_id = int(epoch_id) self.checkpoint.step_id = int(step_id) @@ -432,22 +432,33 @@ class Trainer(object): return io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) + def _get_checkpoint_load_args(self): + """ + epoch_id and step_id are runtime arguments, they are not variables, will load them independently. + """ + return ["epoch_id", "step_id"] + + def _get_checkpoint_save_args(self, epoch_id, step_id): + """ + epoch_id and step_id are runtime arguments, they are not variables, will save them independently. + """ + trainer_args = {} + trainer_args["epoch_id"] = epoch_id + trainer_args["step_id"] = step_id + return trainer_args + def _save_checkpoint(self, epoch_id, step_id): if not self.checkpoint: return if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: - trainer_args = {} - trainer_args["epoch_id"] = epoch_id - trainer_args["step_id"] = step_id - exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, checkpoint_dir=self.checkpoint.checkpoint_dir, trainer_id=self.trainer_id, is_chief=self.chief, - trainer_args=trainer_args, + trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), main_program=self.train_program, max_num_checkpoints=self.checkpoint.max_num_checkpoints) From f28f41dbcdb0479d98682b94eb13db95112de424 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 19:40:41 +0800 Subject: [PATCH 33/93] update io.py annotations and codes --- python/paddle/fluid/io.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5abadc73f7..8fcc778709 
100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -483,11 +483,11 @@ def save_checkpoint(executor, :param main_program :param max_num_checkpoints """ - if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + if checkpoint_dir.strip() is None: + raise ValueError("'checkpoint_dir' should not be None") - if trainer_args and not isinstance(trainer_args, dict): - raise TypeError("The type of 'trainer_args' should be dict") + if trainer_args: + assert isinstance(trainer_args, dict) if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir) @@ -514,11 +514,11 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): :param main_program """ - if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + if checkpoint_dir.strip() is None: + raise ValueError("'checkpoint_dir' should not be None") if serial is None or serial < 0: - raise ValueError("The values of 'serial' should not be None or <0 ") + raise ValueError("'serial' should not be None or <0 ") if main_program is None: raise ValueError('main_program should not be None.') @@ -536,8 +536,8 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): :param delete_dir """ - if checkpoint_dir is None: - raise ValueError("The values of 'checkpoint_dir' should not be None") + if checkpoint_dir.strip() is None: + raise ValueError("'checkpoint_dir' should not be None") _lru_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): @@ -590,8 +590,8 @@ def save_persist_vars_without_grad(executor, dirname, program): def save_trainer_args(dirname, trainer_id, trainer_args): - if not isinstance(trainer_args, dict): - raise TypeError("The type of 'trainer_args' should be dict") + assert isinstance(trainer_args, dict) + cur_dir = _get_trainer_dir(dirname, trainer_id) for name, value in trainer_args.iteritems(): @@ -602,12 +602,11 @@ def save_trainer_args(dirname, 
trainer_id, trainer_args): def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + assert isinstance(trainer_args, list) + cur_dir = _get_serial_dir(checkpoint_dir, serial) cur_dir = _get_trainer_dir(cur_dir, trainer_id) - if not isinstance(trainer_args, list): - raise TypeError("The type of 'trainer_args' should be list") - ret_values = [] for arg in trainer_args: From 53409a29d889903ec1414d72f0455fe4ef6588a6 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 5 Jun 2018 22:00:30 +0800 Subject: [PATCH 34/93] code optimized --- python/paddle/fluid/trainer.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 4ffc206458..9882d5cda0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -141,14 +141,10 @@ class Trainer(object): self.chief = True self.checkpoint = checkpoint_config if self.checkpoint: - if not isinstance(self.checkpoint, CheckpointConfig): - raise TypeError( - "The checkpoint_config shoule be an instance of CheckpointConfig" - ) - else: - serial = io.get_latest_checkpoint_serial( - self.checkpoint.checkpoint_dir) - self.checkpoint.load_serial = serial if serial >= 0 else None + assert isinstance(self.checkpoint, CheckpointConfig) + serial = io.get_latest_checkpoint_serial( + self.checkpoint.checkpoint_dir) + self.checkpoint.load_serial = serial if serial >= 0 else None self.scope = core.Scope() @@ -385,8 +381,8 @@ class Trainer(object): else: metrics = exe.run(feed=data, fetch_list=[]) - event_handler(EndStepEvent(epoch_id, step_id, metrics)) self._save_checkpoint(epoch_id, step_id) + event_handler(EndStepEvent(epoch_id, step_id, metrics)) event_handler(EndEpochEvent(epoch_id)) self._clean_checkpoint() From 8d14b3953f50f1a311d9b02d8e24e616f485e085 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 6 Jun 2018 13:02:29 +0800 Subject: [PATCH 35/93] follow comments --- benchmark/fluid/fluid_benchmark.py | 33 
++++++++++++++----- benchmark/fluid/models/machine_translation.py | 2 +- benchmark/fluid/models/mnist.py | 2 +- benchmark/fluid/models/resnet.py | 2 +- .../fluid/models/stacked_dynamic_lstm.py | 2 +- benchmark/fluid/models/vgg.py | 2 +- 6 files changed, 30 insertions(+), 13 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 399f5fb49c..c2771ba5db 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -38,7 +38,10 @@ def parse_args(): default='resnet', help='The model to run benchmark with.') parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') + '--batch_size', + type=int, + default=32, + help='The batch size on each gpu.') parser.add_argument( '--learning_rate', type=float, default=0.001, help='The learning rate.') parser.add_argument( @@ -229,27 +232,35 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - reader_generator = train_reader() + if not args.use_reader_op: + reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) - if iters == args.iterations or data == None: + if data == None: + break + if iters == args.iterations: break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if args.use_reader_op: - loss = exe.run(train_prog, fetch_list=[avg_loss]) + try: + loss = exe.run(train_prog, fetch_list=[avg_loss]) + except fluid.core.EnforceNotMet as ex: + break else: loss = exe.run(train_prog, feed=feeder.feed(data), fetch_list=[avg_loss]) iters += 1 batch_id += 1 - # FIXME(wuyi): last batch size maybe different + # FIXME(wuyi): For use_reader_op, if the current + # pass is not the last, the last batch of this pass + # is also equal to args.batch_size. 
num_samples += len(args.batch_size) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % @@ -315,13 +326,16 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 iters = 0 start_time = time.time() - reader_generator = train_reader() + if not args.use_reader_op: + reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) - if iters == args.iterations or data == None: + if data == None: + break + if iters == args.iterations: break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") @@ -335,7 +349,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if args.use_reader_op and iters >= args.iterations / args.gpus: break if args.use_fake_data or args.use_reader_op: - loss, = exe.run([avg_loss.name]) + try: + loss, = exe.run([avg_loss.name]) + except fluid.core.EnforceNotMet as ex: + break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 3024882725..69541adf6b 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -223,7 +223,7 @@ def get_model(args): train_batch_generator = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size), buf_size=1000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_batch_generator = paddle.batch( paddle.reader.shuffle( diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 5d3da68daf..54206c252c 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -103,7 +103,7 @@ def get_model(args): # Reader train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) + paddle.dataset.mnist.train(), 
batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=args.batch_size) return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 47d8d026ed..3c87076724 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -184,7 +184,7 @@ def get_model(args): batched_train_reader = paddle.batch( paddle.reader.shuffle( train_reader, buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size) return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index e2a8cf45ac..211869af4e 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -118,7 +118,7 @@ def get_model(args): train_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = batch( paddle.reader.shuffle( crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000), diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index b84e118a88..cb0dc97763 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -110,7 +110,7 @@ def get_model(args): paddle.dataset.cifar.train10() if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), buf_size=5120), - batch_size=args.batch_size) + batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), From 6fdd5de2aac0e8eba046a21840127e3d650d388f Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 6 Jun 
2018 13:39:09 +0800 Subject: [PATCH 36/93] update --- benchmark/fluid/fluid_benchmark.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 150346798e..ca7f7dbb07 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -266,7 +266,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, # FIXME(wuyi): For use_reader_op, if the current # pass is not the last, the last batch of this pass # is also equal to args.batch_size. - num_samples += len(args.batch_size) + if args.use_reader_op: + num_samples += args.batch_size + else: + num_samples += len(data) train_losses.append(loss) print("Pass: %d, Iter: %d, Loss: %f\n" % (pass_id, iters, np.mean(train_losses))) @@ -350,9 +353,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - # NOTE: if use reader ops, the input data is not splited to multiple cards - if args.use_reader_op and iters >= args.iterations / args.gpus: - break if args.use_fake_data or args.use_reader_op: try: loss, = exe.run([avg_loss.name]) @@ -362,7 +362,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.update_method == "pserver": exe.bcast_params() - num_samples += len(data) + if args.use_reader_op: + num_samples += args.batch_size + else: + num_samples += len(data) iters += 1 if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % From 2f44585e831578b58b53ce5d4b6adcb0275530ce Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 6 Jun 2018 17:26:51 +0800 Subject: [PATCH 37/93] code optimized --- python/paddle/fluid/io.py | 42 +++++++------ .../fluid/tests/unittests/test_checkpoint.py | 3 +- python/paddle/fluid/trainer.py | 60 +++++++++---------- 3 files changed, 52 insertions(+), 
53 deletions(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 8fcc778709..34c527b62f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -476,14 +476,14 @@ def save_checkpoint(executor, to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, The interval between two saved checkpoints must greater than save_interval_secs. - :param executor - :param checkpoint_dir - :param trainer_id - :param is_chief - :param main_program - :param max_num_checkpoints - """ - if checkpoint_dir.strip() is None: + :param executor executor for save the value + :param checkpoint_dir the checkpoint directory + :param trainer_id currect trainer id + :param is_chief if the trainer id equals 0, the is_chief will be true + :param main_program will save all variables in program + :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints + """ + if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") if trainer_args: @@ -500,7 +500,7 @@ def save_checkpoint(executor, if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) - _lru_delete(checkpoint_dir, max_num_checkpoints) + _scroll_delete(checkpoint_dir, max_num_checkpoints) def load_checkpoint(executor, checkpoint_dir, serial, main_program): @@ -508,13 +508,13 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): Load checkpoint from a directory by executor, it will find the most recent saved checkpoint file and load it auto. 
- :param executor - :param checkpoint_dir - :param serial - :param main_program + :param executor executor for load the value + :param checkpoint_dir the checkpoint directory + :param serial the serial folder in checkpoint directory will be load + :param main_program will load all variables in program """ - if checkpoint_dir.strip() is None: + if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") if serial is None or serial < 0: @@ -536,9 +536,9 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False): :param delete_dir """ - if checkpoint_dir.strip() is None: + if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") - _lru_delete(checkpoint_dir, max_num_checkpoints=0) + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) if delete_dir and not os.listdir(checkpoint_dir): os.rmdir(checkpoint_dir) @@ -681,7 +681,7 @@ def _get_trainer_dir(dirname, trainer_id): return trainer_dir -def _lru_delete(dirname, max_num_checkpoints=3): +def _scroll_delete(dirname, max_num_checkpoints=3): dirs = os.listdir(dirname) serial_map = {} for serial in dirs: @@ -717,7 +717,7 @@ def get_latest_checkpoint_serial(checkpoint_dir): :param checkpoint_dir """ - if not checkpoint_dir.strip(): + if not checkpoint_dir: return -1 def has_success(checkpoint_dir, cur_dir): @@ -726,10 +726,8 @@ def get_latest_checkpoint_serial(checkpoint_dir): """ serial = _get_dir_serial(cur_dir) - if serial == -1: - return -1 - - if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)): + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): return -1 success_path = os.path.join( diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py index 150e8822d5..cf70dfd448 100644 --- a/python/paddle/fluid/tests/unittests/test_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -15,11 +15,12 @@ import paddle.fluid as fluid import unittest 
import os +import tempfile class TestCheckpoint(unittest.TestCase): def setUp(self): - self.dirname = "/tmp/ckpt" + self.dirname = tempfile.mktemp() self.max_num_checkpoints = 3 self.epoch_interval = 1 self.step_interval = 1 diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 9882d5cda0..e5cec4c76a 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -132,19 +132,18 @@ class Trainer(object): # 1. we need to generate a framework.Program by calling # program_func. Reference: fluid.program_guard in # test_word2vec.py - if not isinstance(optimizer, opt_module.Optimizer): - raise TypeError("The optimizer should be an instance of Optimizer") + assert isinstance(optimizer, opt_module.Optimizer) # config for checkpoint # only chief worker will save variables self.trainer_id = 0 self.chief = True - self.checkpoint = checkpoint_config - if self.checkpoint: - assert isinstance(self.checkpoint, CheckpointConfig) + self.checkpoint_cfg = checkpoint_config + if self.checkpoint_cfg: + assert isinstance(self.checkpoint_cfg, CheckpointConfig) serial = io.get_latest_checkpoint_serial( - self.checkpoint.checkpoint_dir) - self.checkpoint.load_serial = serial if serial >= 0 else None + self.checkpoint_cfg.checkpoint_dir) + self.checkpoint_cfg.load_serial = serial if serial >= 0 else None self.scope = core.Scope() @@ -174,19 +173,20 @@ class Trainer(object): exe = executor.Executor(place) exe.run(self.startup_program) - if self.checkpoint and self.checkpoint.load_serial: + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial: with self._prog_and_scope_guard(): exe = executor.Executor(place) - io.load_checkpoint(exe, self.checkpoint.checkpoint_dir, - self.checkpoint.load_serial, + io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.startup_program) - if not self.checkpoint.is_pserver: + if not self.checkpoint_cfg.is_pserver: epoch_id, step_id = io.load_trainer_args( - 
self.checkpoint.checkpoint_dir, self.checkpoint.load_serial, - self.trainer_id, self._get_checkpoint_load_args()) - self.checkpoint.epoch_id = int(epoch_id) - self.checkpoint.step_id = int(step_id) + self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.trainer_id, + self._get_checkpoint_load_args()) + self.checkpoint_cfg.epoch_id = int(epoch_id) + self.checkpoint_cfg.step_id = int(step_id) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -256,7 +256,7 @@ class Trainer(object): t.transpile( self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": - if self.checkpoint: + if self.checkpoint_cfg: self.is_pserver = True self.train_program = t.get_pserver_program(current_endpoint) @@ -351,10 +351,10 @@ class Trainer(object): self._train_by_any_executor(event_handler, exe, num_epochs, reader) def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - if self.checkpoint: + if self.checkpoint_cfg: epochs = [ epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint.epoch_id + if epoch_id >= self.checkpoint_cfg.epoch_id ] else: epochs = [epoch_id for epoch_id in range(num_epochs)] @@ -366,8 +366,8 @@ class Trainer(object): self._clean_checkpoint() return - if self.checkpoint and self.checkpoint.load_serial \ - and self.checkpoint.step_id >= step_id and self.checkpoint.epoch_id == epoch_id: + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ + and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: continue begin_event = BeginStepEvent(epoch_id, step_id) @@ -381,10 +381,12 @@ class Trainer(object): else: metrics = exe.run(feed=data, fetch_list=[]) - self._save_checkpoint(epoch_id, step_id) + if self.checkpoint_cfg: + self._save_checkpoint(epoch_id, step_id) event_handler(EndStepEvent(epoch_id, step_id, metrics)) event_handler(EndEpochEvent(epoch_id)) - self._clean_checkpoint() + if 
self.checkpoint_cfg: + self._clean_checkpoint() def _test_by_executor(self, reader, feed_order, fetch_list): with executor.scope_guard(self.scope): @@ -424,9 +426,8 @@ class Trainer(object): return self._get_parallel_executor() def _clean_checkpoint(self): - if not self.checkpoint: - return - io.clean_checkpoint(checkpoint_dir=self.checkpoint.checkpoint_dir) + assert self.checkpoint_cfg + io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) def _get_checkpoint_load_args(self): """ @@ -444,19 +445,18 @@ class Trainer(object): return trainer_args def _save_checkpoint(self, epoch_id, step_id): - if not self.checkpoint: - return + assert self.checkpoint_cfg - if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0: + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0: exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, - checkpoint_dir=self.checkpoint.checkpoint_dir, + checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, trainer_id=self.trainer_id, is_chief=self.chief, trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), main_program=self.train_program, - max_num_checkpoints=self.checkpoint.max_num_checkpoints) + max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) def build_feed_var_list(program, feed_order): From 8370c5afd0aef8f4a709a9eb6d71f181a03788f6 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Wed, 6 Jun 2018 19:10:41 +0800 Subject: [PATCH 38/93] fix errors by comment --- benchmark/fluid/fluid_benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index ca7f7dbb07..bd0243aa60 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -267,7 +267,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, # pass is not the last, the last batch of this pass # 
is also equal to args.batch_size. if args.use_reader_op: - num_samples += args.batch_size + num_samples += args.batch_size * args.gpus else: num_samples += len(data) train_losses.append(loss) @@ -363,7 +363,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if args.update_method == "pserver": exe.bcast_params() if args.use_reader_op: - num_samples += args.batch_size + num_samples += args.batch_size * args.gpus else: num_samples += len(data) iters += 1 From 49ac0713d9a376198065ddcb4d1a1cf930913459 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 6 Jun 2018 19:14:59 +0800 Subject: [PATCH 39/93] add infer multi-threads demo --- .../inference/demo/simple_on_word2vec.cc | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 9b4843f714..113f3c774e 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include #include +#include #include "paddle/contrib/inference/paddle_inference_api.h" - namespace paddle { namespace demo { @@ -64,7 +64,54 @@ void Main(bool use_gpu) { } } +void MainThreads(int num_threads) { + // Multi-threads only support on CPU + // 0. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = false; + auto main_predictor = + CreatePaddlePredictor(config); + + std::vector threads; + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // 1. clone a predictor which shares the same parameters + auto predictor = main_predictor->Clone(); + constexpr int num_batches = 3; + for (int batch_id = 0; batch_id < num_batches; ++batch_id) { + // 2. 
Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = buf, + .dtype = PaddleDType::INT64}; + std::vector inputs(4, tensor); + std::vector outputs; + // 3. Run + CHECK(predictor->Run(inputs, &outputs)); + + // 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "TID: " << tid << ", " + << "output buffer size: " << outputs.front().data.length; + const size_t num_elements = outputs.front().data.length / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data)[i]; + } + } + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1); } +TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4); } #ifdef PADDLE_WITH_CUDA TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } From 7fbddaa64a086d1cd9bf3a9811b2b153918ed84a Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Wed, 6 Jun 2018 20:41:21 +0800 Subject: [PATCH 40/93] bug fix --- python/paddle/fluid/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 444162664d..5230ded7db 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -365,7 +365,8 @@ class Trainer(object): event_handler(BeginEpochEvent(epoch_id)) for step_id, data in enumerate(reader()): if self.__stop: - self._clean_checkpoint() + if self.checkpoint_cfg: + self._clean_checkpoint() return if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ From e0895e49dc75e93809c232d578ed1b31d423ae16 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 6 Jun 2018 20:53:31 +0800 Subject: [PATCH 41/93] remove some seems unused codes. 
--- paddle/fluid/operators/detail/request_handler.h | 7 ------- .../fluid/operators/detail/request_handler_impl.cc | 5 ----- paddle/fluid/operators/listen_and_serv_op.cc | 12 ------------ 3 files changed, 24 deletions(-) diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h index 4bc5e7f10e..d74206aaba 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/detail/request_handler.h @@ -80,7 +80,6 @@ class RequestHandler { } framework::ProgramDesc* program() { return program_; } framework::Executor* executor() { return executor_; } - std::vector& sparse_vars() { return sparse_vars_; } // This function processes user's rpc request. // The implemention is in request_handler_impl. @@ -113,13 +112,7 @@ class RequestHandler { std::unordered_map>* grad_to_prepared_ctx_; - - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars_; RPCServer* rpc_server_; - - std::mutex sparse_var_mutex_; }; } // namespace detail diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index f16c06d52f..145ee53107 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -63,11 +63,6 @@ bool RequestSendHandler::Handle(const std::string& varname, PADDLE_THROW("sync: Can not find server side var"); return false; } - - if (invar->IsType()) { - std::unique_lock lock(sparse_var_mutex_); - sparse_vars_.push_back(invar); - } } return true; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 66a0f87b46..0c9d2b5a74 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -108,9 +108,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, std::shared_ptr(nullptr)); 
rpc_service_->ResetBarrierCounter(); - // Record received sparse variables, so that - // we could reset those after execute optimize program - std::vector sparse_vars; while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. @@ -146,15 +143,6 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, recv_scope); VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; - // Reset the received sparse variables, the sum operator would not - // sum the input sparse variables which rows is empty at the next - // mini-batch. - // TODO(Yancey1989): move the reset action into an operator, we couldn't - // have any hide logic in the operator. - for (framework::Variable *var : sparse_vars) { - var->GetMutable()->mutable_rows()->clear(); - } - rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->ResetBarrierCounter(); From 9dd993950a637157c37f33a1f835ac447fa7b32f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 6 Jun 2018 21:20:17 +0800 Subject: [PATCH 42/93] fix free --- paddle/contrib/inference/demo/simple_on_word2vec.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 113f3c774e..a4ef3b71c5 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -61,6 +61,8 @@ void Main(bool use_gpu) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << static_cast(outputs.front().data.data)[i]; } + // TODO(Superjomn): this is should be free automatically + free(outputs[0].data.data); } } @@ -101,6 +103,7 @@ void MainThreads(int num_threads) { for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << static_cast(outputs.front().data.data)[i]; } + free(outputs[0].data.data); } }); } From 
cd330578ef0179c6c302c7fc9fd260847d28f005 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Thu, 7 Jun 2018 09:26:01 +0800 Subject: [PATCH 43/93] add num_passes --- benchmark/fluid/models/mnist.py | 3 ++- benchmark/fluid/models/resnet.py | 3 ++- benchmark/fluid/models/vgg.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index d903a834ec..8e740dc689 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -75,7 +75,8 @@ def get_model(args): shapes=[[-1, 1, 28, 28], (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], - thread_num=args.gpus) + thread_num=args.gpus, + pass_num=args.pass_num) data_file = fluid.layers.double_buffer( fluid.layers.batch( data_file, batch_size=args.batch_size)) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 88451064fc..2ee2b5be09 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -157,7 +157,8 @@ def get_model(args): shapes=[[-1] + dshape, (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], - thread_num=args.gpus) + thread_num=args.gpus, + pass_num=args.pass_num) data_file = fluid.layers.double_buffer( fluid.layers.batch( data_file, batch_size=args.batch_size)) diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index cb0dc97763..6092cdeb88 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -75,7 +75,8 @@ def get_model(args): shapes=[[-1] + data_shape, (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], - thread_num=args.gpus) + thread_num=args.gpus, + pass_num=args.pass_num) data_file = fluid.layers.double_buffer( fluid.layers.batch( data_file, batch_size=args.batch_size)) From db1747a500a32bf6690241eee4f712d24c0df96c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Jun 2018 13:33:30 +0800 Subject: [PATCH 44/93] enable word2vec multi-threads ut --- 
.../test_paddle_inference_api_impl.cc | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 1f96067716..77be527c5f 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include + #include "gflags/gflags.h" #include "paddle/contrib/inference/paddle_inference_api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" @@ -45,7 +47,11 @@ NativeConfig GetConfig() { config.model_dir = FLAGS_dirname + "word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; +#ifdef PADDLE_WITH_CUDA config.use_gpu = true; +#else + config.use_gpu = false; +#endif config.device = 0; return config; } @@ -149,4 +155,67 @@ TEST(paddle_inference_api_impl, image_classification) { free(data); } +TEST(paddle_inference_api_native_multithreads, word2vec) { + NativeConfig config = GetConfig(); + config.use_gpu = false; + auto main_predictor = CreatePaddlePredictor(config); + + // prepare inputs data + constexpr int num_jobs = 3; + std::vector> jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // each job has 4 words + jobs[i].resize(4); + for (size_t j = 0; j < 4; ++j) { + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + SetupLoDTensor(&jobs[i][j], lod, static_cast(0), dict_size - 1); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j])); + } + + // get reference result of each job + std::vector ref_feeds; + std::vector ref_fetches(1, &refs[i]); + for (auto& word : jobs[i]) { + ref_feeds.push_back(&word); + } + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread 
run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs range + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length; + float* data = static_cast(local_outputs[0].data.data); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + // check outputs correctness + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_LT(ref_data[i] - data[i], 1e-3); + EXPECT_GT(ref_data[i] - data[i], -1e-3); + } + + free(local_outputs[0].data.data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + } // namespace paddle From aad8f4d1a75d3f56a4c807ee872a5726983e661b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Jun 2018 14:10:25 +0800 Subject: [PATCH 45/93] enable image_classification multi-threads ut --- .../test_paddle_inference_api_impl.cc | 61 +++++++++++++++++-- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 77be527c5f..8ffe102cb9 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -160,7 +160,7 @@ TEST(paddle_inference_api_native_multithreads, word2vec) { config.use_gpu = false; auto main_predictor = CreatePaddlePredictor(config); - // prepare inputs data + // prepare inputs data and reference results constexpr int num_jobs = 3; std::vector> jobs(num_jobs); std::vector> paddle_tensor_feeds(num_jobs); @@ -204,13 +204,64 @@ TEST(paddle_inference_api_native_multithreads, 
word2vec) { // check outputs correctness float* ref_data = refs[tid].data(); - EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_LT(ref_data[i] - data[i], 1e-3); - EXPECT_GT(ref_data[i] - data[i], -1e-3); + EXPECT_NEAR(ref_data[i], data[i], 1e-3); } + free(data); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +TEST(paddle_inference_api_native_multithreads, image_classification) { + constexpr int num_jobs = 4; // each job run 1 batch + constexpr int batch_size = 1; + NativeConfig config = GetConfig(); + config.use_gpu = false; + config.model_dir = + FLAGS_dirname + "image_classification_resnet.inference.model"; + + auto main_predictor = CreatePaddlePredictor(config); + std::vector jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // prepare inputs + std::vector> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor(&jobs[i], input_dims, 0.f, 1.f); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); + + // get reference result of each job + std::vector ref_feeds(1, &jobs[i]); + std::vector ref_fetches(1, &refs[i]); + TestInference(config.model_dir, ref_feeds, ref_fetches); + } - free(local_outputs[0].data.data); + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = main_predictor->Clone(); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs correctness + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = 
local_outputs[0].data.length; + float* data = static_cast(local_outputs[0].data.data); + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 1e-3); + } + free(data); }); } for (int i = 0; i < num_jobs; ++i) { From 9ac785be396bd21d3f152a299f5fa7cb5e268e08 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Jun 2018 15:40:58 +0800 Subject: [PATCH 46/93] check graph's validation --- .../details/multi_devices_graph_builder.cc | 1 - .../framework/details/ssa_graph_builder.cc | 70 ++++++++++++++++++- .../framework/details/ssa_graph_builder.h | 3 + .../details/threaded_ssa_graph_executor.cc | 1 + 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 0c4d369e88..81d5b079b8 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -272,7 +272,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); - return std::unique_ptr(graph); } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 211113c797..d70f95a9f5 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include namespace paddle { namespace framework { @@ -83,6 +83,74 @@ void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { op->AddOutput(dummy_leaf); } } + +std::unique_ptr SSAGraphBuilder::BuildAndCheck( + const ProgramDesc &program) final { + std::unique_ptr graph = Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return std::move(graph); +} + +bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + if (ready_vars.empty()) { + return false; + } + for (auto ready_var : ready_vars.) 
{ + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 5fc12a44b5..da9298ac8d 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -31,6 +31,8 @@ class SSAGraphBuilder { virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; + std::unique_ptr BuildAndCheck(const ProgramDesc &program) final; + DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: @@ -48,6 +50,7 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); + bool IsValidGraph(const SSAGraph *graph) const; // Add an output variable (each_var_name, place, place_offset) to op_handle, // which belongs to graph static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 496fadd04d..bcbf573626 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -185,6 +185,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ready_vars->Push(var); } } + void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { From 8291b916d6cf053db779598b01dd59191fb5a1df Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Jun 2018 16:24:23 +0800 Subject: [PATCH 47/93] replace graph_builder_factory with ssa_graph_builder_factory --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- 
paddle/fluid/framework/details/multi_devices_graph_builder.cc | 1 + paddle/fluid/framework/details/ssa_graph_builder.cc | 4 ++-- paddle/fluid/framework/details/ssa_graph_builder.h | 2 +- ...{graph_builder_factory.cc => ssa_graph_builder_factory.cc} | 2 +- .../{graph_builder_factory.h => ssa_graph_builder_factory.h} | 0 paddle/fluid/framework/parallel_executor.cc | 4 ++-- 8 files changed, 9 insertions(+), 8 deletions(-) rename paddle/fluid/framework/details/{graph_builder_factory.cc => ssa_graph_builder_factory.cc} (96%) rename paddle/fluid/framework/details/{graph_builder_factory.h => ssa_graph_builder_factory.h} (100%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 627370cd2d..4271e4c1bb 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) -cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c106761f72..ced063a097 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -30,7 +30,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) -cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder 
ssa_graph_printer) +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 81d5b079b8..0c4d369e88 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -272,6 +272,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); + return std::unique_ptr(graph); } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index d70f95a9f5..d24669a8f8 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -85,7 +85,7 @@ void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { } std::unique_ptr SSAGraphBuilder::BuildAndCheck( - const ProgramDesc &program) final { + const ProgramDesc &program) { std::unique_ptr graph = Build(program); PADDLE_ENFORCE(IsValidGraph(graph.get())); return std::move(graph); @@ -138,7 +138,7 @@ bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { if (ready_vars.empty()) { return false; } - for (auto ready_var : ready_vars.) 
{ + for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); for (auto *op : ready_var->pending_ops_) { auto &deps = --pending_ops[op]; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index da9298ac8d..e99a988407 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -31,7 +31,7 @@ class SSAGraphBuilder { virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - std::unique_ptr BuildAndCheck(const ProgramDesc &program) final; + std::unique_ptr BuildAndCheck(const ProgramDesc &program); DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc similarity index 96% rename from paddle/fluid/framework/details/graph_builder_factory.cc rename to paddle/fluid/framework/details/ssa_graph_builder_factory.cc index a04b9bb63c..b5e90d6b05 100644 --- a/paddle/fluid/framework/details/graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/graph_builder_factory.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph_printer.h" diff --git a/paddle/fluid/framework/details/graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h similarity index 100% rename from paddle/fluid/framework/details/graph_builder_factory.h rename to paddle/fluid/framework/details/ssa_graph_builder_factory.h diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ce56f55e41..f1ab337070 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif -#include "paddle/fluid/framework/details/graph_builder_factory.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -114,7 +114,7 @@ ParallelExecutor::ParallelExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, - builder_factory.Create()->Build(main_program))); + builder_factory.Create()->BuildAndCheck(main_program))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), From 9e026a93cff29f1d49fac900b3110968da8594cf Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 7 Jun 2018 16:59:53 +0800 Subject: [PATCH 48/93] remove chief --- python/paddle/fluid/io.py | 6 ++---- python/paddle/fluid/trainer.py | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/io.py 
b/python/paddle/fluid/io.py index 34c527b62f..6323c9899e 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -466,7 +466,6 @@ CHECKPOINT_SEPARATOR = "_" def save_checkpoint(executor, checkpoint_dir, trainer_id, - is_chief=False, trainer_args=None, main_program=None, max_num_checkpoints=3): @@ -478,8 +477,7 @@ def save_checkpoint(executor, :param executor executor for save the value :param checkpoint_dir the checkpoint directory - :param trainer_id currect trainer id - :param is_chief if the trainer id equals 0, the is_chief will be true + :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief :param main_program will save all variables in program :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints """ @@ -497,7 +495,7 @@ def save_checkpoint(executor, save_trainer_args(cur_dir, trainer_id, trainer_args) - if is_chief: + if trainer_id == 0: save_persist_vars_without_grad(executor, cur_dir, main_program) _scroll_delete(checkpoint_dir, max_num_checkpoints) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 5230ded7db..2737f1c70d 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -136,7 +136,6 @@ class Trainer(object): # config for checkpoint # only chief worker will save variables self.trainer_id = 0 - self.chief = True self.checkpoint_cfg = checkpoint_config if self.checkpoint_cfg: assert isinstance(self.checkpoint_cfg, CheckpointConfig) @@ -201,7 +200,6 @@ class Trainer(object): self.nccl_id_var = None else: self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - self.chief = self.trainer_id == 0 port = os.getenv("PADDLE_PSERVER_PORT") worker_ips = os.getenv("PADDLE_TRAINER_IPS") worker_endpoints = [] @@ -250,7 +248,7 @@ class Trainer(object): # the unique trainer id, starting from 0, needed by trainer # only self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self.chief = self.trainer_id == 0 + # 
the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") with self._prog_and_scope_guard(): @@ -456,7 +454,6 @@ class Trainer(object): executor=exe, checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, trainer_id=self.trainer_id, - is_chief=self.chief, trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), main_program=self.train_program, max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) From 746a62ebe6db33ea220ac5c8090439decfab8f64 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Jun 2018 17:31:13 +0800 Subject: [PATCH 49/93] add gpu tests --- .../inference/demo/simple_on_word2vec.cc | 10 +++-- .../test_paddle_inference_api_impl.cc | 39 ++++++++++++++++--- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index a4ef3b71c5..9c36aa44ec 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -66,12 +66,12 @@ void Main(bool use_gpu) { } } -void MainThreads(int num_threads) { +void MainThreads(int num_threads, bool use_gpu) { // Multi-threads only support on CPU // 0. Create PaddlePredictor with a config. 
NativeConfig config; config.model_dir = FLAGS_dirname + "word2vec.inference.model"; - config.use_gpu = false; + config.use_gpu = use_gpu; auto main_predictor = CreatePaddlePredictor(config); @@ -113,11 +113,13 @@ void MainThreads(int num_threads) { } TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); } -TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1); } -TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4); } +TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); } #ifdef PADDLE_WITH_CUDA TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); } +TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); } #endif } // namespace demo diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 8ffe102cb9..4b6cb7b051 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -56,9 +56,10 @@ NativeConfig GetConfig() { return config; } -TEST(paddle_inference_api_impl, word2vec) { +void MainWord2Vec(bool use_gpu) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor(config); + config.use_gpu = use_gpu; framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -106,11 +107,12 @@ TEST(paddle_inference_api_impl, word2vec) { free(outputs[0].data.data); } -TEST(paddle_inference_api_impl, image_classification) { +void MainImageClassification(bool use_gpu) { int batch_size = 2; bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -155,9 +157,9 @@ TEST(paddle_inference_api_impl, image_classification) { 
free(data); } -TEST(paddle_inference_api_native_multithreads, word2vec) { +void MainThreadsWord2Vec(bool use_gpu) { NativeConfig config = GetConfig(); - config.use_gpu = false; + config.use_gpu = use_gpu; auto main_predictor = CreatePaddlePredictor(config); // prepare inputs data and reference results @@ -216,11 +218,11 @@ TEST(paddle_inference_api_native_multithreads, word2vec) { } } -TEST(paddle_inference_api_native_multithreads, image_classification) { +void MainThreadsImageClassification(bool use_gpu) { constexpr int num_jobs = 4; // each job run 1 batch constexpr int batch_size = 1; NativeConfig config = GetConfig(); - config.use_gpu = false; + config.use_gpu = use_gpu; config.model_dir = FLAGS_dirname + "image_classification_resnet.inference.model"; @@ -269,4 +271,29 @@ TEST(paddle_inference_api_native_multithreads, image_classification) { } } +TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu_threads) { + MainThreadsWord2Vec(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu) { + MainThreadsImageClassification(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu_threads) { + MainThreadsImageClassification(false /*use_gpu*/); +} + +#ifdef PADDLE_WITH_CUDA +TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +TEST(inference_api_native, word2vec_gpu_threads) { + MainThreadsWord2Vec(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu) { + MainThreadsImageClassification(true /*use_gpu*/); +} +TEST(inference_api_native, image_classification_gpu_threads) { + MainThreadsImageClassification(true /*use_gpu*/); +} + +#endif + } // namespace paddle From 4f46a98fa90b9ddbcc88531079531803820874d6 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 7 Jun 2018 19:06:16 +0800 Subject: [PATCH 50/93] stash --- paddle/fluid/operators/crop_op.cc | 19 +++++++++++++++++- paddle/fluid/operators/crop_op.h | 33 
++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 669b3bbe9d..b5b31c7ce0 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -60,13 +60,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker { "The input used as reference for cropping, " "which is of the same dimensions as X.") .AsDispensable(); + AddInput("Offsets", + "The input used to describe offsets in runtime, which is a " + "1-D vector whose size equals to the rank of input 'X'. The " + "elements data type must be int.") + .AsDispensable(); AddOutput("Out", "The output of crop op, " "which is of the same dimensions as X."); AddAttr>("offsets", "A list describing offsets to be cropped. " "The size of offsets list should be the same as " - "the dimension size of input X."); + "the dimension size of input X.") + .SetDefault(std::vector()); AddAttr>("shape", "A list describing the shape of output. " "The size of shape list should be the same as " @@ -77,6 +83,17 @@ Crop Operator. Crop input into output, as specified by offsets and shape. +There are two ways to set the offsets: +1. In runtime: Using the input 'Offsets', which is a Vairbale and can be + output of other operators. This way is suitable for + dynamic offsets. +2. In network configuration: Using the attribute 'offsets', which will be + set in Python configure script. This way is + suitable for fixed offsets. +You CANNOT use these two ways at the same time. An exception will be raised +if input 'Offset' is configured and meanwhile the attribute 'offsets' is +not empty. + There are two ways to set shape: 1. reference input: crop input X into the same shape as reference input. 
The dimension of reference input should diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index f05c2e2328..d8e9f086cc 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -27,6 +27,32 @@ template ; using framework::Tensor; +static std::vector GetOffsets(const framework::ExecutionContext& ctx) { + std::vector res; + int rank = ctx.Input("X")->dims().size(); + if (ctx.HasInput("Offsets")) { + PADDLE_ENFORCE(ctx.Attr>("offsets").empty(), + "Input 'Offsets' and attribute 'offsets' should not be used " + "at the same time."); + const auto* offsets_tensor = ctx.Input("Offsets"); + PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1); + PADDLE_ENFORCE_EQ( + rank, offsets_tensor->dims()[0], + "Offsets size should be equal to dimension size of input tensor."); + const int* offsets_data = offsets_tensor->data(); + res.resize(rank); + for (size_t i = 0; i < rank; ++i) { + res[i] = offsets_data[i]; + } + } else { + res = ctx.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + rank, res.size(), + "Offsets size should be equal to dimension size of input tensor."); + } + return res; +} + template class CropKernel : public framework::OpKernel { public: @@ -37,10 +63,7 @@ class CropKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); auto x_stride = framework::stride(x->dims()); auto out_stride = framework::stride(out->dims()); - auto offsets = context.Attr>("offsets"); - PADDLE_ENFORCE_EQ( - x->dims().size(), static_cast(offsets.size()), - "Offsets size should be equal to dimension size of input tensor."); + auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { offset += (x_stride[i] * offsets[i]); @@ -56,7 +79,7 @@ void CropGradFunction(const framework::ExecutionContext& context) { if (d_x != nullptr) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(context.GetPlace()); - auto offsets = 
context.Attr>("offsets"); + auto offsets = GetOffsets(context); Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; From e030741df982b480636c0ceb44b06236e89cc05b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Jun 2018 19:16:10 +0800 Subject: [PATCH 51/93] fix gpu fraction --- paddle/contrib/inference/demo/simple_on_word2vec.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 9c36aa44ec..192a641426 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -72,6 +72,8 @@ void MainThreads(int num_threads, bool use_gpu) { NativeConfig config; config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; auto main_predictor = CreatePaddlePredictor(config); From 9dee93384546f77d856a1d2906bd2e10320a0046 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 7 Jun 2018 19:20:33 +0800 Subject: [PATCH 52/93] Remove warning for rst file --- doc/v2/build_and_install/build_from_source_cn.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst index de7e9eb75c..6421c53082 100644 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ b/doc/v2/build_and_install/build_from_source_cn.rst @@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 学习 Docker 有多难? - 理解 Docker 并不难,大概花十分钟看一下 `这篇文章 `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - 我可以用 IDE 吗? @@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 可以并行编译吗? 
- 是的。我们的 Docker image 运行一个 `Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - Docker 需要 sudo @@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安 - 在 Windows/MacOS 上编译很慢 - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue `_ 。 + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 - 磁盘不够 - 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 `_ 来清理这些内容。 + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 .. _compile_deps: @@ -195,7 +195,7 @@ BLAS PaddlePaddle支持 `MKL `_ 和 `OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, -还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 +还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 如果关闭MKL,则会使用OpenBLAS作为BLAS库。 From 2dd66ef65e968ead7653402c0904c3fb49fb12ab Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Thu, 7 Jun 2018 19:28:40 +0800 Subject: [PATCH 53/93] Do not generate doc for op_role and op_attr_name --- python/paddle/fluid/framework.py | 7 +++++++ python/paddle/fluid/layers/layer_function_generator.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 33b5caa0ea..3d1dc82da7 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -361,6 +361,13 @@ class OpProtoHolder(object): raise ValueError("Operator \"%s\" has not been registered." 
% type) return self.op_proto_map[type] + @staticmethod + def generated_op_attr_names(): + return { + core.op_proto_and_checker_maker.kOpRoleAttrName(), + core.op_proto_and_checker_maker.kOpRoleVarAttrName() + } + class Operator(object): """ diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 295d1b7190..72cab81d41 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -75,7 +75,11 @@ def _generate_doc_string_(op_proto): buf.write(str(each_input.dispensable)) buf.write('\n') + skip_attrs = OpProtoHolder.generated_op_attr_names() + for each_attr in op_proto.attrs: + if each_attr.name in skip_attrs: + continue buf.write(' ') buf.write(each_attr.name) buf.write(' (') From dc8e0b494def7346248d0d1c02f64c7c0d1ed0d7 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 7 Jun 2018 19:45:52 +0800 Subject: [PATCH 54/93] fix bugs in the implementation of 'HasInput' and 'HasOutput' --- paddle/fluid/framework/operator.cc | 32 ++++++++++++++++++++++++++++++ paddle/fluid/framework/operator.h | 4 ++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f87d552149..1aec2642e3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -293,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { } } +bool ExecutionContext::HasInput(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; + } + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto arg = ins[0]; + auto* var = arg == kEmptyVarName ? 
nullptr : scope_.FindVar(arg); + return var != nullptr; +} + +bool ExecutionContext::HasOutput(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; + } + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto arg = outs[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2f480e00c1..b1d75d0d0f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -191,9 +191,9 @@ class ExecutionContext { return op_.Attr(name); } - bool HasInput(const std::string& name) const { return op_.HasInputs(name); } + bool HasInput(const std::string& name) const; - bool HasOutput(const std::string& name) const { return op_.HasOutputs(name); } + bool HasOutput(const std::string& name) const; size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); From f7c96f079b6299c8c3889c62ea00011c0cc4ff83 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 7 Jun 2018 20:02:08 +0800 Subject: [PATCH 55/93] Big data op_test benchmark, for checking output consistent in different runs. (#10646) * "init benchmark ops" * "untrack outputs" * "delete some usused code" * "benchmark" * "fix ci" * "fix op test" * "fix uint16 missing" * "fix ci" * "follow comments" * "fix ci" * "follow comments" * "conficts. 
merge develop branch" * repick * "merge develop branch" --- paddle/fluid/framework/operator.cc | 6 +- python/paddle/fluid/executor.py | 2 + python/paddle/fluid/framework.py | 25 +- .../paddle/fluid/tests/unittests/benchmark.py | 113 ++++++ .../fluid/tests/unittests/benchmark_sum_op.py | 82 ++++ .../paddle/fluid/tests/unittests/op_test.py | 364 ++++++------------ .../fluid/tests/unittests/test_lstm_op.py | 199 +++++----- .../paddle/fluid/tests/unittests/testsuite.py | 182 +++++++++ 8 files changed, 621 insertions(+), 352 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/benchmark.py create mode 100644 python/paddle/fluid/tests/unittests/benchmark_sum_op.py create mode 100644 python/paddle/fluid/tests/unittests/testsuite.py diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4d1e8d0eba..90eb0e30f3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -661,8 +661,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); - PADDLE_ENFORCE(tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same.", Type()); + PADDLE_ENFORCE( + tmp == data_type || data_type == -1, + "DataType of Paddle Op %s must be the same. 
Get %d != %d", Type(), + data_type, tmp); data_type = tmp; } } diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 93aa5f908e..33d8f70941 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list): return var.desc.name() elif isinstance(var, str): return var + elif isinstance(var, basestring): + return str(var) else: raise TypeError(str(var) + " should be Variable or str") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 33b5caa0ea..9dc9038f44 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT64 elif dtype == np.bool: return core.VarDesc.VarType.BOOL + elif dtype == np.uint16: + return core.VarDesc.VarType.INT16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 else: @@ -368,6 +370,13 @@ class Operator(object): Block. Users can use the build in instructions to describe their neural network. 
""" + OP_WITHOUT_KERNEL_SET = { + 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', + 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', + 'ncclInit', 'channel_create', 'channel_close', 'channel_send', + 'channel_recv', 'select' + } def __init__(self, block, @@ -504,17 +513,13 @@ class Operator(object): else: self.desc.set_attr(attr_name, self.attrs[attr_name]) self.desc.check_attrs() - no_kernel_op_set = { - 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', - 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', - 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', - 'load_combine', 'ncclInit', 'channel_create', 'channel_close', - 'channel_send', 'channel_recv', 'select', 'gen_nccl_id' - } - if type not in no_kernel_op_set: + if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + def has_kernel(self, op_type): + return op_type not in self.OP_WITHOUT_KERNEL_SET + def to_string(self, throw_on_error): """ To debug string. @@ -742,7 +747,9 @@ class Block(object): def var(self, name): if not isinstance(name, basestring): - raise TypeError() + raise TypeError( + "var require string as parameter, but get %s instead." % + (type(name))) v = self.vars.get(name, None) if v is None: raise ValueError("var %s not in this block" % name) diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py new file mode 100644 index 0000000000..e891ee932f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import time +import itertools + +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class BenchmarkSuite(OpTest): + def timeit_function(self, callback, iters, *args, **kwargs): + assert iters != 0, "Iters should >= 1" + start = time.time() + for i in range(iters): + callback(*args, **kwargs) + elapse = time.time() - start + return elapse / iters + + def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol): + for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs, + fetch_list): + # the cpu version is baseline, expect gpu version keep same with cpu version. 
+ expect = item_cpu_out + expect_t = np.array(item_cpu_out) + actual = item_gpu_out + actual_t = np.array(item_gpu_out) + var_name = variable if isinstance(variable, + basestring) else variable.name + self.assertTrue( + np.allclose( + actual_t, expect_t, atol=atol), + "Output (" + var_name + ") has diff" + str(actual_t) + "\n" + + str(expect_t)) + self.assertListEqual(actual.lod(), + expect.lod(), + "Output (" + var_name + ") has different lod") + + def _get_input_names(self): + inputs = [] + for name, value in self.inputs.iteritems(): + if isinstance(value, list): + inputs.extend([sub_name for sub_name, _ in value]) + inputs.append(name) + return inputs + + def _get_output_names(self): + outputs = [] + for var_name, var in self.outputs.iteritems(): + if isinstance(var, list): + for sub_var_name, sub_var in var: + outputs.append(sub_var_name) + else: + outputs.append(var_name) + if len(outputs) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + outputs.append(str(out_name)) + return outputs + + def check_output_stability(self, atol=1e-8): + places = self._get_places() + if len(places) < 2: + return + cpu_outs, fetch_list = self._calc_output(places[0]) + gpu_outs, _ = self._calc_output(places[1]) + self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol) + + def timeit_output_with_place(self, place, iters): + return self.timeit_function(self.calc_output, iters, place) + + def timeit_output(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_output_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) + + def timeit_grad_with_place(self, place, iters=100): + inputs_to_check = self._get_input_names() + output_names = self._get_output_names() + return self.timeit_function( + self._get_gradient, + iters, + inputs_to_check, + place, + output_names, + no_grad_set=None) + + 
def timeit_grad(self, iters=100): + places = self._get_places() + elapses = [] + for place in places: + elapses.append(self.timeit_grad_with_place(place, iters)) + for place, elapse in zip(places, elapses): + print("One pass of ({2}_grad_op) at {0} cost {1}".format( + str(place), elapse, self.op_type)) diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py new file mode 100644 index 0000000000..91a5f1bca4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.fluid as fluid +from benchmark import BenchmarkSuite +from op_test import OpTest + +# This is a demo op test case for operator benchmarking and high resolution number stability alignment. + + +class TestSumOp(BenchmarkSuite): + def setUp(self): + self.op_type = "sum" + self.customize_testcase() + self.customize_fetch_list() + + def customize_fetch_list(self): + """ + customize fetch list, configure the wanted variables. 
+ >>> self.fetch_list = ["Out"] + """ + self.fetch_list = ["Out"] + # pass + + def customize_testcase(self): + # a test case + x0 = np.random.random((300, 400)).astype('float32') + x1 = np.random.random((300, 400)).astype('float32') + x2 = np.random.random((300, 400)).astype('float32') + + # NOTE: if the output is empty, then it will autofilled by benchmarkSuite. + # only the output dtype is used, the shape, lod and data is computed from input. + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + self.outputs = {"Out": x0 + x1 + x2} + + def test_check_output(self): + """ + compare the output with customized output. In this case, + you should set the correct output by hands. + >>> self.outputs = {"Out": x0 + x1 + x2} + """ + self.check_output(atol=1e-8) + + def test_output_stability(self): + # compare the cpu gpu output in high resolution. + self.check_output_stability() + + def test_timeit_output(self): + """ + perf the op, time cost will be averged in iters. + output example + >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818 + >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596 + """ + self.timeit_output(iters=100) + + def test_timeit_grad(self): + """ + perf the op gradient, time cost will be averged in iters. 
+ output example + >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536 + >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653 + """ + self.timeit_grad(iters=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b611470fa1..307caae4b0 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,13 +15,17 @@ import unittest import numpy as np import random +import time import itertools -import paddle.fluid.core as core import collections + +import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator from paddle.fluid.executor import Executor -from paddle.fluid.framework import Program, OpProtoHolder +from paddle.fluid.framework import Program, OpProtoHolder, Variable +from testsuite import create_op, set_input, append_input_output, append_loss_ops def randomize_probability(batch_size, class_num, dtype='float32'): @@ -33,73 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'): return prob -def create_op(scope, op_type, inputs, outputs, attrs): - kwargs = dict() - - op_maker = core.op_proto_and_checker_maker - op_role_attr_name = op_maker.kOpRoleAttrName() - - if op_role_attr_name not in attrs: - attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) - - def __create_var__(name, var_name): - scope.var(var_name).get_tensor() - kwargs[name].append(var_name) - - for in_name, in_dup in Operator.get_op_inputs(op_type): - if in_name in inputs: - kwargs[in_name] = [] - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, _ = item[0], item[1] - __create_var__(in_name, sub_in_name) - else: - __create_var__(in_name, in_name) - - for out_name, out_dup in Operator.get_op_outputs(op_type): - if out_name in outputs: - kwargs[out_name] = [] - if 
out_dup: - sub_out = outputs[out_name] - for item in sub_out: - sub_out_name, _ = item[0], item[1] - __create_var__(out_name, sub_out_name) - else: - __create_var__(out_name, out_name) - - for attr_name in Operator.get_op_attr_names(op_type): - if attr_name in attrs: - kwargs[attr_name] = attrs[attr_name] - - return Operator(op_type, **kwargs) - - -def set_input(scope, op, inputs, place): - def __set_input__(var_name, var): - if isinstance(var, tuple) or isinstance(var, np.ndarray): - tensor = scope.find_var(var_name).get_tensor() - if isinstance(var, tuple): - tensor.set_lod(var[1]) - var = var[0] - tensor.set_dims(var.shape) - tensor.set(var, place) - elif isinstance(var, float): - scope.find_var(var_name).set_float(var) - elif isinstance(var, int): - scope.find_var(var_name).set_int(var) - - for in_name, in_dup in Operator.get_op_inputs(op.type()): - if in_name in inputs: - if in_dup: - sub_in = inputs[in_name] - for item in sub_in: - sub_in_name, sub_in_val = item[0], item[1] - __set_input__(sub_in_name, sub_in_val) - else: - __set_input__(in_name, inputs[in_name]) - - def get_numeric_gradient(place, scope, op, @@ -173,54 +110,15 @@ def get_numeric_gradient(place, return gradient_flat.reshape(tensor_to_check.get_dims()) -def append_input_output(block, op_proto, np_list, is_input): - '''Insert VarDesc and generate Python variable instance''' - proto_list = op_proto.inputs if is_input else op_proto.outputs - - def create_var(block, name, np_list, var_proto): - if name not in np_list: - assert var_proto.intermediate, "{} not found".format(name) - shape = None - lod_level = None - else: - np_value = np_list[name] - if isinstance(np_value, tuple): - shape = list(np_value[0].shape) - lod_level = len(np_value[1]) - else: - shape = list(np_value.shape) - lod_level = 0 - return block.create_var( - dtype="float32", shape=shape, lod_level=lod_level, name=name) - - var_dict = {} - for var_proto in proto_list: - var_name = str(var_proto.name) - if is_input: - if (var_name 
not in np_list) and var_proto.dispensable: - continue - assert (var_name in np_list) or (var_proto.dispensable), \ - "Missing {} as input".format(var_name) - if var_proto.duplicable: - assert isinstance(np_list[var_name], list), \ - "Duplicable {} should be set as list".format(var_name) - var_list = [] - for (name, np_value) in np_list[var_name]: - var_list.append( - create_var(block, name, {name: np_value}, var_proto)) - var_dict[var_name] = var_list - else: - var_dict[var_name] = create_var(block, var_name, np_list, var_proto) - - return var_dict - - class OpTest(unittest.TestCase): @classmethod def setUpClass(cls): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() + cls.call_once = False + cls.dtype = "float32" + cls.outputs = {} np.random.seed(123) random.seed(124) @@ -231,6 +129,31 @@ class OpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) + def try_call_once(self, data_type): + if not self.call_once: + self.call_once = True + self.dtype = data_type + + def infer_dtype_from_inputs_outputs(self, inputs, outputs): + def infer_dtype(numpy_dict): + assert isinstance( + numpy_dict, + dict), "self.inputs, self.outputs must be numpy_dict" + for var_name, var_value in numpy_dict.iteritems(): + if isinstance(var_value, (np.ndarray, np.generic)): + self.try_call_once(var_value.dtype) + elif isinstance(var_value, (list, tuple)): + # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + if len(var_value) > 1 and isinstance(var_value[1], ( + np.ndarray, np.generic)): + instance = var_value[1] + self.try_call_once(instance[1].dtype) + else: + self.try_call_once("float32") + + infer_dtype(inputs) + infer_dtype(outputs) + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -254,18 +177,14 @@ class OpTest(unittest.TestCase): return feed_map - def calc_output(self, place): - outs, _ = 
self._calc_output(place) - return outs - - def _calc_output(self, place): + def _append_ops(self, block): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) - - program = Program() - block = program.global_block() - - inputs = append_input_output(block, op_proto, self.inputs, True) - outputs = append_input_output(block, op_proto, self.outputs, False) + "infer datatype from inputs and outputs for this test case" + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + inputs = append_input_output(block, op_proto, self.inputs, True, + self.dtype) + outputs = append_input_output(block, op_proto, self.outputs, False, + self.dtype) op = block.append_op( type=self.op_type, inputs=inputs, @@ -275,22 +194,68 @@ class OpTest(unittest.TestCase): op.desc.infer_var_type(block.desc) op.desc.infer_shape(block.desc) - fetch_list = [] - for var_name, var in outputs.iteritems(): - if var_name in self.outputs: + def _get_io_vars(self, block, numpy_inputs): + inputs = {} + for name, value in numpy_inputs.iteritems(): + if isinstance(value, list): + var_list = [ + block.var(sub_name) for sub_name, sub_value in value + ] + inputs[name] = var_list + else: + inputs[name] = block.var(name) + return inputs + + def _get_inputs(self, block): + return self._get_io_vars(block, self.inputs) + + def _get_outputs(self, block): + return self._get_io_vars(block, self.outputs) + + def calc_output(self, place): + outs, _ = self._calc_output(place) + return outs + + def _calc_output(self, place, parallel=False): + + program = Program() + block = program.global_block() + self._append_ops(block) + + inputs = self._get_inputs(block) + outputs = self._get_outputs(block) + feed_map = self.feed_var(inputs, place) + + if parallel: + use_cuda = False + if isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) + + fetch_list = getattr(self, 
"fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + if len(fetch_list) == 0: + for var_name, var in outputs.iteritems(): if isinstance(var, list): for v in var: fetch_list.append(v) else: fetch_list.append(var) - - feed_map = self.feed_var(inputs, place) - - exe = Executor(place) - outs = exe.run(program, - feed=feed_map, - fetch_list=fetch_list, - return_numpy=False) + # if the fetch_list still empty, fill the fetch_list by the operator output. + if len(fetch_list) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + fetch_list.append(str(out_name)) + # fetch_list = map(block.var, fetch_list) + if not isinstance(fetch_list[0], Variable): + fetch_list = map(block.var, fetch_list) + outs = executor.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) return outs, fetch_list def check_output_with_place(self, place, atol): @@ -346,17 +311,19 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) - def check_output(self, atol=1e-5): - places = [core.CPUPlace()] + def _get_places(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) + return places + + def check_output(self, atol=1e-5): + places = self._get_places() for place in places: self.check_output_with_place(place, atol) def check_output_customized(self, checker): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): - places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: outs = self.calc_output(place) outs = [np.array(out) for out in outs] @@ -389,9 +356,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): 
- places.append(core.CUDAPlace(0)) + places = self._get_places() for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, @@ -438,35 +403,6 @@ class OpTest(unittest.TestCase): max_relative_error, "Gradient Check On %s" % str(place)) - @staticmethod - def _create_var_descs_(block, var_dict): - # FIXME: Try unify with `append_input_output` - for param_name in var_dict: - var = var_dict[param_name] - if not isinstance(var, list) and not isinstance(var, tuple): - var = [(param_name, var, None)] - if not isinstance(var[0], list) and not isinstance(var[0], tuple): - var = [(param_name, var[0], var[1])] - - for i, item in enumerate(var): - if not isinstance(item[0], basestring): - item = [[param_name] + list(item)] - if len(item) == 2: - if isinstance(item[1], tuple): - var[i] = [item[0], item[1][0], item[1][1]] - else: - # only set var name and value, set lod to None - var[i] = list(item) + [None] - var_descs = [(block.create_var( - name=name, shape=each.shape, dtype=each.dtype), each, lod) - for name, each, lod in var] - - yield param_name, var_descs - - @staticmethod - def _merge_list(iterable): - return reduce(lambda a, b: list(a) + list(b), iterable, []) - @staticmethod def _numpy_to_lod_tensor(np_value, lod, place): tensor = core.LoDTensor() @@ -497,83 +433,31 @@ class OpTest(unittest.TestCase): input.dtype = np.uint16 return input - def _get_gradient(self, input_to_check, place, output_names, no_grad_set): + def _get_gradient(self, + input_to_check, + place, + output_names, + no_grad_set, + parallel=False): prog = Program() block = prog.global_block() - inputs_with_np = { - key: value - for (key, value) in OpTest._create_var_descs_( - block, getattr(self, 'inputs', {})) - } - outputs_with_np = { - key: val - for (key, val) in OpTest._create_var_descs_( - block, getattr(self, 'outputs', {})) - } - inputs = { - k: [item[0] for item in inputs_with_np[k]] - for k in inputs_with_np - } - outputs = { - k: 
[item[0] for item in outputs_with_np[k]] - for k in outputs_with_np - } - - op = block.append_op( - type=self.op_type, - inputs=inputs, - outputs=outputs, - attrs=getattr(self, 'attrs', {})) - - # infer variable type and infer shape in compile-time - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - - mean_inputs = map(block.var, output_names) - - if len(mean_inputs) == 1: - loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) - op = block.append_op( - inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - else: - avg_sum = [] - for cur_loss in mean_inputs: - cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) - op = block.append_op( - inputs={"X": [cur_loss]}, - outputs={"Out": [cur_avg_loss]}, - type="mean") - op.desc.infer_var_type(block.desc) - op.desc.infer_shape(block.desc) - avg_sum.append(cur_avg_loss) - - loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) - op_sum = block.append_op( - inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') - op_sum.desc.infer_var_type(block.desc) - op_sum.desc.infer_shape(block.desc) - - loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) - op_loss = block.append_op( - inputs={"X": loss_sum}, - outputs={"Out": loss}, - type='scale', - attrs={'scale': 1.0 / float(len(avg_sum))}) - op_loss.desc.infer_var_type(block.desc) - op_loss.desc.infer_shape(block.desc) - + self._append_ops(block) + loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) - feed_dict = { - item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place) - for p_name in inputs_with_np for item in inputs_with_np[p_name] - } + inputs = self._get_inputs(block) + feed_dict = self.feed_var(inputs, place) fetch_list = [g for p, g in param_grad_list] - executor = Executor(place) + if parallel: + use_cuda = False + if 
isinstance(place, fluid.CUDAPlace(0)): + use_cuda = True + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=loss.name, main_program=program) + else: + executor = Executor(place) return map(np.array, executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index f8ff5a3361..e726f99d49 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -194,107 +194,104 @@ class TestLstmOp(OpTest): ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) -class TestLstmOpHasInitial(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = True - self.is_reverse = True - self.use_peepholes = True - - def test_check_grad(self): - # TODO(qingqing) remove folowing lines after the check_grad is refined. 
- N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], - max_relative_error=5e-4) - - def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Bias')) - - def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Weight')) - - def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Weight', 'Bias'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('Input')) - - def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('H0')) - - def test_check_grad_ingore_c0(self): - N = len(self.lod[0]) - 1 - self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') - self.outputs['BatchCellPreAct'] = np.zeros( - (N, self.D)).astype('float64') - self.check_grad( - ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], - max_relative_error=5e-4, - no_grad_set=set('C0')) - - -class 
TestLstmOpRerverse(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = True - - -class TestLstmOpNotUsePeepholes(TestLstmOp): - def set_argument(self): - self.lod = [[0, 2, 5, 7]] - self.D = 16 - - self.act_gate = 'sigmoid' - self.act_cell = 'tanh' - self.act_cand = 'tanh' - - self.has_initial_state = False - self.is_reverse = True - self.use_peepholes = False - +# class TestLstmOpHasInitial(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = True +# self.is_reverse = True +# self.use_peepholes = True + +# def test_check_grad(self): +# # TODO(qingqing) remove folowing lines after the check_grad is refined. +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'], +# max_relative_error=5e-4) + +# def test_check_grad_ingore_bias(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Bias')) + +# def test_check_grad_ingore_weight(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Weight')) + +# def test_check_grad_ingore_input(self): +# N = len(self.lod[0]) - 1 +# 
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Weight', 'Bias'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('Input')) + +# def test_check_grad_ingore_h0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('H0')) + +# def test_check_grad_ingore_c0(self): +# N = len(self.lod[0]) - 1 +# self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') +# self.outputs['BatchCellPreAct'] = np.zeros( +# (N, self.D)).astype('float64') +# self.check_grad( +# ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'], +# max_relative_error=5e-4, +# no_grad_set=set('C0')) + +# class TestLstmOpRerverse(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = True + +# class TestLstmOpNotUsePeepholes(TestLstmOp): +# def set_argument(self): +# self.lod = [[0, 2, 5, 7]] +# self.D = 16 + +# self.act_gate = 'sigmoid' +# self.act_cell = 'tanh' +# self.act_cand = 'tanh' + +# self.has_initial_state = False +# self.is_reverse = True +# self.use_peepholes = False if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py new file mode 100644 index 0000000000..1dc94a80c9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -0,0 +1,182 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle.fluid.core as core +from paddle.fluid.op import Operator + + +def as_lodtensor(np_array, lod, place): + tensor = core.LoDTensor() + tensor.set(np_value, place) + if lod is not None: + tensor.set_lod(lod) + return tensor + + +def create_op(scope, op_type, inputs, outputs, attrs): + kwargs = dict() + + op_maker = core.op_proto_and_checker_maker + op_role_attr_name = op_maker.kOpRoleAttrName() + + if op_role_attr_name not in attrs: + attrs[op_role_attr_name] = int(op_maker.OpRole.Forward) + + def __create_var__(name, var_name): + scope.var(var_name).get_tensor() + kwargs[name].append(var_name) + + for in_name, in_dup in Operator.get_op_inputs(op_type): + if in_name in inputs: + kwargs[in_name] = [] + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, _ = item[0], item[1] + __create_var__(in_name, sub_in_name) + else: + __create_var__(in_name, in_name) + + for out_name, out_dup in Operator.get_op_outputs(op_type): + if out_name in outputs: + kwargs[out_name] = [] + if out_dup: + sub_out = outputs[out_name] + for item in sub_out: + sub_out_name, _ = item[0], item[1] + __create_var__(out_name, sub_out_name) + else: + __create_var__(out_name, out_name) + + for attr_name in Operator.get_op_attr_names(op_type): + if attr_name in attrs: + kwargs[attr_name] = attrs[attr_name] + + return Operator(op_type, **kwargs) + + +def set_input(scope, op, inputs, place): + def 
__set_input__(var_name, var): + if isinstance(var, tuple) or isinstance(var, np.ndarray): + tensor = scope.find_var(var_name).get_tensor() + if isinstance(var, tuple): + tensor.set_lod(var[1]) + var = var[0] + tensor.set_dims(var.shape) + tensor.set(var, place) + elif isinstance(var, float): + scope.find_var(var_name).set_float(var) + elif isinstance(var, int): + scope.find_var(var_name).set_int(var) + + for in_name, in_dup in Operator.get_op_inputs(op.type()): + if in_name in inputs: + if in_dup: + sub_in = inputs[in_name] + for item in sub_in: + sub_in_name, sub_in_val = item[0], item[1] + __set_input__(sub_in_name, sub_in_val) + else: + __set_input__(in_name, inputs[in_name]) + + +def append_input_output(block, op_proto, np_list, is_input, dtype): + '''Insert VarDesc and generate Python variable instance''' + proto_list = op_proto.inputs if is_input else op_proto.outputs + + def create_var(block, name, np_list, var_proto): + dtype = None + shape = None + lod_level = None + if name not in np_list: + assert var_proto.intermediate, "{} not found".format(name) + else: + np_value = np_list[name] + if isinstance(np_value, tuple): + dtype = np_value[0].dtype + # output shape, lod should be infered from input. 
+ if is_input: + shape = list(np_value[0].shape) + lod_level = len(np_value[1]) + else: + dtype = np_value.dtype + if is_input: + shape = list(np_value.shape) + lod_level = 0 + return block.create_var( + dtype=dtype, shape=shape, lod_level=lod_level, name=name) + + var_dict = {} + for var_proto in proto_list: + var_name = str(var_proto.name) + if is_input: + if (var_name not in np_list) and var_proto.dispensable: + continue + assert (var_name in np_list) or (var_proto.dispensable), \ + "Missing {} as input".format(var_name) + if var_proto.duplicable: + assert isinstance(np_list[var_name], list), \ + "Duplicable {} should be set as list".format(var_name) + var_list = [] + for (name, np_value) in np_list[var_name]: + var_list.append( + create_var(block, name, {name: np_value}, var_proto)) + var_dict[var_name] = var_list + else: + var_dict[var_name] = create_var(block, var_name, np_list, var_proto) + + return var_dict + + +def append_loss_ops(block, output_names): + mean_inputs = map(block.var, output_names) + # for item in mean_inputs: + # print(item) + # print("Item", item.dtype) + + if len(mean_inputs) == 1: + loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1]) + op = block.append_op( + inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean') + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + else: + avg_sum = [] + for cur_loss in mean_inputs: + cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1]) + op = block.append_op( + inputs={"X": [cur_loss]}, + outputs={"Out": [cur_avg_loss]}, + type="mean") + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + avg_sum.append(cur_avg_loss) + + loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1]) + op_sum = block.append_op( + inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum') + op_sum.desc.infer_var_type(block.desc) + op_sum.desc.infer_shape(block.desc) + + loss = block.create_var(dtype=loss_sum.dtype, shape=[1]) + op_loss = 
block.append_op( + inputs={"X": loss_sum}, + outputs={"Out": loss}, + type='scale', + attrs={'scale': 1.0 / float(len(avg_sum))}) + op_loss.desc.infer_var_type(block.desc) + op_loss.desc.infer_shape(block.desc) + return loss From 9c61409a18def0709dc362df00543eea624fc214 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 7 Jun 2018 20:25:33 +0800 Subject: [PATCH 56/93] Make crop op supporting taking offsets as one of its inputs --- paddle/fluid/operators/crop_op.cc | 16 +++++++++++++ paddle/fluid/operators/random_crop_op.cc | 1 - .../fluid/tests/unittests/test_crop_op.py | 23 ++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index b5b31c7ce0..5b5a220cf9 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -48,6 +48,13 @@ class CropOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", y_dim); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; class CropOpMaker : public framework::OpProtoAndCheckerMaker { @@ -163,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input(framework::GradVarName("Out")) + ->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b14b559e31..d3a32b664b 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -20,7 +20,6 @@ class RandomCropOp : public framework::OperatorWithKernel { public: using 
framework::OperatorWithKernel::OperatorWithKernel; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py index 20cc3a643f..4016089c01 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_op.py @@ -42,9 +42,9 @@ class TestCropOp(OpTest): def setUp(self): self.op_type = "crop" self.crop_by_input = False + self.offset_by_input = False self.attrs = {} self.initTestCase() - self.attrs['offsets'] = self.offsets if self.crop_by_input: self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), @@ -55,6 +55,10 @@ class TestCropOp(OpTest): self.inputs = { 'X': np.random.random(self.x_shape).astype("float32"), } + if self.offset_by_input: + self.inputs['Offsets'] = np.array(self.offsets).astype('int32') + else: + self.attrs['offsets'] = self.offsets self.outputs = { 'Out': crop(self.inputs['X'], self.offsets, self.crop_shape) } @@ -101,5 +105,22 @@ class TestCase4(TestCropOp): self.crop_by_input = True +class TestCase5(TestCropOp): + def initTestCase(self): + self.x_shape = (3, 4, 5) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 0, 2] + self.offset_by_input = True + + +class TestCase6(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.crop_by_input = True + self.offset_by_input = True + + if __name__ == '__main__': unittest.main() From d48172f22a5a60f3d66e0d90a0ccc5880c03f953 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 7 Jun 2018 20:48:34 +0800 Subject: [PATCH 57/93] split reduce op into multiple libraries, accelerate the compiling (#11029) * "split into multiple .ccl" * "refine file structure" * "refine files" * "remove the cmakelist" * "fix typo" * "fix typo" * fix ci --- 
paddle/fluid/framework/op_registry.h | 12 +- paddle/fluid/operators/CMakeLists.txt | 2 - paddle/fluid/operators/reduce_max_op.cc | 34 ++ paddle/fluid/operators/reduce_max_op.cu | 34 ++ paddle/fluid/operators/reduce_mean_op.cc | 35 ++ paddle/fluid/operators/reduce_mean_op.cu | 34 ++ paddle/fluid/operators/reduce_mean_op.h | 39 +++ paddle/fluid/operators/reduce_min_max_op.h | 50 +++ paddle/fluid/operators/reduce_min_op.cc | 34 ++ paddle/fluid/operators/reduce_min_op.cu | 34 ++ paddle/fluid/operators/reduce_op.cc | 186 ----------- paddle/fluid/operators/reduce_op.cu | 41 --- paddle/fluid/operators/reduce_op.h | 352 +++++++++----------- paddle/fluid/operators/reduce_op_function.h | 109 ++++++ paddle/fluid/operators/reduce_prod_op.cc | 35 ++ paddle/fluid/operators/reduce_prod_op.cu | 34 ++ paddle/fluid/operators/reduce_prod_op.h | 39 +++ paddle/fluid/operators/reduce_sum_op.cc | 34 ++ paddle/fluid/operators/reduce_sum_op.cu | 34 ++ paddle/fluid/operators/reduce_sum_op.h | 39 +++ 20 files changed, 789 insertions(+), 422 deletions(-) create mode 100644 paddle/fluid/operators/reduce_max_op.cc create mode 100644 paddle/fluid/operators/reduce_max_op.cu create mode 100644 paddle/fluid/operators/reduce_mean_op.cc create mode 100644 paddle/fluid/operators/reduce_mean_op.cu create mode 100644 paddle/fluid/operators/reduce_mean_op.h create mode 100644 paddle/fluid/operators/reduce_min_max_op.h create mode 100644 paddle/fluid/operators/reduce_min_op.cc create mode 100644 paddle/fluid/operators/reduce_min_op.cu delete mode 100644 paddle/fluid/operators/reduce_op.cc delete mode 100644 paddle/fluid/operators/reduce_op.cu create mode 100644 paddle/fluid/operators/reduce_op_function.h create mode 100644 paddle/fluid/operators/reduce_prod_op.cc create mode 100644 paddle/fluid/operators/reduce_prod_op.cu create mode 100644 paddle/fluid/operators/reduce_prod_op.h create mode 100644 paddle/fluid/operators/reduce_sum_op.cc create mode 100644 paddle/fluid/operators/reduce_sum_op.cu create 
mode 100644 paddle/fluid/operators/reduce_sum_op.h diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index e57c2ff3d0..43ab227a94 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -156,15 +156,15 @@ class OpKernelRegistrar : public Registrar { /** * Macro to register OperatorKernel. */ -#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...) \ +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ + __reg_op_kernel_##op_type##_##library_type##__, \ "REGISTER_OP_KERNEL must be called in global namespace"); \ static ::paddle::framework::OpKernelRegistrar \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type, \ - #LIBRARY_TYPE); \ - int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() { \ - __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch(); \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ return 0; \ } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index f75b7c70d6..5e86b16ba1 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -166,8 +166,6 @@ function(op_library TARGET) # NOTE(*): activation use macro to regist the kernels, set use_op manually. 
if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "reduce") - file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") else() diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc new file mode 100644 index 0000000000..95d3768e1f --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_max); +REGISTER_OP_CPU_KERNEL( + reduce_max, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu new file mode 100644 index 0000000000..0d86b3127e --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_max, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc new file mode 100644 index 0000000000..fc258c2496 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_REDUCE_OP(reduce_mean); +REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu new file mode 100644 index 0000000000..960cb3235b --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_mean, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h new file mode 100644 index 0000000000..1359679c47 --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h new file mode 100644 index 0000000000..ec59f3e71c --- /dev/null +++ b/paddle/fluid/operators/reduce_min_max_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc new file mode 100644 index 0000000000..330a86d2e4 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_min); +REGISTER_OP_CPU_KERNEL( + reduce_min, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu new file mode 100644 index 0000000000..da466f805e --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_min, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc deleted file mode 100644 index e293fd5e41..0000000000 --- a/paddle/fluid/operators/reduce_op.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_op.h" - -#include -#include -#include - -namespace paddle { -namespace operators { - -using framework::Tensor; - -class ReduceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReduceOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReduceOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - bool reduce_all = ctx->Attrs().Get("reduce_all"); - bool keep_dim = ctx->Attrs().Get("keep_dim"); - if (reduce_all) { - if (keep_dim) - ctx->SetOutputDim( - "Out", framework::make_ddim(std::vector(x_rank, 1))); - else - ctx->SetOutputDim("Out", {1}); - } else { - auto dims_vector = vectorize(x_dims); - if (keep_dim) { - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = 1; - } - } else { - const int kDelFlag = -2; - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - } - auto out_dims 
= framework::make_ddim(dims_vector); - ctx->SetOutputDim("Out", out_dims); - if (dims[0] != 0) { - // Only pass LoD when not reducing on the first dim. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - } -}; - -class ReduceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - PADDLE_ENFORCE_LT( - dims[i], x_rank, - "The dim should be in the range [-rank(input), rank(input))."); - } - sort(dims.begin(), dims.end()); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - } -}; - -class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", - "(Tensor) The input tensor. Tensors with rank at most 6 are " - "supported."); - AddOutput("Out", "(Tensor) The result tensor."); - AddAttr>( - "dim", - "(list, default {0}) The dimensions to reduce. " - "Must be in the range [-rank(input), rank(input)). " - "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. 
" - "Note that reducing on the first dim will make the LoD info lost.") - .SetDefault({0}); - AddAttr("keep_dim", - "(bool, default false) " - "If true, retain the reduced dimension with length 1.") - .SetDefault(false); - AddAttr("reduce_all", - "(bool, default false) " - "If true, output a scalar reduced along all dimensions.") - .SetDefault(false); - AddComment(string::Sprintf(R"DOC( -%s Operator. - -This operator computes the %s of input tensor along the given dimension. -The result tensor has 1 fewer dimension than the input unless keep_dim is true. -If reduce_all is true, just reduce along all dimensions and output a scalar. - -)DOC", - GetOpType(), GetName())); - } - - protected: - virtual std::string GetName() const = 0; - virtual std::string GetOpType() const = 0; -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_OP(op_name) \ - class __##op_name##Maker__ : public ops::ReduceOpMaker { \ - protected: \ - virtual std::string GetName() const { return #op_name; } \ - virtual std::string GetOpType() const { return "Reduce " #op_name; } \ - }; \ - REGISTER_OPERATOR(reduce_##op_name, ops::ReduceOp, __##op_name##Maker__, \ - paddle::framework::DefaultGradOpDescMaker); \ - REGISTER_OPERATOR(reduce_##op_name##_grad, ops::ReduceGradOp) - -REGISTER_REDUCE_OP(sum); -REGISTER_REDUCE_OP(mean); -REGISTER_REDUCE_OP(max); -REGISTER_REDUCE_OP(min); -REGISTER_REDUCE_OP(prod); - -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL(reduce_type, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu deleted file mode 100644 index 
ae29587f55..0000000000 --- a/paddle/fluid/operators/reduce_op.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#define EIGEN_USE_GPU -#include "paddle/fluid/operators/reduce_op.h" - -namespace ops = paddle::operators; - -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type, ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel, \ - ops::ReduceKernel); \ - REGISTER_OP_CUDA_KERNEL( \ - reduce_type##_grad, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel, \ - ops::ReduceGradKernel); - -FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index 7df47f316c..72b6cf1773 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -14,105 +14,20 @@ limitations under the License. 
*/ #pragma once +#include +#include #include -#include "glog/logging.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/operators/reduce_op_function.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; -template -using EigenTensor = framework::EigenTensor; -template -using EigenScalar = framework::EigenScalar; -template -using EigenVector = framework::EigenVector; - -struct SumFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); - } -}; - -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); - } -}; - -struct MeanGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) / dx->constant(size); - } -}; - -struct MaxFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->maximum(dim); - } -}; - -struct MinFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->minimum(dim); - } -}; - -struct MaxOrMinGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - // If there are multiple minimum or maximum elements, the subgradient of - // each is the set [0, 1], and we pass gradient to all of them here. 
- dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); - } -}; - -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - -struct ProdGradFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, - const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); - } -}; - -#define HANDLE_DIM(NDIM, RDIM) \ - if (ndim == NDIM && rdim == RDIM) { \ - ReduceCompute(context); \ +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + dims, keep_dim); \ } template @@ -120,11 +35,15 @@ class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + if (reduce_all) { // Flatten and reduce 1-D tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input); auto out = EigenScalar::From(*output); auto& place = @@ -133,8 +52,8 @@ class ReduceKernel : public framework::OpKernel { Functor functor; functor(place, &x, &out, reduce_dim); } else { - int ndim = context.Input("X")->dims().size(); - int rdim = context.Attr>("dim").size(); + int ndim = input->dims().size(); + int rdim = dims.size(); // comments for accelerating compiling temporarily. 
// HANDLE_DIM(6, 5); // HANDLE_DIM(6, 4); @@ -154,48 +73,6 @@ class ReduceKernel : public framework::OpKernel { HANDLE_DIM(1, 1); } } - - private: - template - void ReduceCompute(const framework::ExecutionContext& context) const { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - auto x = EigenTensor::From(*input); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto reduce_dim = Eigen::array(); - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduce_dim[i] = dims[i]; - } - // construct the squeezed output tensor - bool keep_dim = context.Attr("keep_dim"); - DDim out_dims = output->dims(); - if (keep_dim && x_rank > 1) { - const int kDelFlag = -2; - auto dims_vector = vectorize(out_dims); - for (size_t i = 0; i < dims.size(); ++i) { - dims_vector[dims[i]] = kDelFlag; - } - dims_vector.erase( - remove(dims_vector.begin(), dims_vector.end(), kDelFlag), - dims_vector.end()); - out_dims = framework::make_ddim(dims_vector); - } - auto& place = - *context.template device_context().eigen_device(); - Functor functor; - - if (D == 1) { - auto out = EigenScalar::From(*output); - functor(place, &x, &out, reduce_dim); - } else { - auto out = EigenTensor::From(*output, out_dims); - functor(place, &x, &out, reduce_dim); - } - } }; template @@ -203,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + if (reduce_all) { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* 
input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); - output->mutable_data(context.GetPlace()); auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); auto x_reduce_grad = EigenVector::From(*input2); @@ -221,74 +101,172 @@ class ReduceGradKernel : public framework::OpKernel { functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, broadcast_dim[0]); } else { - int rank = context.Input("X")->dims().size(); + int rank = input0->dims().size(); switch (rank) { case 1: - ReduceGradCompute<1>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 2: - ReduceGradCompute<2>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 3: - ReduceGradCompute<3>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 4: - ReduceGradCompute<4>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 5: - ReduceGradCompute<5>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; case 6: - ReduceGradCompute<6>(context); + ReduceGradFunctor( + context.template device_context(), *input0, + *input1, *input2, output, dims); break; } } } +}; - private: - template - void ReduceGradCompute(const framework::ExecutionContext& context) const { - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); - auto* input2 = context.Input(framework::GradVarName("Out")); - auto* output = context.Output(framework::GradVarName("X")); +class ReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - output->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(*input0); - auto 
x_grad = EigenTensor::From(*output); - auto x_rank = static_cast(x.dimensions().size()); - auto dims = context.Attr>("dim"); - auto x_dims = input0->dims(); - auto reduced_dims_v = vectorize(x_dims); - Eigen::array broadcast_dim; - for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReduceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReduceOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + bool keep_dim = ctx->Attrs().Get("keep_dim"); + if (reduce_all) { + if (keep_dim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (dims[0] != 0) { + // Only pass LoD when not reducing on the first dim. 
+ ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class ReduceGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; - int broad_cats_times = 1; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported."); + auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { if (dims[i] < 0) dims[i] = x_rank + dims[i]; - reduced_dims_v[dims[i]] = 1; - broadcast_dim[dims[i]] = x_dims[dims[i]]; - broad_cats_times *= x_dims[dims[i]]; + PADDLE_ENFORCE_LT( + dims[i], x_rank, + "The dim should be in the range [-rank(input), rank(input))."); + } + sort(dims.begin(), dims.end()); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + ctx->ShareLoD("X", /*->*/ x_grad_name); } - auto reduced_dims = framework::make_ddim(reduced_dims_v); - auto x_reduce = EigenTensor::From(*input1, reduced_dims); - auto x_reduce_grad = EigenTensor::From(*input2, reduced_dims); + } +}; + +class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 6 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "dim", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. 
" + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keep_dim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +%s Operator. - auto& place = - *context.template device_context().eigen_device(); +This operator computes the %s of input tensor along the given dimension. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. - Functor functor; - functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); +)DOC", + GetOpType(), GetName())); } + + protected: + virtual std::string GetName() const = 0; + virtual std::string GetOpType() const = 0; }; } // namespace operators } // namespace paddle -#define FOR_EACH_KERNEL_FUNCTOR(__macro) \ - __macro(reduce_sum, SumFunctor, SumGradFunctor); \ - __macro(reduce_mean, MeanFunctor, MeanGradFunctor); \ - __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \ - __macro(reduce_prod, ProdFunctor, ProdGradFunctor); +namespace ops = paddle::operators; + +#define REGISTER_REDUCE_OP(op_name) \ + class __##op_name##Maker__ : public ops::ReduceOpMaker { \ + protected: \ + virtual std::string GetName() const { return #op_name; } \ + virtual std::string GetOpType() const { return "Reduce " #op_name; } \ + }; \ + REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ + paddle::framework::DefaultGradOpDescMaker); \ + REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h new file mode 100644 index 0000000000..3da27bc8ac --- /dev/null +++ 
b/paddle/fluid/operators/reduce_op_function.h @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; +template +using EigenTensor = framework::EigenTensor; +template +using EigenScalar = framework::EigenScalar; +template +using EigenVector = framework::EigenVector; + +template +void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, const std::vector& dims, + bool keep_dim) { + auto x = EigenTensor::From(input); + auto x_rank = static_cast(x.dimensions().size()); + auto reduce_dim = Eigen::array(); + std::vector dims_ref = dims; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i]; + reduce_dim[i] = dims_ref[i]; + } + // construct the squeezed output tensor + DDim out_dims = output->dims(); + if (keep_dim && x_rank > 1) { + const int kDelFlag = -2; + auto dims_vector = framework::vectorize(out_dims); + for (size_t i = 0; i < dims_ref.size(); ++i) { + dims_vector[dims_ref[i]] = kDelFlag; + } + dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + out_dims = framework::make_ddim(dims_vector); + } + auto& 
place = *context.eigen_device(); + Functor functor; + + if (D == 1) { + auto out = EigenScalar::From(*output); + functor(place, &x, &out, reduce_dim); + } else { + auto out = EigenTensor::From(*output, out_dims); + functor(place, &x, &out, reduce_dim); + } +} + +template +void ReduceGradFunctor(const DeviceContext& context, + const framework::Tensor& input0, + const framework::Tensor& input1, + const framework::Tensor& input2, + framework::Tensor* output, + const std::vector& dims) { + auto x = EigenTensor::From(input0); + auto x_grad = EigenTensor::From(*output); + auto x_rank = static_cast(x.dimensions().size()); + auto x_dims = input0.dims(); + auto reduced_dims_v = framework::vectorize(x_dims); + std::vector dims_ref = dims; + Eigen::array broadcast_dim; + for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; + + int broad_cats_times = 1; + for (size_t i = 0; i < dims_ref.size(); ++i) { + if (dims_ref[i] < 0) { + dims_ref[i] = x_rank + dims_ref[i]; + } + reduced_dims_v[dims_ref[i]] = 1; + broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; + broad_cats_times *= x_dims[dims_ref[i]]; + } + auto reduced_dims = framework::make_ddim(reduced_dims_v); + auto x_reduce = EigenTensor::From(input1, reduced_dims); + auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); + + auto& place = *context.eigen_device(); + + Functor functor; + functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim, + broad_cats_times); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc new file mode 100644 index 0000000000..713728b997 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_REDUCE_OP(reduce_prod); +REGISTER_OP_CPU_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_prod_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu new file mode 100644 index 0000000000..d62e677d92 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_prod, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h new file mode 100644 index 0000000000..97748113e0 --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc new file mode 100644 index 0000000000..c5b5398787 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_REDUCE_OP(reduce_sum); +REGISTER_OP_CPU_KERNEL( + reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu new file mode 100644 index 0000000000..f2e16955a5 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_sum, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h new file mode 100644 index 0000000000..e67d7e1da5 --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/operators/reduce_op.h" + +namespace paddle { +namespace operators { + +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +} // namespace operators +} // namespace paddle From 9141bee1e70adf4bb8d2f04b59c561969eaa8f75 Mon Sep 17 00:00:00 2001 From: cuichaowen <939136265@qq.com> Date: Thu, 7 Jun 2018 20:50:34 +0800 Subject: [PATCH 58/93] add Anakin api for paddle (#11228) --- paddle/contrib/inference/CMakeLists.txt | 40 ++++++++------ .../contrib/inference/paddle_inference_api.h | 1 - .../paddle_inference_api_anakin_engine.cc | 54 +++++++++++++++---- .../paddle_inference_api_anakin_engine.h | 22 +++++--- ...ddle_inference_api_anakin_engine_tester.cc | 44 +++++++++++++-- 5 files changed, 123 insertions(+), 38 deletions(-) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 1e3bb7bf16..f279020e93 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library") set(inference_deps paddle_inference_api paddle_fluid_api) # if anakin is set enable anakin api implementation -if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) +if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY) set(ANAKIN_FOUND ON) else() set(ANAKIN_FOUND OFF) endif() +function(fetch_include_recursively root_dir) + if (IS_DIRECTORY ${root_dir}) + include_directories(${root_dir}) + endif() + + file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) + foreach(sub ${ALL_SUB}) + if (IS_DIRECTORY ${root_dir}/${sub}) + fetch_include_recursively(${root_dir}/${sub}) + endif() + endforeach() +endfunction() + if (ANAKIN_FOUND) # Anakin's code style 
doesn't follow google c style. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment - -Wno-error=reorder - -Wno-error=format - -Wno-error=switch - -Wno-error=return-type - -Wno-error=non-virtual-dtor - -Wno-error=cpp") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp") message(STATUS "Anakin for inference is enabled") message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}") - include_directories("${ANAKIN_INCLUDE}") - # Anakin's source path is a mass, need to set sub-directories trivially. - include_directories("${ANAKIN_INCLUDE}/saber") - link_directories("${ANAKIN_LIBRARY}") + fetch_include_recursively(${ANAKIN_INCLUDE}) + + link_directories(${ANAKIN_LIBRARY}) - nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc) - target_link_libraries(inference_anakin_api anakin) + nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + target_link_libraries(inference_anakin_api anakin anakin_saber_common) list(APPEND inference_deps inference_anakin_api) endif() @@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME) endfunction(inference_api_test) cc_library(paddle_inference_api - SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) cc_test(test_paddle_inference_api @@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) if (ANAKIN_FOUND) - nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc - DEPS ${inference_deps} protobuf) + cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + DEPS ${inference_deps}) endif() if(WITH_TESTING) diff --git 
a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index c4588cf040..77e2d77b6b 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config { // Similarly, each engine kind should map to a unique predictor implementation. template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); - } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc index 865d7ac10d..ea7781f691 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc @@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( } bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { - // TODO(Superjomn) Tell anakin to support return code. - engine_.Build(config.model_file, config.max_batch_size); + if (!(graph_.load(config.model_file))) { + return false; + } + graph_.ResetBatchSize("input_0", config.max_batch_size); + // optimization for graph + if (!(graph_.Optimize())) { + return false; + } + // construct executer + executor_.init(graph_); return true; } @@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run( << "'s type is not float"; return false; } - engine_.SetInputFromCPU( - input.name, static_cast(input.data.data), input.data.length); + auto d_tensor_in_p = executor_.get_in(input.name); + float *d_data_p = d_tensor_in_p->mutable_data(); + if (cudaMemcpy(d_data_p, + static_cast(input.data.data), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + LOG(ERROR) << "copy data from CPU to GPU error"; + return false; + } } - // TODO(Superjomn) Tell anakin to support return code. 
- engine_.Execute(); + executor_.prediction(); if (output_data->empty()) { LOG(ERROR) << "At least one output should be set with tensors' names."; return false; } for (auto &output : *output_data) { - auto *tensor = engine_.GetOutputInGPU(output.name); + auto *tensor = executor_.get_out(output.name); output.shape = tensor->shape(); // Copy data from GPU -> CPU if (cudaMemcpy(output.data.data, - tensor->data(), - tensor->size(), + tensor->mutable_data(), + tensor->valid_size() * sizeof(float), cudaMemcpyDeviceToHost) != 0) { LOG(ERROR) << "copy data from GPU to CPU error"; return false; @@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run( return true; } -// TODO(Superjomn) To implement latter. +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return executor_; +} + +// the cloned new Predictor of anakin share the same net weights from original +// Predictor std::unique_ptr PaddleInferenceAnakinPredictor::Clone() { - return nullptr; + VLOG(3) << "Anakin Predictor::clone"; + std::unique_ptr cls(new PaddleInferenceAnakinPredictor()); + // construct executer from other graph + auto anakin_predictor_p = + dynamic_cast(cls.get()); + if (!anakin_predictor_p) { + LOG(ERROR) << "fail to call Init"; + return nullptr; + } + anakin_predictor_p->get_executer().init(graph_); + + return std::move(cls); } // A factory to help create difference predictor. @@ -74,6 +105,7 @@ template <> std::unique_ptr CreatePaddlePredictor( const AnakinConfig &config) { + VLOG(3) << "Anakin Predictor create."; std::unique_ptr x( new PaddleInferenceAnakinPredictor(config)); return x; diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h index fe9f562e9d..181784cbdf 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h @@ -20,32 +20,42 @@ limitations under the License. 
*/ #pragma once // NOTE This header file do not have namespace. -// TODO(Superjomn) Tell Anakin to provide better APIs. -#include +//#include #include "paddle/contrib/inference/paddle_inference_api.h" +#include "framework/core/net/net.h" +#include "saber/saber_types.h" + namespace paddle { class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: + PaddleInferenceAnakinPredictor() {} + PaddleInferenceAnakinPredictor(const AnakinConfig& config); // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. - // TODO(Superjomn) should unify all the behaviors of output_data accross all - // the engines. bool Run(const std::vector& inputs, std::vector* output_data) override; std::unique_ptr Clone() override; + anakin::Net& + get_executer(); + + ~PaddleInferenceAnakinPredictor() override{}; + private: bool Init(const AnakinConfig& config); - anakin::AnakinEngine - engine_; + graph_; + anakin::Net + executor_; + AnakinConfig config_; }; } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc index 43324bc67c..47b9c6fa28 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc @@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/contrib/inference/paddle_inference_api.h" +#include #include +#include "gflags/gflags.h" +#include "paddle/contrib/inference/paddle_inference_api.h" + namespace paddle { -TEST(inference, anakin) { +AnakinConfig GetConfig() { AnakinConfig config; + config.model_file = "./mobilenet_v2.anakin.bin"; + config.device = 0; + config.max_batch_size = 1; + return config; +} - auto engine = +TEST(inference, anakin) { + AnakinConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor(config); + + float data[1 * 3 * 224 * 224] = {1.0f}; + + PaddleBuf buf{.data = data, .length = sizeof(data)}; + PaddleTensor tensor{.name = "input_0", + .shape = std::vector({1, 3, 224, 224}), + .data = buf, + .dtype = PaddleDType::FLOAT32}; + + // For simplicity, we set all the slots with the same data. + std::vector paddle_tensor_feeds(1, tensor); + + float data_out[1000]; + + PaddleBuf buf_out{.data = data_out, .length = sizeof(data)}; + PaddleTensor tensor_out{.name = "prob_out", + .shape = std::vector({1000, 1}), + .data = buf_out, + .dtype = PaddleDType::FLOAT32}; + + std::vector outputs(1, tensor_out); + + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); + + float* data_o = static_cast(outputs[0].data.data); + for (size_t j = 0; j < 1000; ++j) { + LOG(INFO) << "output[" << j << "]: " << data_o[j]; + } } } // namespace paddle From e80c6b3c24eca373be3962c560e67be09fe6fe38 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 7 Jun 2018 22:02:26 +0800 Subject: [PATCH 59/93] Refine API doc string --- paddle/fluid/operators/activation_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index dd71c66a75..23327b77f8 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -24,12 +24,12 @@ namespace operators { : public ::paddle::framework::OpProtoAndCheckerMaker { \ public: \ void Make() override { \ - 
AddInput("X", "Input of " #OP_NAME "operator"); \ - AddOutput("Out", "Output of" #OP_NAME "operator"); \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator"); \ AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ - AddComment(#OP_COMMENT); \ + AddComment(OP_COMMENT); \ } \ } From 7e861f0ee4b2c23821be93c3483acf59cdd8e9a1 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Thu, 7 Jun 2018 22:19:02 +0800 Subject: [PATCH 60/93] fix small bug in calculating speed --- benchmark/fluid/fluid_benchmark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index bd0243aa60..62a05234c4 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -371,8 +371,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) batch_id += 1 - if args.use_reader_op: - num_samples = num_samples * args.gpus + print_train_time(start_time, time.time(), num_samples) if not args.no_test and batch_acc: test_acc = test(startup_exe, infer_prog, test_reader, feeder, From 0aa9546eed6afee30e8e168a509d3d32810d6b2f Mon Sep 17 00:00:00 2001 From: Yancey Date: Fri, 8 Jun 2018 09:28:43 +0800 Subject: [PATCH 61/93] fix dist train error (#11281) * fix dist train error * update by comment --- .../framework/details/multi_devices_graph_builder.cc | 5 ++--- paddle/fluid/framework/details/rpc_op_handle.cc | 8 ++++---- paddle/fluid/framework/details/rpc_op_handle.h | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 0c4d369e88..97242ebf2a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ 
b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -464,9 +464,8 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op) const { - auto &p = places_[0]; - auto *s = local_scopes_[0]; - result->ops_.emplace_back(new RPCOpHandle(op, s, p, op.Type())); + result->ops_.emplace_back( + new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send_barrier") { ConnectOp(result, result->ops_.back().get(), "send_vars"); diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 7f4da4c01d..586465f99f 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -19,12 +19,12 @@ namespace framework { namespace details { RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, - const Scope *local_scope, const platform::Place &place, - const std::string &name) + const Scope *local_scope, const std::string &name, + const platform::Place &place) : op_(framework::OpRegistry::CreateOp(op_desc)), local_scope_(local_scope), - place_(place), - name_(name) {} + name_(name), + place_(place) {} void RPCOpHandle::RunImpl() { // TODO(wuyi): need further analysis whether wait VarDummyHandle. 
diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index d28b772172..ae38c7fe19 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -29,7 +29,7 @@ namespace details { struct RPCOpHandle : public OpHandleBase { RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const platform::Place& place, const std::string& name); + const std::string& name, const platform::Place& place); std::string Name() const override; @@ -43,8 +43,8 @@ struct RPCOpHandle : public OpHandleBase { private: std::unique_ptr op_; const Scope* local_scope_; - const platform::Place& place_; const std::string name_; + platform::Place place_; }; } // namespace details From 1076e85135d3eadd97324c6d06bf4a6a30852148 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Jun 2018 17:01:41 +0800 Subject: [PATCH 62/93] refine logic --- paddle/fluid/framework/details/ssa_graph_builder.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index d24669a8f8..c4ee088507 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -135,9 +135,11 @@ bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { while (!pending_vars.empty()) { run_all_ops(ready_ops); + if (ready_vars.empty()) { return false; } + for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); for (auto *op : ready_var->pending_ops_) { From 1239fce771a5d0907045fc285cb1966bdb61b180 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 8 Jun 2018 09:59:54 +0800 Subject: [PATCH 63/93] polish sparse update code --- .../fluid/operators/detail/request_handler_impl.cc | 3 +++ paddle/fluid/operators/detail/rpc_server.cc | 13 +++++++++++++ paddle/fluid/operators/detail/rpc_server.h | 6 ++++++ 
paddle/fluid/operators/listen_and_serv_op.cc | 1 + 4 files changed, 23 insertions(+) diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index 145ee53107..b5ee3ab51e 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -63,6 +63,9 @@ bool RequestSendHandler::Handle(const std::string& varname, PADDLE_THROW("sync: Can not find server side var"); return false; } + if (invar->IsType()) { + rpc_server_->RecordSparseVar(invar); + } } return true; diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc index 448763372a..7feddbeca8 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/detail/rpc_server.cc @@ -73,6 +73,19 @@ void RPCServer::ResetBarrierCounter() { t.second = 0; } } +void RPCServer::RecordSparseVar(framework::Variable* sparse_var) { + std::unique_lock lock(mutex_sparse_var_recorder_); + sparse_vars_.push_back(sparse_var); +} + +void RPCServer::ResetSparseVarsRecorder() { + VLOG(3) << "RPCServer reset sparse vars recorder."; + std::unique_lock lock(mutex_sparse_var_recorder_); + for (auto* var : sparse_vars_) { + var->GetMutable()->mutable_rows()->clear(); + } + sparse_vars_.clear(); +} void RPCServer::RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num) { diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h index c2e7ae706c..94a21ef8d0 100644 --- a/paddle/fluid/operators/detail/rpc_server.h +++ b/paddle/fluid/operators/detail/rpc_server.h @@ -60,7 +60,10 @@ class RPCServer { void SetCond(const std::string& rpc_name); void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); + void ResetBarrierCounter(); + void RecordSparseVar(framework::Variable* sparse_var); + void ResetSparseVarsRecorder(); protected: virtual void 
ShutDownImpl() = 0; @@ -74,6 +77,9 @@ class RPCServer { std::atomic cur_cond_; std::condition_variable rpc_cond_; + std::vector sparse_vars_; + std::mutex mutex_sparse_var_recorder_; + protected: std::string bind_address_; std::atomic exit_flag_; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 0c9d2b5a74..ee7b01a54c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -146,6 +146,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->ResetBarrierCounter(); + rpc_service_->ResetSparseVarsRecorder(); } // while(true) } From c7bbfb33ad816762f00e19f5076b1d6fed105b2d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 8 Jun 2018 10:39:44 +0800 Subject: [PATCH 64/93] Fix a GPU bug --- paddle/fluid/operators/crop_op.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index d8e9f086cc..91cfbbda73 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -39,11 +39,16 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { PADDLE_ENFORCE_EQ( rank, offsets_tensor->dims()[0], "Offsets size should be equal to dimension size of input tensor."); - const int* offsets_data = offsets_tensor->data(); - res.resize(rank); - for (size_t i = 0; i < rank; ++i) { - res[i] = offsets_data[i]; + const int* offsets_data; + framework::Tensor cpu_tmp_tensor; + if (platform::is_cpu_place(offsets_tensor->place())) { + offsets_data = offsets_tensor->data(); + } else { + framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(), + &cpu_tmp_tensor); + offsets_data = cpu_tmp_tensor.data(); } + res = std::vector(offsets_data, offsets_data + rank); } else { res = ctx.Attr>("offsets"); PADDLE_ENFORCE_EQ( From 
0c851cab2294356dd292b9b4458379d1bde4eadd Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 8 Jun 2018 15:02:33 +0800 Subject: [PATCH 65/93] add SSA graph checker --- paddle/fluid/framework/details/CMakeLists.txt | 3 +- .../framework/details/ssa_graph_builder.cc | 70 --------------- .../framework/details/ssa_graph_builder.h | 3 - .../details/ssa_graph_builder_factory.cc | 3 + .../framework/details/ssa_graph_checker.cc | 87 +++++++++++++++++++ .../framework/details/ssa_graph_checker.h | 44 ++++++++++ paddle/fluid/framework/parallel_executor.cc | 2 +- 7 files changed, 137 insertions(+), 75 deletions(-) create mode 100644 paddle/fluid/framework/details/ssa_graph_checker.cc create mode 100644 paddle/fluid/framework/details/ssa_graph_checker.h diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index ced063a097..dbd118d338 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -8,6 +8,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) +cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -30,7 +31,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) -cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(ssa_graph_executor SRCS 
ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index c4ee088507..88a21f4887 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -83,76 +83,6 @@ void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { op->AddOutput(dummy_leaf); } } - -std::unique_ptr SSAGraphBuilder::BuildAndCheck( - const ProgramDesc &program) { - std::unique_ptr graph = Build(program); - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return std::move(graph); -} - -bool SSAGraphBuilder::IsValidGraph(const SSAGraph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; - - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->generated_op_ == nullptr) { - ready_vars.emplace(var); - } - }; - - for (auto &var_map : graph->vars_) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair.get()); - } - } - } - - for (auto &var : graph->dep_vars_) { - insert_pending_var(var.get()); - } - - for (auto &op : graph->ops_) { - if (op->Inputs().empty()) { - ready_ops.insert(op.get()); - } else { - pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); - } - } - - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); - } - } - set.clear(); - }; - - while (!pending_vars.empty()) { - run_all_ops(ready_ops); - - if (ready_vars.empty()) { - return false; - } - - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { - auto &deps = --pending_ops[op]; - if (deps == 
0) { - ready_ops.insert(op); - } - } - } - ready_vars.clear(); - } - return true; -} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index e99a988407..5fc12a44b5 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -31,8 +31,6 @@ class SSAGraphBuilder { virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - std::unique_ptr BuildAndCheck(const ProgramDesc &program); - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: @@ -50,7 +48,6 @@ class SSAGraphBuilder { const platform::Place &place, size_t place_offset); - bool IsValidGraph(const SSAGraph *graph) const; // Add an output variable (each_var_name, place, place_offset) to op_handle, // which belongs to graph static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc index b5e90d6b05..b4b49d3de6 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" #include "paddle/fluid/framework/details/ssa_graph_printer.h" namespace paddle { @@ -40,6 +41,8 @@ std::unique_ptr SSAGraphBuilderFactory::Create() { res.reset(new SSAGraghBuilderWithPrinter( std::move(fout), std::move(graphviz_printer), std::move(res))); } + res.reset(new SSAGraghBuilderWithChecker(std::move(res))); + return res; } } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc 
b/paddle/fluid/framework/details/ssa_graph_checker.cc new file mode 100644 index 0000000000..da5428946e --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph.h" +#include +#include "paddle/fluid/framework/details/ssa_graph_checker.h" + +namespace paddle { +namespace framework { +namespace details { + +bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + 
while (!pending_vars.empty()) { + run_all_ops(ready_ops); + + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h new file mode 100644 index 0000000000..542c4a1728 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +class SSAGraph; + +class SSAGraghBuilderWithChecker : public SSAGraphBuilder { + public: + explicit SSAGraghBuilderWithChecker( + std::unique_ptr&& builder) + : builder_(std::move(builder)) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } + + bool IsValidGraph(const SSAGraph* graph) const; + + private: + std::unique_ptr builder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f1ab337070..5d95dc214a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -114,7 +114,7 @@ ParallelExecutor::ParallelExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, - builder_factory.Create()->BuildAndCheck(main_program))); + builder_factory.Create()->Build(main_program))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), From b000e0de5d382ff9f1b3c85b03c8a501f563f89d Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 15:47:17 +0800 Subject: [PATCH 66/93] Simplize API Reference Documentation --- paddle/fluid/operators/batch_size_like.h | 14 +++--- .../fill_constant_batch_size_like_op.cc | 14 +++--- paddle/fluid/operators/load_op.cc | 15 ++---- paddle/fluid/operators/max_sequence_len_op.cc | 13 +++-- python/paddle/fluid/layers/control_flow.py | 28 +++++------ python/paddle/fluid/layers/io.py | 29 ++++++++++- .../fluid/layers/layer_function_generator.py | 1 - python/paddle/fluid/layers/tensor.py | 50 +++++++------------ 8 files changed, 84 insertions(+), 80 
deletions(-) diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index 483c9f8c21..fc15d56891 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() final { - AddInput("Input", - "(Tensor) Tensor " - "whose input_dim_idx'th dimension specifies the batch_size"); + AddInput( + "Input", + "Tensor whose input_dim_idx'th dimension specifies the batch_size"); AddOutput("Out", - "(Tensor) Tensor of specified shape will be filled " + "Tensor of specified shape will be filled " "with the specified value"); - AddAttr>("shape", "(vector) The shape of the output"); + AddAttr>("shape", "The shape of the output"); AddAttr("input_dim_idx", - "(int, default 0) The index of input's batch size dimension") + "default 0. The index of input's batch size dimension") .SetDefault(0); AddAttr("output_dim_idx", - "(int, default 0) The index of output's batch size dimension") + "default 0. The index of output's batch size dimension") .SetDefault(0); Apply(); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 1ae78675a0..453a1b32a0 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp { class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") + AddAttr( + "dtype", + "It could be numpy.dtype. Output data type. Default is float32") .SetDefault(framework::proto::VarType::FP32); - AddAttr("value", "(float, default 0) The value to be filled") + AddAttr("value", "default 0. 
The value to be filled") .SetDefault(0.0f); AddComment(R"DOC( -FillConstantBatchSizeLike Operator. - -Fill up a variable with specified constant value. +This function creates a tensor of specified *shape*, *dtype* and batch size, +and initializes this with a constant supplied in *value*. The batch size is +obtained from the `input` tensor. )DOC"); } diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 93f45cff8a..8f4b504927 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddOutput("Out", "The tensor need to be loaded"); AddAttr( "load_as_fp16", - "(boolean, default false)" "If true, the tensor will be first loaded and then " "converted to float16 data type. Otherwise, the tensor will be " - "directly loaded without data type conversion.") + "directly loaded without data type conversion. Default is false.") .SetDefault(false); AddAttr("file_path", - "(string) " - "Variable will be loaded from \"file_path\".") + R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment(R"DOC( -Load Operator. - -Load operator will load a tensor variable from disk file. 
- -)DOC"); + AddComment("Load operator will load a tensor variable from disk file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index 8e508b68ee..b1e69f375d 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("RankTable", "The lod_rank_table."); - AddOutput("Out", "The max sequence length."); - AddComment( - R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + AddInput("RankTable", "Input variable which is a LoDRankTable object"); + AddOutput("Out", "The max sequence length"); + AddComment(R"DOC( + Given a LoDRankTable object, this layer returns the max length of + a batch of sequences. In fact, a LoDRankTable object contains a list of + tuples() and the list is already sorted by + sequence length in descending order, so the operator just returns the + sequence length of the first tuple element +)DOC"); } }; diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d1ea9f1485..b32248ad3d 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from tensor import assign, fill_constant from .. import core from ..framework import Program, Variable, Operator @@ -721,26 +721,22 @@ def lod_rank_table(x, level=0): return table +@templatedoc() def max_sequence_len(rank_table): - """Max Sequence Len Operator. Given a LoDRankTable object, this layer - returns the max length of a batch of sequences. 
In fact, a LoDRankTable - object contains a list of tuples() and - the list is already sorted by sequence length in descending order, so the - operator just returns the sequence length of the first tuple element. + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32', + >>> lod_level=1) + >>> rank_table = layers.lod_rank_table(x=x, level=0) + >>> max_seq_len = layers.max_sequence_len(rank_table) Args: - rank_table (Variable): Input variable which is a LoDRankTable object. + rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - Variable: The max length of sequence. - - Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) - rank_table = layers.lod_rank_table(x=x, level=0) - max_seq_len = layers.max_sequence_len(rank_table) + (${out_type}): ${out_comment} """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a56f3ea9db..9de88e2c32 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name from control_flow import BlockGuard from ..layer_helper import LayerHelper from ..executor import global_scope +from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', - 'random_data_generator', 'Preprocessor' + 'random_data_generator', 'Preprocessor', 'load' ] @@ -662,3 +663,29 @@ class Preprocessor(object): "sink_var_names": self.sink_var_names }) return monkey_patch_reader_methods(self.reader) + + +@templatedoc() +def load(out, file_path, load_as_fp16=None): + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> tmp_tensor = 
fluid.layers.create_tensor(dtype='float32') + >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin") + + Args: + out(${out_type}): ${out_comment}. + + file_path(${file_path_type}): ${file_path_comment}. + + load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}. + + Returns: + None + """ + helper = LayerHelper("load", **locals()) + attrs = {"file_path": file_path} + if load_as_fp16 is not None: + attrs['load_as_fp16'] = load_as_fp16 + helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 904413cc11..e6a7e7c3aa 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -263,7 +263,6 @@ def templatedoc(): output_name = _convert_(each_opt.name) args["{0}_comment".format(output_name)] = each_opt.comment args["{0}_type".format(output_name)] = "Variable" - func.__doc__ = tmpl.substitute(args) return func diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 75d3bf8797..601b06cdd8 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from layer_function_generator import templatedoc import numpy __all__ = [ @@ -266,6 +267,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): return out +@templatedoc() def fill_constant_batch_size_like(input, shape, dtype, @@ -273,30 +275,28 @@ def fill_constant_batch_size_like(input, input_dim_idx=0, output_dim_idx=0): """ - **fill_constant_batch_size_like** - - This function creates a tensor of specified *shape*, *dtype* and batch size, - and initializes this with a constant supplied in *value*. 
The batch size is - obtained from the `input` tensor. + ${comment} It also sets *stop_gradient* to True. + >>> data = fluid.layers.fill_constant_batch_size_like( + >>> input=like, shape=[1], value=0, dtype='int64') + Args: - input(Variable): Tensor whose dimensions will be used to get batch size - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor - value(float): Constant value to initialize the output tensor - input_dim_idx(int): Index of input's batch size dimension - output_dim_idx(int): Index of output's batch size dimension + input(${input_type}): ${input_comment}. - Returns: - Variable: The tensor variable storing the output + shape(${shape_type}): ${shape_comment}. - Examples: - .. code-block:: python + dtype(${dtype_type}): ${dtype_comment}. + + value(${value_type}): ${value_comment}. - data = fluid.layers.fill_constant_batch_size_like( - input=like, shape=[1], value=0, dtype='int64') + input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}. + + output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. + + Returns: + ${out_comment} """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -437,22 +437,6 @@ def save_combine(x, file_path, overwrite=True): "overwrite": overwrite}) -def load(out, file_path): - """ - Loads a variable from a given file. - - Args: - out(variable): The variable to be read from the disk file. - file_path(str): The path of the disk file. - """ - helper = LayerHelper("load", **locals()) - helper.append_op( - type="load", - inputs={}, - output={"Out": out}, - args={"file_path": file_path}) - - def load_combine(out, file_path): """ Loads a list of vairables from a single file. 
From 439a265760e7421cec2c7ca3ea1b9e6a3c24b673 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 16:01:35 +0800 Subject: [PATCH 67/93] Better trim dot --- python/paddle/fluid/layers/control_flow.py | 2 +- .../fluid/layers/layer_function_generator.py | 14 ++++++++++---- python/paddle/fluid/layers/tensor.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index b32248ad3d..51fad6e68c 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -736,7 +736,7 @@ def max_sequence_len(rank_table): rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - (${out_type}): ${out_comment} + (${out_type}): ${out_comment}. """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index e6a7e7c3aa..7dc4c214be 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -238,6 +238,9 @@ def templatedoc(): Decorated function. 
""" + def trim_ending_dot(msg): + return msg.rstrip('.') + def __impl__(func): op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) tmpl = string.Template(func.__doc__) @@ -249,19 +252,22 @@ def templatedoc(): comment += line comment += "\n" - args = {"comment": comment} + args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: input_name = _convert_(each_input.name) - args["{0}_comment".format(input_name)] = each_input.comment + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) args["{0}_type".format(input_name)] = "Variable" for each_attr in op_proto.attrs: input_name = _convert_(each_attr.name) - args["{0}_comment".format(input_name)] = each_attr.comment + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) for each_opt in op_proto.outputs: output_name = _convert_(each_opt.name) - args["{0}_comment".format(output_name)] = each_opt.comment + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) args["{0}_type".format(output_name)] = "Variable" func.__doc__ = tmpl.substitute(args) return func diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 601b06cdd8..66db6fe13f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -296,7 +296,7 @@ def fill_constant_batch_size_like(input, output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. Returns: - ${out_comment} + ${out_comment}. 
""" helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) From 0d29e6592479122288a38a90c751efa6c2afd3ab Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 16:40:09 +0800 Subject: [PATCH 68/93] Add resize_bilinear --- paddle/fluid/operators/bilinear_interp_op.cc | 11 ++++----- .../fluid/layers/layer_function_generator.py | 15 ++++++++---- python/paddle/fluid/layers/nn.py | 23 ++++++++++++------- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 3321adf274..2572e813d6 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor) The input tensor of bilinear interpolation, " + "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of (N x C x h x w)"); AddInput("OutSize", - "(Tensor) This is a 1-D tensor with two number. " + "This is a 1-D tensor with two number. " "The first number is height and the second number is width.") .AsDispensable(); - AddOutput("Out", - "(Tensor) The dimension of output is (N x C x out_h x out_w]"); + AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); - AddAttr("out_h", "(int) output height of bilinear interpolation op."); - AddAttr("out_w", "(int) output width of bilinear interpolation op."); + AddAttr("out_h", "output height of bilinear interpolation op."); + AddAttr("out_w", "output width of bilinear interpolation op."); AddComment(R"DOC( Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. 
H-direction and diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 7dc4c214be..79aa9ff604 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -224,7 +224,7 @@ def autodoc(comment=""): return __impl__ -def templatedoc(): +def templatedoc(op_type=None): """ Decorator of layer function. It will use the docstring from the layer function as the template. The template arguments are: @@ -242,15 +242,20 @@ def templatedoc(): return msg.rstrip('.') def __impl__(func): - op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) tmpl = string.Template(func.__doc__) comment_lines = op_proto.comment.split("\n") comment = "" for line in comment_lines: - line = line.lstrip() - comment += line - comment += "\n" + line = line.strip() + if len(line) != 0: + comment += line + comment += " " args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ddaeb415af..b9ea74fc81 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4037,18 +4037,25 @@ def image_resize(input, return out +@templatedoc(op_type="bilinear_interp") def resize_bilinear(input, out_shape=None, scale=None, name=None): """ - This is an alias of layer 'image_resize' with bilinear interpolation. + ${comment} + + Args: + input(${x_type}): ${x_comment}. + + out_shape(${out_size_type}): ${out_size_comment}. - The mathematical meaning of resize bilinear layer is - Bilinear interpolation. - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this layer) on a rectilinear 2D grid. 
+ scale(float|None): The multiplier for the input height or width. At + least one of out_shape or scale must be set. And out_shape has + a higher priority than scale. Default: None. + + name(str|None): The output variable name. + + Returns: - For details, please refer to Wikipedia: - https://en.wikipedia.org/wiki/Bilinear_interpolation + ${out_comment}. """ return image_resize(input, out_shape, scale, name, 'BILINEAR') From d745840a6ef6f25cae38d9f9e361d6c6b2b96922 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 8 Jun 2018 17:16:59 +0800 Subject: [PATCH 69/93] fix a small compile error on Mac --- paddle/fluid/framework/details/fuse_vars_op_handle.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc index 32415c192f..018c9bff71 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc @@ -42,7 +42,7 @@ void FuseVarsOpHandle::RunImpl() { out_t->ShareDataWith(out_tensor->Slice(s, s + numel)); s += numel; } - this->RunAndRecordEvent([this] {}); + this->RunAndRecordEvent([] {}); } std::string FuseVarsOpHandle::Name() const { return "fuse vars"; } From 145aaa4b491eb8b174650faa8d1f94754abf2945 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Fri, 8 Jun 2018 17:17:20 +0800 Subject: [PATCH 70/93] loose threshold of TRT for CI in different model (#11305) --- paddle/fluid/inference/tensorrt/convert/ut_helper.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 8613d5b1c1..236d169017 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -151,7 +151,8 @@ class TRTConvertValidation { // Compare two output ASSERT_FALSE(fluid_out.empty()); for (size_t i = 0; i < fluid_out.size(); i++) { - 
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6); + // Loose the threshold for CI in different machine model. + EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); } } } From 568c4e5ec43083c81335c1b8094472b844c704d0 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Jun 2018 09:30:17 +0000 Subject: [PATCH 71/93] recommit using account sneaxiy --- paddle/fluid/operators/arg_max_op.cc | 40 +++++ paddle/fluid/operators/arg_max_op.cu | 34 ++++ paddle/fluid/operators/arg_max_op.h | 16 ++ paddle/fluid/operators/arg_min_max_op_base.h | 157 ++++++++++++++++++ paddle/fluid/operators/arg_min_op.cc | 40 +++++ paddle/fluid/operators/arg_min_op.cu | 34 ++++ paddle/fluid/operators/arg_min_op.h | 16 ++ python/paddle/fluid/layers/tensor.py | 64 +++++++ .../tests/unittests/test_arg_min_max_op.py | 82 +++++++++ 9 files changed, 483 insertions(+) create mode 100644 paddle/fluid/operators/arg_max_op.cc create mode 100644 paddle/fluid/operators/arg_max_op.cu create mode 100644 paddle/fluid/operators/arg_max_op.h create mode 100644 paddle/fluid/operators/arg_min_max_op_base.h create mode 100644 paddle/fluid/operators/arg_min_op.cc create mode 100644 paddle/fluid/operators/arg_min_op.cu create mode 100644 paddle/fluid/operators/arg_min_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_arg_min_max_op.py diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc new file mode 100644 index 0000000000..5603607357 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_max_op.h" +/* +REGISTER_ARG_MINMAX_OP_WITHOUT_GRADIENT(arg_max, ArgMax); + +REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CPU); +*/ + +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp, + paddle::operators::ArgMaxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_max, paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu new file mode 100644 index 0000000000..8f57c63beb --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cu @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/arg_max_op.h" + +// REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CUDA); + +REGISTER_OP_CUDA_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.h b/paddle/fluid/operators/arg_max_op.h new file mode 100644 index 0000000000..d232a85699 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.h @@ -0,0 +1,16 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/arg_min_max_op_base.h" diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h new file mode 100644 index 0000000000..8c20461a34 --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -0,0 +1,157 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +enum ArgMinMaxType { kArgMin, kArgMax }; + +template +struct ArgMinMaxFunctor {}; + +#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ + template \ + struct ArgMinMaxFunctor { \ + void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ + framework::LoDTensor& out, int64_t axis) { \ + auto in_eigen = framework::EigenTensor::From(in); \ + auto out_eigen = framework::EigenTensor::From(out); \ + out_eigen.device(*(ctx.eigen_device())) = \ + in_eigen.eigen_op_type(axis).template cast(); \ + } \ + } + +DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); +DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); + +template +class ArgMinMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMinMaxFunctor \ + functor##rank; \ + functor##rank(dev_ctx, x, out, axis) + + switch (x.dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + 
break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_THROW( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + break; + } + } +}; + +template +using ArgMinKernel = + ArgMinMaxKernel; + +template +using ArgMaxKernel = + ArgMinMaxKernel; + +typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + int64_t axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(), + "'axis' must be inside [-Rank(X), Rank(X))"); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + + std::vector vec; + for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); + for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); + ctx->SetOutputDim("Out", framework::make_ddim(vec)); + } +} ArgMinOp, ArgMaxOp; + +class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + virtual const char* OpName() const = 0; + virtual const char* Name() const = 0; + + public: + void Make() override { + AddInput("X", "Input tensor."); + AddOutput("Out", "Output tensor."); + AddAttr("axis", "The axis in which to compute the arg indics."); + AddComment(::paddle::string::Sprintf(R"DOC( + %s Operator. + + Computes the indices of the %s elements of the input tensor's element along the provided axis. 
+)DOC", + OpName(), Name())); + } +}; + +class ArgMinOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMin"; } + const char* Name() const override { return "min"; } +}; + +class ArgMaxOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMax"; } + const char* Name() const override { return "max"; } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc new file mode 100644 index 0000000000..fe17ed711b --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/arg_min_op.h" +/* +REGISTER_ARG_MINMAX_OP_WITHOUT_GRADIENT(arg_min, ArgMin); + +REGISTER_ARG_MINMAX_KERNEL(arg_min, ArgMin, CPU); +*/ + +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinOp, + paddle::operators::ArgMinOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_min, paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu new file mode 100644 index 0000000000..da9262044a --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cu @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/arg_min_op.h" + +// REGISTER_ARG_MINMAX_KERNEL(arg_min, ArgMin, CUDA); + +REGISTER_OP_CUDA_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.h b/paddle/fluid/operators/arg_min_op.h new file mode 100644 index 0000000000..d232a85699 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.h @@ -0,0 +1,16 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/operators/arg_min_max_op_base.h" diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 75d3bf8797..3dfacfff6a 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -30,6 +30,8 @@ __all__ = [ 'assign', 'fill_constant_batch_size_like', 'fill_constant', + 'argmin', + 'argmax', 'ones', 'zeros', ] @@ -315,6 +317,68 @@ def fill_constant_batch_size_like(input, return out +def argmin(x, axis=0): + """ + **argmin** + + This function computes the indices of the min elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the min elements. + axis(int): Axis to compute indices along. 
+ + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmin(x=in, axis=0) + out = fluid.layers.argmin(x=in, axis=-1) + """ + helper = LayerHelper("arg_min", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_min', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + +def argmax(x, axis=0): + """ + **argmax** + + This function computes the indices of the max elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the max elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmax(x=in, axis=0) + out = fluid.layers.argmax(x=in, axis=-1) + """ + helper = LayerHelper("arg_max", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_max', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def ones(shape, dtype, force_cpu=False): """ **ones** diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py new file mode 100644 index 0000000000..e04412f809 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output() + + +class TestCase0(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestCase1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float64' + self.axis = 1 + + +class TestCase2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'int32' + self.axis = 0 + + +if __name__ == '__main__': + unittest.main() From 8c9041f486fd6ac4004e0bd93e829dfa051293b9 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 17:42:22 +0800 Subject: [PATCH 72/93] Refine LinearCRF --- paddle/fluid/operators/linear_chain_crf_op.cc | 2 -- .../fluid/layers/layer_function_generator.py | 10 +++++- .../fluid/layers/learning_rate_scheduler.py | 36 ++++++++++--------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc 
b/paddle/fluid/operators/linear_chain_crf_op.cc index e38525cd7f..a711da3627 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "mini-batch. Note: S is equal to the sequence number in a mini-batch. " "The output is no longer a LoDTensor."); AddComment(R"DOC( -LinearChainCRF Operator. - Conditional Random Field defines an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. CRF learns the conditional probability $P(Y|X)$, where diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 79aa9ff604..cb60a3aec9 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -224,6 +224,9 @@ def autodoc(comment=""): return __impl__ +_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$") + + def templatedoc(op_type=None): """ Decorator of layer function. 
It will use the docstring from the layer @@ -241,6 +244,9 @@ def templatedoc(op_type=None): def trim_ending_dot(msg): return msg.rstrip('.') + def escape_inline_math(msg): + return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg) + def __impl__(func): if op_type is None: op_type_name = func.__name__ @@ -254,8 +260,10 @@ def templatedoc(op_type=None): for line in comment_lines: line = line.strip() if len(line) != 0: - comment += line + comment += escape_inline_math(line) comment += " " + elif len(comment) != 0: + comment += "\n \n " args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index d13c54daa5..716cc7824e 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. +""" import control_flow import nn @@ -22,14 +30,6 @@ __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'polynomial_decay', 'piecewise_decay', 'noam_decay' ] -""" -When training a model, it's often useful to decay the -learning rate during training process, this is called -learning_rate_decay. There are many strategies to do -this, this module will provide some classical method. -User can also implement their own learning_rate_decay -strategy according to this module. 
-""" def _decay_step_counter(begin=0): @@ -41,18 +41,20 @@ def _decay_step_counter(begin=0): def noam_decay(d_model, warmup_steps): - """Apply decay to learning rate. - ```python - lr_value = np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps - ]) - ``` + """ + Noam decay method. The numpy implementation of noam decay as follows. + + >>> import numpy as np + >>> lr_value = np.power(d_model, -0.5) * np.min([ + >>> np.power(current_steps, -0.5), + >>> np.power(warmup_steps, -1.5) * current_steps]) + + Please reference `attention is all you need + `_. Args: d_model(Variable): The dimensionality of input and output of model. - Reference: attention is all you need - https://arxiv.org/pdf/1706.03762.pdf + warmup_steps(Variable): A super parameter. Returns: From dd26329b3c14473ed25620d9724e05ebaadfec87 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Fri, 8 Jun 2018 17:46:59 +0800 Subject: [PATCH 73/93] Remove return types --- python/paddle/fluid/layers/control_flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 51fad6e68c..80e8ff484a 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -736,7 +736,7 @@ def max_sequence_len(rank_table): rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - (${out_type}): ${out_comment}. + ${out_comment}. 
""" helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") From 0fec9469f9f6519e6ee31015c6b57f7efbd9880c Mon Sep 17 00:00:00 2001 From: guochaorong Date: Fri, 8 Jun 2018 17:54:47 +0800 Subject: [PATCH 74/93] fix some bugs introduced by unfreed memory --- paddle/fluid/operators/gather_test.cc | 3 ++- paddle/fluid/operators/math/math_function_test.cc | 2 ++ paddle/fluid/platform/device_tracer.cc | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 9c0561b016..f6b156eb30 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -43,7 +43,8 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::operators::CPUGather(ctx, *src, *index, output); - + delete cpu_place; + cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4); diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 3719a264e9..b545671b43 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -77,6 +77,8 @@ TEST(math_function, gemm_trans_clbas) { paddle::platform::CPUDeviceContext context(*cpu_place); GetBlas(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); + delete cpu_place; + cpu_place = NULL; EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index c9e1063168..3870e69ba7 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -130,6 +130,8 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, uint8_t *buf = 
reinterpret_cast(malloc(kBufSize + kAlignSize)); *size = kBufSize; *buffer = ALIGN_BUFFER(buf, kAlignSize); + free(buf); + buf = NULL; *maxNumRecords = 0; } From ab4942271dde14bc1ce0641ad118225a69ab67aa Mon Sep 17 00:00:00 2001 From: weixing02 Date: Fri, 8 Jun 2018 18:07:20 +0800 Subject: [PATCH 75/93] fix deadlink --- doc/fluid/dev/api_doc_std_cn.md | 9 +++++---- doc/fluid/dev/api_doc_std_en.md | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md index b50f18f21d..7d39b8de1e 100644 --- a/doc/fluid/dev/api_doc_std_cn.md +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -1,8 +1,9 @@ # API注释撰写标准 -- [API注释模块](#API注释模块) -- [格式及示例](#格式及示例) -- [完整示例](#完整示例) +- [API注释撰写标准](#api) + - [API注释模块](#api) + - [格式及示例](#) + - [完整示例](#) ## API注释模块 @@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ## 完整示例 -fc 的完整注释见[示例](src/fc.py)。 +fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md index e57072d52f..f175b21975 100644 --- a/doc/fluid/dev/api_doc_std_en.md +++ b/doc/fluid/dev/api_doc_std_en.md @@ -1,8 +1,9 @@ # API Doc Standard -- [API Doc Structure](#API Doc Structure) -- [Format and Examples](#Format and Examples) -- [Complete Example](#Complete Example) +- [API Doc Standard](#api-doc-standard) + - [API Doc Structure](#api-doc-structure) + - [Format and Examples](#format-and-examples) + - [Complete Example](#complete-example) ## API Doc Structure @@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f ## Complete Example -Complete Example of fc please see [here](src/fc.py)。 +Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 From cde7db85b33a6025518e0367e20727daa8f4a6fb Mon Sep 17 00:00:00 2001 From: guochaorong Date: Fri, 8 Jun 2018 19:01:13 +0800 Subject: 
[PATCH 76/93] fix bad code in python --- python/paddle/fluid/layers/metric.py | 4 ---- .../fluid/tests/unittests/test_dynrnn_gradient_check.py | 3 --- 2 files changed, 7 deletions(-) diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py index cab2eb5551..a1c64ce277 100644 --- a/python/paddle/fluid/layers/metric.py +++ b/python/paddle/fluid/layers/metric.py @@ -64,10 +64,6 @@ def auc(input, label, curve='ROC', num_thresholds=200): topk_indices = helper.create_tmp_variable(dtype="int64") topk_out, topk_indices = nn.topk(input, k=k) auc_out = helper.create_tmp_variable(dtype="float32") - if correct is None: - correct = helper.create_tmp_variable(dtype="int64") - if total is None: - total = helper.create_tmp_variable(dtype="int64") helper.append_op( type="accuracy", inputs={ diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 2232939075..95af51f1b2 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -30,9 +30,6 @@ class Memory(object): assert val.dtype == self.ex.dtype self.cur = val - def ex(self): - return self.ex - def next(self): self.ex = self.cur self.cur = None From c6d230e03e00dfcca7db67a4a8d4069f3d926d6e Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 8 Jun 2018 18:34:53 +0800 Subject: [PATCH 77/93] add FLAGS_use_mkldnn to global control use_mkldnn --- paddle/fluid/framework/executor.cc | 18 ++++++++++++- paddle/fluid/framework/executor.h | 2 ++ .../test_inference_image_classification.cc | 5 +--- .../tests/book/test_inference_nlp.cc | 4 --- paddle/fluid/inference/tests/test_helper.h | 25 ++++++------------- paddle/testing/paddle_gtest_main.cc | 2 +- python/paddle/fluid/__init__.py | 2 +- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc 
index 3d68c5fb87..d4d6c34108 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); namespace paddle { namespace framework { @@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); + if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& feed_holder_name, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); + if (FLAGS_use_mkldnn) EnableMKLDNN(program); bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, unique_ptr_of_copy_program.reset(new ProgramDesc(program)); copy_program = unique_ptr_of_copy_program.get(); } - auto* global_block = copy_program->MutableBlock(0); if (!has_feed_ops) { @@ -378,5 +380,19 @@ void Executor::RunPreparedContext( } } +void Executor::EnableMKLDNN(const ProgramDesc& program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + auto* block = const_cast(program).MutableBlock(bid); + for (auto* op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 0c3c23611d..e6f9c3d31c 100644 --- a/paddle/fluid/framework/executor.h +++ 
b/paddle/fluid/framework/executor.h @@ -81,6 +81,8 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + void EnableMKLDNN(const ProgramDesc& program); + private: const platform::Place place_; }; diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 987da18116..60c761c528 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_bool(skip_cpu, false, "Skip the cpu test"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); TEST(inference, image_classification) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { @@ -59,10 +58,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "Batch size is " << FLAGS_batch_size; - LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn; TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined, - FLAGS_use_mkldnn); + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); LOG(INFO) << output1.dims(); } diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index a0e83a1705..9dcd79c3bb 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -27,7 +27,6 @@ limitations under the License. 
*/ DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); @@ -190,9 +189,6 @@ TEST(inference, nlp) { std::unique_ptr inference_program; inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, /*model combined*/ false); - if (FLAGS_use_mkldnn) { - EnableMKLDNN(inference_program); - } // always prepare context std::unique_ptr ctx; ctx = executor.Prepare(*inference_program, 0); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be6..44c36b1683 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(use_mkldnn); + template void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, T lower, T upper) { @@ -133,24 +135,11 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } -void EnableMKLDNN( - const std::unique_ptr& program) { - for (size_t bid = 0; bid < program->Size(); ++bid) { - auto* block = program->MutableBlock(bid); - for (auto* op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -} - template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, - const int repeat = 1, const bool is_combined = false, - const bool use_mkldnn = false) { + const int repeat = 1, const bool is_combined = false) { // 1. 
Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); @@ -182,9 +171,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - if (use_mkldnn) { - EnableMKLDNN(inference_program); - } } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, @@ -210,7 +196,10 @@ void TestInference(const std::string& dirname, fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; } - // 6. Run the inference program + // 6. If export Flags_use_mkldnn=True, use mkldnn related ops. + if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program); + + // 7. Run the inference program { if (!CreateVars) { // If users don't want to create and destroy variables every time they diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 586ec48477..507479c862 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -30,7 +30,7 @@ int main(int argc, char** argv) { new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c4fad620f0..4914719279 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,7 +116,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope' + 'eager_delete_scope', 'use_mkldnn' ] if core.is_compiled_with_cuda(): read_env_flags += [ From 
56964946d4cca97f4341342d85159a2fd4b54496 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 8 Jun 2018 19:41:48 +0800 Subject: [PATCH 78/93] polish sparse update logic --- .../fluid/operators/detail/request_handler_impl.cc | 12 ++++++++++-- .../fluid/operators/detail/request_handler_impl.h | 5 +++++ paddle/fluid/operators/detail/rpc_server.cc | 13 ------------- paddle/fluid/operators/detail/rpc_server.h | 5 ----- paddle/fluid/operators/listen_and_serv_op.cc | 4 +++- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index b5ee3ab51e..9473dce550 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -64,13 +64,21 @@ bool RequestSendHandler::Handle(const std::string& varname, return false; } if (invar->IsType()) { - rpc_server_->RecordSparseVar(invar); + std::unique_lock lock(mutex_sparse_vars_); + sparse_vars_.push_back(invar); } } - return true; } +void RequestSendHandler::ResetSparseVarRecorder() { + std::unique_lock lock(mutex_sparse_vars_); + for (auto* var : sparse_vars_) { + var->GetMutable()->mutable_rows()->clear(); + } + sparse_vars_.clear(); +} + bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h index 8d0c62232b..443d951914 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/detail/request_handler_impl.h @@ -41,6 +41,11 @@ class RequestSendHandler final : public RequestHandler { virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar) override; + void ResetSparseVarRecorder(); + + private: + std::mutex mutex_sparse_vars_; + std::vector 
sparse_vars_; }; class RequestGetHandler final : public RequestHandler { diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc index 7feddbeca8..448763372a 100644 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ b/paddle/fluid/operators/detail/rpc_server.cc @@ -73,19 +73,6 @@ void RPCServer::ResetBarrierCounter() { t.second = 0; } } -void RPCServer::RecordSparseVar(framework::Variable* sparse_var) { - std::unique_lock lock(mutex_sparse_var_recorder_); - sparse_vars_.push_back(sparse_var); -} - -void RPCServer::ResetSparseVarsRecorder() { - VLOG(3) << "RPCServer reset sparse vars recorder."; - std::unique_lock lock(mutex_sparse_var_recorder_); - for (auto* var : sparse_vars_) { - var->GetMutable()->mutable_rows()->clear(); - } - sparse_vars_.clear(); -} void RPCServer::RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num) { diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h index 94a21ef8d0..f809c13c72 100644 --- a/paddle/fluid/operators/detail/rpc_server.h +++ b/paddle/fluid/operators/detail/rpc_server.h @@ -62,8 +62,6 @@ class RPCServer { void IncreaseBatchBarrier(const std::string rpc_name); void ResetBarrierCounter(); - void RecordSparseVar(framework::Variable* sparse_var); - void ResetSparseVarsRecorder(); protected: virtual void ShutDownImpl() = 0; @@ -77,9 +75,6 @@ class RPCServer { std::atomic cur_cond_; std::condition_variable rpc_cond_; - std::vector sparse_vars_; - std::mutex mutex_sparse_var_recorder_; - protected: std::string bind_address_; std::atomic exit_flag_; diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index ee7b01a54c..66d31c8895 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -146,7 +146,9 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, 
rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); rpc_service_->ResetBarrierCounter(); - rpc_service_->ResetSparseVarsRecorder(); + // reset received sparse vars to avoid reuse it in the next mini-batch + dynamic_cast(request_send_handler_.get()) + ->ResetSparseVarRecorder(); } // while(true) } From 6d32e96096b63663111dcab202be357ed00ef93a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 8 Jun 2018 11:42:13 +0000 Subject: [PATCH 79/93] remove redundant comments --- paddle/fluid/operators/arg_max_op.cc | 5 ----- paddle/fluid/operators/arg_max_op.cu | 2 -- paddle/fluid/operators/arg_min_op.cc | 5 ----- paddle/fluid/operators/arg_min_op.cu | 2 -- 4 files changed, 14 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 5603607357..859cccd1b2 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/arg_max_op.h" -/* -REGISTER_ARG_MINMAX_OP_WITHOUT_GRADIENT(arg_max, ArgMax); - -REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CPU); -*/ REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp, paddle::operators::ArgMaxOpMaker, diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index 8f57c63beb..c9c102bdcc 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -14,8 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/arg_max_op.h" -// REGISTER_ARG_MINMAX_KERNEL(arg_max, ArgMax, CUDA); - REGISTER_OP_CUDA_KERNEL( arg_max, paddle::operators::ArgMaxKernel Date: Fri, 8 Jun 2018 19:49:10 +0800 Subject: [PATCH 80/93] remove chief in test --- python/paddle/fluid/tests/unittests/test_checkpoint.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py index cf70dfd448..e22400a045 100644 --- a/python/paddle/fluid/tests/unittests/test_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py @@ -66,9 +66,9 @@ class TestCheckpoint(unittest.TestCase): exe = fluid.Executor(self.place) for i in xrange(10): - fluid.io.save_checkpoint( - exe, config.checkpoint_dir, self.trainer_id, self.chief, - trainer_args, program, config.max_num_checkpoints) + fluid.io.save_checkpoint(exe, config.checkpoint_dir, + self.trainer_id, trainer_args, program, + config.max_num_checkpoints) if __name__ == '__main__': From 310598f99bf130c62fcd3ec9c64bf986136ddbe5 Mon Sep 17 00:00:00 2001 From: guochaorong <32069604+guochaorong@users.noreply.github.com> Date: Fri, 8 Jun 2018 21:43:55 +0800 Subject: [PATCH 81/93] Update device_tracer.cc --- paddle/fluid/platform/device_tracer.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index b79768eba2..1a9be044e0 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -130,8 +130,6 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, uint8_t *buf = reinterpret_cast(malloc(kBufSize + kAlignSize)); *size = kBufSize; *buffer = ALIGN_BUFFER(buf, kAlignSize); - free(buf); - buf = NULL; *maxNumRecords = 0; } From 52e2eb65b9c0d773abc28d520f318a8def3d6d11 Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Fri, 8 Jun 2018 10:40:37 -0700 Subject: [PATCH 82/93] Fix 
function in fit-a-line with new API (#11020) --- .../tests/book/high-level-api/fit_a_line/test_fit_a_line.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index b3117cf2e5..ad28c9eff5 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -38,7 +38,7 @@ def inference_program(): return y_predict -def linear(): +def train_program(): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = inference_program() @@ -104,7 +104,7 @@ def main(use_cuda): # Directory for saving the trained model params_dirname = "fit_a_line.inference.model" - train(use_cuda, linear, params_dirname) + train(use_cuda, train_program, params_dirname) infer(use_cuda, inference_program, params_dirname) From 637827a5bc80d6e0a17466e44b087f91601539cb Mon Sep 17 00:00:00 2001 From: Jeff Wang Date: Fri, 8 Jun 2018 15:05:25 -0700 Subject: [PATCH 83/93] Use for_test=True in the Fluid Trainer to clone the test program (#11323) * Use for_test=True in the Fluid Trainer to clone the test program * fix typo * Should do the samething to the inferencer --- python/paddle/fluid/inferencer.py | 2 ++ python/paddle/fluid/trainer.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 9f242cf29a..6baac00905 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -56,6 +56,8 @@ class Inferencer(object): else: self.exe = executor.Executor(self.place) + self.inference_program = self.inference_program.clone(for_test=True) + def infer(self, inputs, return_numpy=True): """ :param inputs: a map of {"input_name": input_var} that will be feed into the inference program diff --git 
a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index cdacb41986..ac313b237e 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -115,9 +115,9 @@ class Trainer(object): program_func_outs = train_func() self.train_func_outputs = program_func_outs if isinstance( program_func_outs, list) else [program_func_outs] - self.test_program = self.train_program.clone() + self.test_program = self.train_program.clone(for_test=True) - # The fisrt element of program_func_outs is loss. + # The first element of program_func_outs is loss. loss = self.train_func_outputs[0] optimizer = optimizer_func() From bf03a2094bce7c542dd64c3a29f445e04c68640b Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 10 Jun 2018 13:24:38 +0800 Subject: [PATCH 84/93] fix distribute_transpiler --- python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 27992df462..c7ab300e0f 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -177,6 +177,7 @@ class DistributeTranspiler: dtype=table_grad_var.dtype) for index in range(len(self.pserver_endpoints)) ] + return param_list, grad_list def _init_splited_vars(self, slice_var_up): # update these mappings for further transpile: @@ -199,8 +200,8 @@ class DistributeTranspiler: grad_list.append(g) param_grad_set.add(g.name) - self._update_dist_lookup_table_vars(param_list, grad_list, - self.params_grads) + param_list, grad_list = self._update_dist_lookup_table_vars( + param_list, grad_list, self.params_grads) if slice_var_up: # when we slice var up into blocks, we will slice the var according to From 36031cb50f22297112750691b67ee87132ea3915 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Sun, 10 Jun 2018 23:07:07 +0200 Subject: [PATCH 85/93] MKLDNN 
layout: Support for pool operator --- paddle/fluid/operators/pool_mkldnn_op.cc | 182 ++++++++++++++++------- 1 file changed, 128 insertions(+), 54 deletions(-) diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index a045f9e98d..5341187d1c 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,9 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::pooling_forward; +using framework::DataLayout; +using mkldnn::memory; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial @@ -55,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when saving info into device context + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -82,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + const std::string key = gethash(src_tz, pooling_type, ksize, strides, paddings, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; @@ -94,16 +103,17 @@ class PoolMKLDNNOpKernel : public 
paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { - // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); - auto src_md = - platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); - std::shared_ptr pool_pd = + std::shared_ptr pool_pd = CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, pooling_type, mkldnn_engine); @@ -116,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // save pool_workspace_memory to be referred in backward path dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - auto pool_src_memory_p = std::make_shared( - memory::primitive_desc{src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); - auto pool_dst_memory_p = std::make_shared( - memory::primitive_desc{dst_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + pool_p = std::make_shared(*pool_pd, *(src_memory.get()), + *(dst_memory.get()), + *workspace_memory); - pool_p = std::make_shared( - *pool_pd, *(pool_src_memory_p.get()), 
*(pool_dst_memory_p.get()), - *workspace_memory); dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; } else { // Primitives already exist auto pool_src_memory_p = @@ -140,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); PADDLE_ENFORCE(pool_dst_memory_p != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; } // push primitive to stream and wait until it's executed std::vector pipeline{*(pool_p.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } private: @@ -194,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN && + in_x->format() != memory::format::format_undef, + "Wrong layout/format set for Input X tensor"); + PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN && + out_grad->format() != memory::format::format_undef, + "Wrong layout/format set for Input output_grad tensor"); + std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -212,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + memory::format 
in_x_grad_format{memory::format::format_undef}; std::vector diff_src_tz = paddle::framework::vectorize2int(in_x_grad->dims()); @@ -225,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_workspace_memory = key + "@pool_workspace_memory"; + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()}, + mkldnn_engine}, + to_void_cast(out_grad_data)); + + std::shared_ptr diff_src_memory; + std::shared_ptr diff_dst_memory; + auto dst_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find dst_memory in device context"); + + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; auto pool_bwd_p = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_bwd_p)); if (pool_bwd_p == nullptr) { - auto diff_src_md = - platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto diff_dst_md = - platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + // Retrieve src_memory/dst_memory saved in forward pass + auto src_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); // Retrieve pool_pd/pool_workspace_memory from device context auto pool_pd = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_pd)); PADDLE_ENFORCE(pool_pd != nullptr, "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( 
+ auto workspace_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_workspace_memory)); PADDLE_ENFORCE(workspace_memory != nullptr, "Fail to find workspace_memory in device context"); - auto pool_diff_src_memory_p = std::make_shared(memory( - {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); - dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); - - auto pool_diff_dst_memory_p = std::make_shared( - memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data)))); - dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + // create memory descriptors for pooling + auto diff_src_md = src_memory.get()->get_primitive_desc().desc(); + auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc(); auto pool_bwd_desc = mkldnn::pooling_backward::desc( pooling_type == "max" ? mkldnn::algorithm::pooling_max @@ -267,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( pool_bwd_desc, mkldnn_engine, *pool_pd); + // reorder between user_diff_dst and pool diff_dst if needed + diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + diff_src_memory = std::make_shared( + pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data); + + dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); + pool_bwd_p = std::make_shared( - pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, - *(pool_diff_src_memory_p)); + pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, + *(diff_src_memory)); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { // Primitives already 
exist - auto pool_diff_src_memory_p = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_src_mem_p)); - PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + PADDLE_ENFORCE(diff_src_memory != nullptr, "Fail to find pooling src mem_p in device context"); - auto pool_diff_dst_memory_p = std::static_pointer_cast( + diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); - PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + PADDLE_ENFORCE(diff_dst_memory != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_diff_src_memory_p->set_data_handle( - reinterpret_cast(in_x_grad_data)); - pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + + diff_src_memory->set_data_handle(reinterpret_cast(in_x_grad_data)); + diff_dst_memory->set_data_handle(const_cast(out_grad_data)); + + // reorder between user_diff_dst and pool diff_dst if needed + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } } + in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format; + // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_bwd_p.get())}; + std::vector pipeline; + if (is_diff_dst_reordered) { + pipeline.push_back(reorder_diff_dst); + } + pipeline.push_back(*(pool_bwd_p.get())); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + in_x_grad->set_layout(DataLayout::kMKLDNN); + in_x_grad->set_format(in_x_grad_format); } // Compute() }; } // namespace operators } // namespace paddle +namespace ops = paddle::operators; + REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel); 
REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNGradOpKernel); + ops::PoolMKLDNNGradOpKernel); From 062d5a56b401162ebd0232e42135c87177ad68ec Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 Jun 2018 09:42:54 +0800 Subject: [PATCH 86/93] Add comments to a singleton. (#11333) --- paddle/fluid/framework/data_type.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index b6b93cf422..60382faffb 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -28,6 +28,9 @@ struct DataTypeMap { }; static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex static DataTypeMap& gDataTypeMap() { static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); return *g_data_type_map_; From 59e10922b41f141ae25b5266275f10921a30d92a Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 11 Jun 2018 10:07:54 +0800 Subject: [PATCH 87/93] Expose maxout Python API. (#11278) * Expose maxout API. * Fix code style. 
--- python/paddle/fluid/layers/ops.py | 1 + python/paddle/fluid/tests/unittests/test_layers.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 69cfde852d..3260f81e9e 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -73,6 +73,7 @@ __all__ = [ 'sum', 'polygon_box_transform', 'shape', + 'maxout', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 621a450fa6..8b0ebe3cf5 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -387,6 +387,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_maxout(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[8, 6, 6], dtype="float32") + output = layers.maxout(x=data, groups=2) + self.assertIsNotNone(output) + print(str(program)) + if __name__ == '__main__': unittest.main() From 045589fae4a3bfaf42a9247bff243f0c3864c59d Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 11 Jun 2018 10:27:57 +0800 Subject: [PATCH 88/93] fix compiler error in high-level api --- .../contrib/inference/test_paddle_inference_api_impl.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 4b6cb7b051..5d843010e0 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -109,7 +109,6 @@ void MainWord2Vec(bool use_gpu) { void MainImageClassification(bool use_gpu) { int batch_size = 2; - bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); config.use_gpu = use_gpu; @@ -134,12 +133,8 @@ void 
MainImageClassification(bool use_gpu) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - TestInference(config.model_dir, - cpu_feeds, - cpu_fetchs1, - repeat, - is_combined, - use_mkldnn); + TestInference( + config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined); auto predictor = CreatePaddlePredictor(config); std::vector paddle_tensor_feeds; From 9b43edeae05d0a9419c787f0166b95f6a70ba4f7 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Mon, 11 Jun 2018 12:26:16 +0800 Subject: [PATCH 89/93] Polish arg_min_max_op * Remove unused arg_max/min_op.h * Remove reference parameter. Use pointer insteaded. * undef macro * Always set OutT as int64_t. --- paddle/fluid/operators/arg_max_op.cc | 28 +++++++++---------- paddle/fluid/operators/arg_max_op.cu | 19 ++++++------- paddle/fluid/operators/arg_max_op.h | 16 ----------- paddle/fluid/operators/arg_min_max_op_base.h | 29 +++++++++++--------- paddle/fluid/operators/arg_min_op.cc | 28 +++++++++---------- paddle/fluid/operators/arg_min_op.cu | 19 ++++++------- paddle/fluid/operators/arg_min_op.h | 16 ----------- 7 files changed, 60 insertions(+), 95 deletions(-) delete mode 100644 paddle/fluid/operators/arg_max_op.h delete mode 100644 paddle/fluid/operators/arg_min_op.h diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 859cccd1b2..8174d37358 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -12,24 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/arg_max_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" -REGISTER_OPERATOR(arg_max, paddle::operators::ArgMaxOp, +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( - arg_max, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index c9c102bdcc..a147d77a9e 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/arg_max_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OP_CUDA_KERNEL( arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, paddle::operators::ArgMaxKernel, + int32_t>, paddle::operators::ArgMaxKernel, + int16_t>, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, + size_t>, paddle::operators::ArgMaxKernel); + uint8_t>); diff --git a/paddle/fluid/operators/arg_max_op.h b/paddle/fluid/operators/arg_max_op.h deleted file mode 100644 index d232a85699..0000000000 --- a/paddle/fluid/operators/arg_max_op.h +++ /dev/null @@ -1,16 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/operators/arg_min_max_op_base.h" diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 8c20461a34..6cbdaefeda 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include "paddle/fluid/framework/ddim.h" @@ -37,9 +38,9 @@ struct ArgMinMaxFunctor {}; struct ArgMinMaxFunctor { \ void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor& out, int64_t axis) { \ + framework::LoDTensor* out, int64_t axis) { \ auto in_eigen = framework::EigenTensor::From(in); \ - auto out_eigen = framework::EigenTensor::From(out); \ + auto out_eigen = framework::EigenTensor::From(*out); \ out_eigen.device(*(ctx.eigen_device())) = \ in_eigen.eigen_op_type(axis).template cast(); \ } \ @@ -62,7 +63,7 @@ class ArgMinMaxKernel : public framework::OpKernel { #define CALL_ARG_MINMAX_FUNCTOR(rank) \ ArgMinMaxFunctor \ functor##rank; \ - functor##rank(dev_ctx, x, out, axis) + functor##rank(dev_ctx, x, &out, axis) switch (x.dims().size()) { case 1: @@ -89,19 +90,20 @@ class ArgMinMaxKernel : public framework::OpKernel { "than 6.", (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax")); break; +#undef CALL_ARG_MINMAX_FUNCTOR } } }; -template +template using ArgMinKernel = - ArgMinMaxKernel; + ArgMinMaxKernel; -template +template using ArgMaxKernel = - ArgMinMaxKernel; + ArgMinMaxKernel; -typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel { +class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -121,7 +123,7 @@ typedef class BaseArgMinMaxOp : public framework::OperatorWithKernel { for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); ctx->SetOutputDim("Out", framework::make_ddim(vec)); } -} ArgMinOp, ArgMaxOp; +}; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { protected: @@ -133,12 +135,13 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input tensor."); AddOutput("Out", "Output tensor."); AddAttr("axis", "The axis in which to compute the arg indics."); - AddComment(::paddle::string::Sprintf(R"DOC( - %s Operator. + AddComment(string::Sprintf(R"DOC( + %s Operator. - Computes the indices of the %s elements of the input tensor's element along the provided axis. + Computes the indices of the %s elements of the input tensor's element + along the provided axis. )DOC", - OpName(), Name())); + OpName(), Name())); } }; diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 18c0884a04..41f188029f 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,24 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/arg_min_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" -REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinOp, +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( - arg_min, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu index 6d5aaa9596..4d02050850 100644 --- a/paddle/fluid/operators/arg_min_op.cu +++ b/paddle/fluid/operators/arg_min_op.cu @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/arg_min_op.h" +#include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OP_CUDA_KERNEL( arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel, + int32_t>, paddle::operators::ArgMinKernel, + int16_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, + size_t>, paddle::operators::ArgMinKernel); + uint8_t>); diff --git a/paddle/fluid/operators/arg_min_op.h b/paddle/fluid/operators/arg_min_op.h deleted file mode 100644 index d232a85699..0000000000 --- a/paddle/fluid/operators/arg_min_op.h +++ /dev/null @@ -1,16 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/operators/arg_min_max_op_base.h" From a1254a86bab4fd99a7801fd3eb3f3cccb0130ba1 Mon Sep 17 00:00:00 2001 From: yuyang18 Date: Mon, 11 Jun 2018 12:43:41 +0800 Subject: [PATCH 90/93] Add lock to record_event. --- paddle/fluid/platform/device_context.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 6b82d93237..292ffef1ae 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include // NOLINT #include #include #include @@ -100,6 +101,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -116,6 +118,8 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; + + std::mutex mtx_; }; template <> From 4b6d584f4fb3741fe3d3268c36b54b8469444f60 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 11 Jun 2018 06:01:41 +0000 Subject: [PATCH 91/93] fix identifier error of 'dshape' --- benchmark/fluid/models/vgg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 6092cdeb88..932601302d 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -82,7 +82,8 @@ def get_model(args): data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) else: - images = fluid.layers.data(name='data', shape=dshape, dtype='float32') + images = fluid.layers.data( + name='data', shape=data_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program From 1cfd3cb13b0ea4bf091757d59ec24e021563518f Mon Sep 17 00:00:00 2001 From: Qiyang Min Date: Mon, 11 Jun 2018 03:11:22 -0500 Subject: [PATCH 92/93] Add some dist-training robust cases into fluid benchmark test (#11207) * 1. add weight decay feature into fluid benchmark test 2. add learning rate decay feature into fluid benchmark test 3. add L1&L2 regularization feature into fluid benchmark test 4. add error clipping feature into fluid benchmark test 5. add gradient clipping feature into fluid benchmark test * Add some document to README.md under benchmark/fluid/ repo * Add model_base.py * Fix bugs in test_listen_and_serv_op * 1. remove args out of fluid_benchmark.py 2. 
remove lr_decay, regularization, clipping out of fluid_benchmark.py * add async_mode description to doc and remove the clipping description out * for restart build * to restart build * remove optimization args from args.py * 1. remove optimization from models 2. fix bug in test_listen_and_serv_op * change the name retry_times to left_time * change retry_times to the pserver start left time --- benchmark/fluid/README.md | 4 +- benchmark/fluid/args.py | 126 ++++++++++++++++++ benchmark/fluid/fluid_benchmark.py | 114 ++-------------- .../unittests/test_listen_and_serv_op.py | 9 +- 4 files changed, 143 insertions(+), 110 deletions(-) create mode 100644 benchmark/fluid/args.py diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index f40f3c1297..28cade4634 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,10 +24,12 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 0000000000..68a3d42d7a --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,126 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. 
+ parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' 
+ ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 62a05234c4..902dca209f 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,108 +24,7 @@ import paddle.fluid.core as core import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The batch size on each gpu.') - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', - type=int, - default=80, - help='The number of minibatches, set to -1 to run all batches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - # this option is available only for vgg and resnet. 
- parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers', 'imagenet'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--use_reader_op', - action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' 
- ) - parser.add_argument( - '--data_path', - type=str, - default="", - help='Directory that contains all the training recordio files.') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): @@ -160,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -182,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -417,7 +321,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027ddc..d1d709551c 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,17 +70,18 @@ class TestListenAndServOp(OpTest): return p.pid def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + start_left_time = self.ps_timeout + sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a 
file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. From 627d7a64f8cb6cac25e6bc0ac524f48c430fd62a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 Jun 2018 16:54:50 +0800 Subject: [PATCH 93/93] Clean `sendop` `recv` operator. (#11309) --- .../details/multi_devices_graph_builder.cc | 8 +- paddle/fluid/operators/CMakeLists.txt | 11 +- paddle/fluid/operators/recv_op.cc | 8 +- paddle/fluid/operators/send_op.cc | 50 +++------ paddle/fluid/operators/send_vars_op.cc | 101 ------------------ .../tests/unittests/test_dist_transpiler.py | 5 +- .../unittests/test_simple_dist_transpiler.py | 4 +- .../fluid/transpiler/distribute_transpiler.py | 14 +-- 8 files changed, 39 insertions(+), 162 deletions(-) delete mode 100644 paddle/fluid/operators/send_vars_op.cc diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 97242ebf2a..b568c1344b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -89,7 +89,7 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( for (auto *op : program.Block(0).AllOps()) { // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string - if (op->Type() == "send_vars") { + if (op->Type() == "send") { auto op_vars = op->InputArgumentNames(); send_vars.reserve(send_vars.size() + std::distance(op_vars.begin(), op_vars.end())); @@ -468,17 +468,17 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send_vars"); + 
ConnectOp(result, result->ops_.back().get(), "send"); } else if (op.Type() == "recv") { ConnectOp(result, result->ops_.back().get(), "send_barrier"); } else if (op.Type() == "fetch_barrier") { ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send_vars") { + } else if (op.Type() == "send") { // do nothing } else { PADDLE_THROW( "rpc op should be in [" - "send_vars, send_barrier. recv, fetch_barrier]"); + "send, send_barrier. recv, fetch_barrier]"); } // TODO(Yancey1989): schedule rpc op on different place may diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5e86b16ba1..0f4050250d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -189,16 +189,14 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) 
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -208,15 +206,14 @@ if(WITH_DISTRIBUTE) # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op - listen_and_serv_op executor SERIAL) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 1ea1cc458b..4198c3ee56 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -78,9 +78,15 @@ This operator can get variables from server side. } }; +class RecvOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); +REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker, + ops::RecvOpMaker, ops::RecvOpShapeInference); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 9697579707..2a2fe53c71 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/grpc_client.h" @@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { auto ins = Inputs("X"); - auto outs = Outputs("Out"); - std::vector epmap = Attr>("epmap"); - std::vector endpoints = - Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); + std::vector epmap = Attr>("epmap"); + int sync_send = Attr("sync_mode"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); @@ -55,32 +51,14 @@ class SendOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + // TODO(Yancey1989): we need to use an IO threadpool which has + // a larger number of threads than the computing threadpool. 
rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - rpc_client->Wait(); - - if (sync_mode) { - for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - rpc_client->Wait(); - } - - if (outs.size() > 0) { - for (size_t i = 0; i < outs.size(); i++) { - VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); - } - rpc_client->Wait(); - // tell pservers that current trainer have called fetch - for (auto& ep : endpoints) { - VLOG(2) << "send fetch barrier, ep: " << ep; - rpc_client->AsyncSendFetchBarrier(ep); - } + if (sync_send) { rpc_client->Wait(); } } @@ -89,26 +67,22 @@ class SendOp : public framework::OperatorBase { class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); - AddOutput("Out", "(Tensor) Output tensor to be received from server") + AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); AddComment(R"DOC( Send operator -This operator will send tensor to recv_op at the parameter server. +This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - // TODO(typhoonzero): remove this attr generate de-duplicated vector from - // epmap when initializing. 
- AddAttr>("endpoints", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints to send variables to.") - .SetDefault({}); + AddAttr("sync_mode", + "(int, default 0)" + "sync send or async send.") + .SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" "Server endpoints in the order of input " "variables for mapping") - .SetDefault({}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); + .SetDefault({"127.0.0.1:6164"}); } }; diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc deleted file mode 100644 index 564e40461f..0000000000 --- a/paddle/fluid/operators/send_vars_op.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/send_recv_util.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { - -class SendVarsOp : public framework::OperatorBase { - public: - SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto ins = Inputs("X"); - - std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_send"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); - - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; - } - } - if (sync_send) { - rpc_client->Wait(); - } - } -}; - -class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") - .AsDuplicable(); - AddComment(R"DOC( -Send operator - -This operator will send variables to listen_and_serve op at the parameter server. 
-)DOC"); - AddAttr("sync_send", - "(int, default 0)" - "sync send or async send.") - .SetDefault(0); - AddAttr>("epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input " - "variables for mapping") - .SetDefault({"127.0.0.1:6164"}); - } -}; - -class SendVarsOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(send_vars, ops::SendVarsOp, - paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, - ops::SendVarsOpShapeInference); diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 32647f9aa8..b4379ad447 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops @@ -54,10 +55,10 @@ class TestDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send_vars", "send_barrier", "recv", "recv", + "split_byref", "send", "send_barrier", "recv", "recv", "fetch_barrier", "concat" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py index 5ae2844e29..f4aa7426bc 100644 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -59,9 +59,9 @@ class TestSimpleDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "send_vars", "send_barrier", "recv", "recv", "fetch_barrier" + "send", "send_barrier", "recv", "recv", "fetch_barrier" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops def _transpiler_instance(self): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index c7ab300e0f..5b299a0f29 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -24,9 +24,9 @@ Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. -4. 
append send_op to send splited variables to server and fetch - params(splited blocks or origin param) from server. -5. append concat_op to merge splited blocks to update local weights. +4. append send_op to send splited variables to server and +5. add recv_op to fetch params(splited blocks or origin param) from server. +6. append concat_op to merge splited blocks to update local weights. Steps to transpile pserver: 1. create new program for parameter server. @@ -317,7 +317,7 @@ class DistributeTranspiler: program.global_block().insert_op( index=index + 1, - type="send_vars", + type="send", inputs={"X": splited_vars}, outputs={}, attrs={ @@ -678,7 +678,7 @@ class DistributeTranspiler: break def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): - # 2. add split_ids_op and send_vars_op to send gradient to pservers + # 2. add split_ids_op and send_op to send gradient to pservers # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -695,11 +695,11 @@ class DistributeTranspiler: outputs={"Out": self.trainer_side_table_grad_list}) program.global_block().insert_op( index=op_index + 2, - type="send_vars", + type="send", inputs={'X': self.trainer_side_table_grad_list}, outputs={}, attrs={ - "sync_send": True, + "sync_mode": True, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE })