From e896926b9c3c8bed544f92ad82d42daac9fac0c0 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Tue, 5 Jun 2018 15:42:29 +0800
Subject: [PATCH 01/28] add unit test for dist mnist

---
 .../fluid/tests/unittests/CMakeLists.txt      |   1 +
 .../fluid/tests/unittests/test_dist_mnist.py  | 208 ++++++++++++++++++
 2 files changed, 209 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist.py

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index c33539f6b5..2f2e9c96c1 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -52,3 +52,4 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
 # since load cudnn libraries, so we use a longer timeout to make this
 # unit test stability.
 set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30)
+set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 60)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
new file mode 100644
index 0000000000..e6bc56cce1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -0,0 +1,208 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+
+SEED = 1
+DTYPE = "float32"
+
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def get_model(batch_size):
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = cnn_model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=batch_size)
+    opt.minimize(avg_cost)
+    return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+def run_pserver(pserver_endpoints, trainers, current_endpoint):
+    get_model(batch_size=20)
+    t = get_transpiler(0,
+                       fluid.default_main_program(), pserver_endpoints,
+                       trainers)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    exe.run(pserver_prog)
+
+
+class TestDistMnist(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 1
+        self._pservers = 1
+        self._ps_endpoints = "127.0.0.1:9123"
+
+    def start_pserver(self, endpoint):
+        p = Process(
+            target=run_pserver,
+            args=(self._ps_endpoints, self._trainers, endpoint))
+        p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 5
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(1)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def stop_pserver(self, pid):
+        os.kill(pid, signal.SIGTERM)
+
+    def test_with_place(self):
+        p = fluid.CUDAPlace() if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        pserver_pid = self.start_pserver(self._ps_endpoints)
+        self._wait_ps_ready(pserver_pid)
+
+        self.run_trainer(p, 0)
+
+        self.stop_pserver(pserver_pid)
+
+    def run_trainer(self, place, trainer_id):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
+            batch_size=20)
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), self._ps_endpoints,
+                           self._trainers)
+
+        trainer_prog = t.get_trainer_program()
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        for pass_id in xrange(10):
+            for batch_id, data in enumerate(train_reader()):
+                exe.run(trainer_prog, feed=feeder.feed(data))
+
+                if (batch_id + 1) % 10 == 0:
+                    acc_set = []
+                    avg_loss_set = []
+                    for test_data in test_reader():
+                        acc_np, avg_loss_np = exe.run(
+                            program=test_program,
+                            feed=feeder.feed(test_data),
+                            fetch_list=[batch_acc, avg_cost])
+                        acc_set.append(float(acc_np))
+                        avg_loss_set.append(float(avg_loss_np))
+                    # get test acc and loss
+                    acc_val = np.array(acc_set).mean()
+                    avg_loss_val = np.array(avg_loss_set).mean()
+                    if float(acc_val
+                             ) > 0.2:  # Smaller value to increase CI speed
+                        return
+                    else:
+                        print(
+                            'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                            format(pass_id, batch_id + 1,
+                                   float(avg_loss_val), float(acc_val)))
+                        if math.isnan(float(avg_loss_val)):
+                            assert ("got Nan loss, training failed.")
+
+
+if __name__ == "__main__":
+    unittest.main()

From a158bd9173d745b4969e39fcc1c00681a791d251 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Tue, 5 Jun 2018 17:15:18 +0800
Subject: [PATCH 02/28] fix ci

---
 python/paddle/fluid/tests/unittests/test_dist_mnist.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index e6bc56cce1..e4d39c8b3f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -27,6 +27,7 @@ import signal
 
 SEED = 1
 DTYPE = "float32"
+paddle.dataset.mnist.fetch()
 
 
 # random seed must set before configuring the network.
@@ -147,7 +148,7 @@ class TestDistMnist(unittest.TestCase):
         os.kill(pid, signal.SIGTERM)
 
     def test_with_place(self):
-        p = fluid.CUDAPlace() if core.is_compiled_with_cuda(
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
 
         pserver_pid = self.start_pserver(self._ps_endpoints)

From df7a1471fd1c2c003a8df1af152cd1da28f7f568 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Tue, 5 Jun 2018 19:21:35 +0800
Subject: [PATCH 03/28] fix fetch mnist dataset failed

---
 python/paddle/v2/dataset/mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 9f675bed89..2b959c48e4 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -112,7 +112,7 @@ def fetch():
     paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
     paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
 
 
 def convert(path):

From cdb705d19360cdf2ad4de0bbe4f129a70f6ca28c Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Wed, 6 Jun 2018 13:27:22 +0800
Subject: [PATCH 04/28] fix mnist dataset md5

---
 python/paddle/dataset/mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index 6a1b8b5fac..9d05aeeb95 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -111,7 +111,7 @@ def fetch():
     paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
     paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
 
 
 def convert(path):

From 05b6aa180594e379adfa4d5ba44d3e9ac937f5b2 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Wed, 6 Jun 2018 14:02:32 +0800
Subject: [PATCH 05/28] increase dist unit test timeout

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 2f2e9c96c1..32176fc7e5 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -51,5 +51,5 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
 # FIXME(Yancey1989): this test would cost much more time on CUDAPlace
 # since load cudnn libraries, so we use a longer timeout to make this
 # unit test stability.
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30)
-set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 60)
+set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 60)
+set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)

From e9ed62bfed91fa86906b805f6cf22c3c7e51490d Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 26 Jun 2018 11:10:08 +0800
Subject: [PATCH 06/28] make framework.Parameter public

---
 python/paddle/fluid/framework.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index db21b1f3c0..6c6f90a0cf 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -27,6 +27,7 @@ __all__ = [
     'Variable',
     'Program',
     'Operator',
+    'Parameter',
     'default_startup_program',
     'default_main_program',
     'program_guard',
@@ -1905,7 +1906,7 @@ def program_guard(main_program, startup_program=None):
 def get_var(name, program=None):
     """
     Get a variable by name from the global block of a program.
-    
+
     Args:
         name(str): name of the variable
         program(Program|None): program object.

From e09ac3df188a8a9ce68845b606ffe301f0eaed7b Mon Sep 17 00:00:00 2001
From: Kexin Zhao <zhaokexin01@baidu.com>
Date: Mon, 25 Jun 2018 22:01:48 -0700
Subject: [PATCH 07/28] replace lod name with recur_seq_lens

---
 doc/fluid/design/concepts/lod_tensor.md       | 20 ++++++
 .../test_label_semantic_roles_newapi.py       | 28 ++++-----
 .../test_machine_translation.py               | 12 ++--
 .../test_recommender_system_newapi.py         | 12 ++--
 .../test_understand_sentiment_conv.py         | 10 +--
 .../test_understand_sentiment_dynamic_rnn.py  | 10 +--
 .../test_understand_sentiment_stacked_lstm.py | 10 +--
 .../word2vec/test_word2vec_new_api.py         | 19 +++---
 .../tests/book/notest_understand_sentiment.py | 16 +++--
 .../tests/book/test_label_semantic_roles.py   | 62 ++++++++++++++-----
 .../tests/book/test_machine_translation.py    | 14 +++--
 .../tests/book/test_recommender_system.py     | 12 ++--
 .../tests/book/test_rnn_encoder_decoder.py    | 14 ++---
 .../paddle/fluid/tests/book/test_word2vec.py  | 21 ++++---
 14 files changed, 163 insertions(+), 97 deletions(-)

diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index d606d7a790..748488f6d5 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows:
 
 ## Slicing of LoD Tensors
 
+
 When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
 
 For example, the <2>-slice of above example is
@@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is
 10  12
   ||
 ```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. 
+Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(lod)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted 
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
index 0ccb3a39e0..67aa21e8c5 100755
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -206,35 +206,35 @@ def infer(use_cuda, inference_program, params_dirname):
     inferencer = fluid.Inferencer(
         inference_program, param_path=params_dirname, place=place)
 
-    # Setup inputs by creating LoDTensors to represent sequences of words.
-    # Here each word is the basic element of these LoDTensors and the shape of 
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
     # each word (base_shape) should be [1] since it is simply an index to 
     # look up for the corresponding word vector.
-    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-    # which has only one lod level. Then the created LoDTensors will have only 
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
     # one higher level structure (sequence of words, or sentence) than the basic 
     # element (word). Hence the LoDTensor will hold data for three sentences of 
     # length 3, 4 and 2, respectively. 
-    # Note that lod info should be a list of lists.
-    lod = [[3, 4, 2]]
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
     # The range of random integers is [low, high]
     word = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
     ctx_n2 = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
     ctx_n1 = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
     ctx_0 = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
     ctx_p1 = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
     ctx_p2 = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
     pred = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
     mark = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
 
     results = inferencer.infer(
         {
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
index c4b37df3a0..31d756503a 100644
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -215,11 +215,13 @@ def decode_main(use_cuda, is_sparse):
         [1. for _ in range(batch_size)], dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_lod = [1] * batch_size
-    init_lod = [init_lod, init_lod]
+    init_recursive_seq_lens = [1] * batch_size
+    init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
 
-    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
-    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+    init_ids = fluid.create_lod_tensor(init_ids_data,
+                                       init_init_recursive_seq_lens, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data,
+                                          init_init_recursive_seq_lens, place)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
@@ -243,7 +245,7 @@ def decode_main(use_cuda, is_sparse):
             feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
-        print result_ids.lod()
+        print result_ids.recursive_sequence_lengths()
         break
 
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
index 090c11ce1e..c860f16417 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -209,13 +209,15 @@ def infer(use_cuda, inference_program, params_dirname):
         inference_program, param_path=params_dirname, place=place)
 
     # Use the first data from paddle.dataset.movielens.test() as input.
-    # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
-    # where `data` is a list of sequences of index numbers, `lod` is 
-    # the level of detail (lod) info associated with `data`.
+    # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
+    # to generate LoD Tensor where `data` is a list of sequences of index 
+    # numbers, `recursive_sequence_lengths` is the length-based level of detail 
+    # (lod) info associated with `data`.
     # For example, data = [[10, 2, 3], [2, 3]] means that it contains
     # two sequences of indexes, of length 3 and 2, respectively.
-    # Correspondingly, lod = [[3, 2]] contains one level of detail info,
-    # indicating that `data` consists of two sequences of length 3 and 2. 
+    # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
+    # level of detail info, indicating that `data` consists of two sequences 
+    # of length 3 and 2, respectively. 
     user_id = fluid.create_lod_tensor([[1]], [[1]], place)
     gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
     age_id = fluid.create_lod_tensor([[0]], [[1]], place)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
index 9b61f7a00c..1668ae83d3 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -128,17 +128,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
     # Here each word is the basic element of the LoDTensor and the shape of 
     # each word (base_shape) should be [1] since it is simply an index to 
     # look up for the corresponding word vector.
-    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-    # which has only one lod level. Then the created LoDTensor will have only 
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
     # one higher level structure (sequence of words, or sentence) than the basic 
     # element (word). Hence the LoDTensor will hold data for three sentences of 
     # length 3, 4 and 2, respectively. 
-    # Note that lod info should be a list of lists.
-    lod = [[3, 4, 2]]
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
     # The range of random integers is [low, high]
     tensor_words = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
     results = inferencer.infer({'words': tensor_words})
     print("infer results: ", results)
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
index aa7c567b4d..8da89d82cb 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -143,17 +143,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
     # Here each word is the basic element of the LoDTensor and the shape of 
     # each word (base_shape) should be [1] since it is simply an index to 
     # look up for the corresponding word vector.
-    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-    # which has only one lod level. Then the created LoDTensor will have only 
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
     # one higher level structure (sequence of words, or sentence) than the basic 
     # element (word). Hence the LoDTensor will hold data for three sentences of 
     # length 3, 4 and 2, respectively. 
-    # Note that lod info should be a list of lists.
-    lod = [[3, 4, 2]]
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
     # The range of random integers is [low, high]
     tensor_words = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
     results = inferencer.infer({'words': tensor_words})
     print("infer results: ", results)
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 8c74be0f08..74faa2e8aa 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -138,17 +138,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
     # Here each word is the basic element of the LoDTensor and the shape of 
     # each word (base_shape) should be [1] since it is simply an index to 
     # look up for the corresponding word vector.
-    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-    # which has only one lod level. Then the created LoDTensor will have only 
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
     # one higher level structure (sequence of words, or sentence) than the basic 
     # element (word). Hence the LoDTensor will hold data for three sentences of 
     # length 3, 4 and 2, respectively. 
-    # Note that lod info should be a list of lists.
-    lod = [[3, 4, 2]]
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
     # The range of random integers is [low, high]
     tensor_words = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
     results = inferencer.infer({'words': tensor_words})
     print("infer results: ", results)
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
index ba44f72d9b..02e65cf56c 100644
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
@@ -124,21 +124,22 @@ def infer(use_cuda, inference_program, params_dirname=None):
 
     # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
     # is simply an index to look up for the corresponding word vector and hence 
-    # the shape of word (base_shape) should be [1]. The length-based level of 
-    # detail (lod) info of each LoDtensor should be [[1]] meaning there is only 
-    # one lod_level and there is only one sequence of one word on this level.
-    # Note that lod info should be a list of lists.
-    lod = [[1]]
+    # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
+    # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
+    # meaning there is only one level of detail and there is only one sequence of 
+    # one word on this level.
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[1]]
     base_shape = [1]
     # The range of random integers is [low, high]
     first_word = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=dict_size - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
     second_word = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=dict_size - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
     third_word = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=dict_size - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
     fourth_word = fluid.create_random_int_lodtensor(
-        lod, base_shape, place, low=0, high=dict_size - 1)
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
 
     result = inferencer.infer(
         {
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 5d9a47c9ba..1df7b99aad 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -238,17 +238,21 @@ def infer(word_dict, use_cuda, save_dirname=None):
         # Here each word is the basic element of the LoDTensor and the shape of 
         # each word (base_shape) should be [1] since it is simply an index to 
         # look up for the corresponding word vector.
-        # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-        # which has only one lod level. Then the created LoDTensor will have only 
+        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+        # which has only one level of detail. Then the created LoDTensor will have only 
         # one higher level structure (sequence of words, or sentence) than the basic 
         # element (word). Hence the LoDTensor will hold data for three sentences of 
         # length 3, 4 and 2, respectively. 
-        # Note that lod info should be a list of lists.
-        lod = [[3, 4, 2]]
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[3, 4, 2]]
         base_shape = [1]
         # The range of random integers is [low, high]
         tensor_words = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -257,7 +261,7 @@ def infer(word_dict, use_cuda, save_dirname=None):
                           feed={feed_target_names[0]: tensor_words},
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].lod())
+        print(results[0].recursive_sequence_lengths())
         np_data = np.array(results[0])
         print("Inference Shape: ", np_data.shape)
         print("Inference results: ", np_data)
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index e214ced0b5..d489feae9c 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -247,35 +247,67 @@ def infer(use_cuda, save_dirname=None):
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
-        # Setup inputs by creating LoDTensors to represent sequences of words.
-        # Here each word is the basic element of these LoDTensors and the shape of 
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
         # each word (base_shape) should be [1] since it is simply an index to 
         # look up for the corresponding word vector.
-        # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
-        # which has only one lod level. Then the created LoDTensors will have only 
+        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+        # which has only one level of detail. Then the created LoDTensor will have only 
         # one higher level structure (sequence of words, or sentence) than the basic 
         # element (word). Hence the LoDTensor will hold data for three sentences of 
         # length 3, 4 and 2, respectively. 
-        # Note that lod info should be a list of lists.
-        lod = [[3, 4, 2]]
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[3, 4, 2]]
         base_shape = [1]
         # The range of random integers is [low, high]
         word = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
         pred = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=pred_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=pred_dict_len - 1)
         ctx_n2 = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
         ctx_n1 = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
         ctx_0 = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
         ctx_p1 = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
         ctx_p2 = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=word_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
         mark = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=mark_dict_len - 1)
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=mark_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -301,7 +333,7 @@ def infer(use_cuda, save_dirname=None):
                           },
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].lod())
+        print(results[0].recursive_sequence_lengths())
         np_data = np.array(results[0])
         print("Inference Shape: ", np_data.shape)
 
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index 372d6ec822..a68eae5bcb 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse):
         pre_state = pd.array_read(array=state_array, i=counter)
         pre_score = pd.array_read(array=scores_array, i=counter)
 
-        # expand the lod of pre_state to be the same with pre_score
+        # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
         pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
 
         pre_ids_emb = pd.embedding(
@@ -238,11 +238,13 @@ def decode_main(use_cuda, is_sparse):
         [1. for _ in range(batch_size)], dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_lod = [1] * batch_size
-    init_lod = [init_lod, init_lod]
+    init_recursive_seq_lens = [1] * batch_size
+    init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
 
-    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
-    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
+                                       place)
+    init_scores = fluid.create_lod_tensor(init_scores_data,
+                                          init_recursive_seq_lens, place)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
@@ -266,7 +268,7 @@ def decode_main(use_cuda, is_sparse):
             feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
-        print result_ids.lod()
+        print result_ids.recursive_sequence_lengths()
         break
 
 
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 937d8dd5b0..6548766ef5 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -260,13 +260,15 @@ def infer(use_cuda, save_dirname=None):
 
         # Use the first data from paddle.dataset.movielens.test() as input
         assert feed_target_names[0] == "user_id"
-        # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
-        # where `data` is a list of sequences of index numbers, `lod` is 
-        # the level of detail (lod) info associated with `data`.
+        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
+        # to generate LoD Tensor where `data` is a list of sequences of index 
+        # numbers, `recursive_sequence_lengths` is the length-based level of detail 
+        # (lod) info associated with `data`.
         # For example, data = [[10, 2, 3], [2, 3]] means that it contains
         # two sequences of indexes, of length 3 and 2, respectively.
-        # Correspondingly, lod = [[3, 2]] contains one level of detail info,
-        # indicating that `data` consists of two sequences of length 3 and 2. 
+        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
+        # level of detail info, indicating that `data` consists of two sequences 
+        # of length 3 and 2, respectively. 
         user_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[1] == "gender_id"
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index 7ada57def6..4672826241 100644
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -216,19 +216,19 @@ def infer(use_cuda, save_dirname=None):
         # Here each word is the basic element of the LoDTensor and the shape of 
         # each word (base_shape) should be [1] since it is simply an index to 
         # look up for the corresponding word vector.
-        # Suppose the length_based level of detail (lod) info is set to [[4, 6]],
-        # which has only one lod level. Then the created LoDTensor will have only 
+        # Suppose the recursive_sequence_lengths info is set to [[4, 6]],
+        # which has only one level of detail. Then the created LoDTensor will have only 
         # one higher level structure (sequence of words, or sentence) than the basic 
         # element (word). Hence the LoDTensor will hold data for two sentences of 
         # length 4 and 6, respectively. 
-        # Note that lod info should be a list of lists.
-        lod = [[4, 6]]
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[4, 6]]
         base_shape = [1]
         # The range of random integers is [low, high]
         word_data = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=1)
+            recursive_seq_lens, base_shape, place, low=0, high=1)
         trg_word = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=1)
+            recursive_seq_lens, base_shape, place, low=0, high=1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -241,7 +241,7 @@ def infer(use_cuda, save_dirname=None):
                           },
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].lod())
+        print(results[0].recursive_sequence_lengths())
         np_data = np.array(results[0])
         print("Inference shape: ", np_data.shape)
         print("Inference results: ", np_data)
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 75bed06bd7..49bd72c7a5 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -168,21 +168,22 @@ def infer(use_cuda, save_dirname=None):
 
         # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
         # is simply an index to look up for the corresponding word vector and hence 
-        # the shape of word (base_shape) should be [1]. The length-based level of 
-        # detail (lod) info of each LoDtensor should be [[1]] meaning there is only 
-        # one lod_level and there is only one sequence of one word on this level.
-        # Note that lod info should be a list of lists.
-        lod = [[1]]
+        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
+        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
+        # meaning there is only one level of detail and there is only one sequence of 
+        # one word on this level.
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[1]]
         base_shape = [1]
         # The range of random integers is [low, high]
         first_word = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=dict_size - 1)
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
         second_word = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=dict_size - 1)
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
         third_word = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=dict_size - 1)
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
         fourth_word = fluid.create_random_int_lodtensor(
-            lod, base_shape, place, low=0, high=dict_size - 1)
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
 
         assert feed_target_names[0] == 'firstw'
         assert feed_target_names[1] == 'secondw'
@@ -200,7 +201,7 @@ def infer(use_cuda, save_dirname=None):
                           },
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].lod())
+        print(results[0].recursive_sequence_lengths())
         np_data = np.array(results[0])
         print("Inference Shape: ", np_data.shape)
 

From a2e43ae5ce691fef18c0e6600dde7c8c4e5d1c27 Mon Sep 17 00:00:00 2001
From: "yi.wu" <yi.wu@baifendian.com>
Date: Mon, 25 Jun 2018 17:16:00 +0800
Subject: [PATCH 08/28] fix trainer nccl2 env

---
 python/paddle/fluid/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 45ab889bea..fc4d7cba71 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -315,7 +315,7 @@ class Trainer(object):
             for ip in worker_ips.split(","):
                 worker_endpoints.append(':'.join([ip, port]))
             self.num_trainers = len(worker_endpoints)
-            current_endpoint = os.getenv("POD_IP") + ":" + port
+            current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
             worker_endpoints.remove(current_endpoint)
             # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
             # in ParallelExecutor to start

From 480d848eb989d4decd68cc76cf2e34c27dee5763 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 26 Jun 2018 22:56:21 +0800
Subject: [PATCH 09/28] optimize doc for host_memory_profiling_cn

---
 doc/fluid/howto/optimization/host_memory_profiling_cn.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
index 9b55a66ded..7fb0883dd9 100644
--- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -1,4 +1,4 @@
-## 堆内存分析和优化
+# 堆内存分析和优化
 
 计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
 
@@ -20,11 +20,11 @@ Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/P
 
 对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
 
-## 使用流程
-#### 环境
+## 环境
+
 本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
 
-#### 使用流程
+## 使用流程
 
 - 安装google-perftools
 

From 6b95a8a89cc79da73a9d3c461083025e298638a7 Mon Sep 17 00:00:00 2001
From: Kexin Zhao <zhaokexin01@baidu.com>
Date: Tue, 26 Jun 2018 22:57:25 -0700
Subject: [PATCH 10/28] fix error

---
 .../machine_translation/test_machine_translation.py         | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
index 31d756503a..12c4134dc9 100644
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -218,10 +218,10 @@ def decode_main(use_cuda, is_sparse):
     init_recursive_seq_lens = [1] * batch_size
     init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
 
-    init_ids = fluid.create_lod_tensor(init_ids_data,
-                                       init_init_recursive_seq_lens, place)
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
+                                       place)
     init_scores = fluid.create_lod_tensor(init_scores_data,
-                                          init_init_recursive_seq_lens, place)
+                                          init_recursive_seq_lens, place)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(

From c45a4b8567ee6b7b30bbe4a5733775970e831a88 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Wed, 27 Jun 2018 14:15:35 +0800
Subject: [PATCH 11/28] use sigkill to stop pserver

---
 python/paddle/fluid/tests/unittests/test_dist_mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index e4d39c8b3f..450ec414d1 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -145,7 +145,7 @@ class TestDistMnist(unittest.TestCase):
                 retry_times -= 1
 
     def stop_pserver(self, pid):
-        os.kill(pid, signal.SIGTERM)
+        os.kill(pid, signal.SIGKILL)
 
     def test_with_place(self):
         p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(

From b7c179a87fc370c4a4b176621b5176c0aff5a7d1 Mon Sep 17 00:00:00 2001
From: Kexin Zhao <zhaokexin01@baidu.com>
Date: Tue, 26 Jun 2018 23:26:10 -0700
Subject: [PATCH 12/28] fix lodtensor.py

---
 python/paddle/fluid/lod_tensor.py            | 57 ++++++++-------
 python/paddle/fluid/tests/test_lod_tensor.py | 73 +++++++++++---------
 2 files changed, 73 insertions(+), 57 deletions(-)

diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index c417ab393f..b2b3186c1e 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -18,15 +18,16 @@ import numpy as np
 __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
 
 
-def create_lod_tensor(data, lod, place):
+def create_lod_tensor(data, recursive_seq_lens, place):
     """
     Create a lod tensor from a numpy array, a list, or an existing lod tensor.
 
     Create a lod tensor by doing the following:
 
-    1. Check that the length-based input lod is valid.
+    1. Check that the length-based level of detail (LoD) also known as 
+       recursive_sequence_lengths of the input is valid.
 
-    2. Convert the length-based lod to a offset-based LoD.
+    2. Convert recursive_sequence_lengths to a offset-based LoD.
 
     3. Copy the data from a numpy array, a list or a existing lod tensor to
        CPU or GPU device (based on input place).
@@ -37,45 +38,47 @@ def create_lod_tensor(data, lod, place):
 
         Suppose we want LoDTensor to hold data for sequences of word, where each
         word is represented by an integer. If we want to create a LoDTensor to
-        represent two  sentences, one of 2 words, and one of 3 words.
+        represent two sentences, one of 2 words, and one of 3 words.
 
         Then :code:`data` can be a numpy array of integers with shape (5, 1).
-        :code:`lod` will be [[2, 3]], indicating the length(# of words) in each
-        sentence. This length-based input lod [[2, 3]] will be converted to
-        offset-based lod [[0, 2, 5]] inside the function call.
+        :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each
+        sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to
+        offset-based LoD [[0, 2, 5]] inside the function call.
 
     Please reference :ref:`api_guide_low_level_lod_tensor` for more details
     regarding LoD.
 
     Args:
         data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
-            list holding the data to be  copied.
-        lod(list): a list of lists indicating the length-based LoD info
-            specified by the user.
+            list holding the data to be copied.
+        recursive_seq_lens(list): a list of lists indicating the length-based level of detail 
+            info specified by the user.
         place(Place): CPU or GPU place indicating where the data in the new
             LoDTensor will be stored.
 
     Returns:
-        A fluid LoDTensor object with tensor data and lod info.
+        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
     """
     if isinstance(data, core.LoDTensor):
-        return create_lod_tensor(np.array(data), lod, place)
+        return create_lod_tensor(np.array(data), recursive_seq_lens, place)
     elif isinstance(data, list):
         # When input data is a list, it only deal with the case where the base element 
         # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated 
         # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number 
         # of words or other indexes in the sequence. 
-        new_lod = []
+        new_recursive_seq_lens = []
         for seq in data:
-            new_lod.append(len(seq))
-        assert [new_lod] == lod, "data and lod do not match"
+            new_recursive_seq_lens.append(len(seq))
+        assert [
+            new_recursive_seq_lens
+        ] == recursive_seq_lens, "data and recursive_seq_lens do not match"
         flattened_data = np.concatenate(data, axis=0).astype("int64")
         flattened_data = flattened_data.reshape([len(flattened_data), 1])
-        return create_lod_tensor(flattened_data, lod, place)
+        return create_lod_tensor(flattened_data, recursive_seq_lens, place)
     elif isinstance(data, np.ndarray):
         tensor = core.LoDTensor()
         tensor.set(data, place)
-        tensor.set_recursive_sequence_lengths(lod)
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
         assert tensor.has_valid_recursive_sequence_lengths(
         ), "the provided lod info is invalid"
         return tensor
@@ -84,7 +87,8 @@ def create_lod_tensor(data, lod, place):
             "data should be either a LoDTensor, a Numpy array or a list")
 
 
-def create_random_int_lodtensor(lod, base_shape, place, low, high):
+def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
+                                high):
     """
     Create a LoDTensor containing random integers.
 
@@ -95,7 +99,7 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
     The function does the following:
 
     1. Calculate the overall shape of the LoDTensor based on the length-based
-       :code:`lod` input and the shape of the basic element in
+       :code:`recursive_seq_lens` input and the shape of the basic element in
        :code:`base_shape`.
 
     2. Create a numpy array of this shape.
@@ -105,12 +109,13 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
     Suppose we want LoDTensor to hold data for sequences of word, where each
     word is represented by an integer. If we want to create a LoDTensor to
     represent two sentences, one of 2 words, and one of 3 words. Then
-    'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall
-    shape of the LoDTensor would be [5, 1], holding 5 words for two sentences.
+    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. 
+    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words 
+    for two sentences.
 
     Args:
-        lod(list): a list of lists indicating the length-based LoD info
-            specified by the user.
+        recursive_seq_lens(list): a list of lists indicating the length-based 
+            level of detail info specified by the user.
         base_shape(list): the shape of the basic element to be held by the
             LoDTensor.
         place(Place): CPU or GPU place indicating where the data in the new
@@ -119,11 +124,11 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
         high(int): the upper bound of the random integers.
 
     Returns:
-        A fluid LoDTensor object with tensor data and lod info. 
+        A fluid LoDTensor object with tensor data and recursive_seq_lens info. 
     """
     assert isinstance(base_shape, list), "base_shape should be a list"
     # append the total number of basic elements to the front of its shape
-    overall_shape = [sum(lod[-1])] + base_shape
+    overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
     # the range of integer data elements is [low, high]    
     data = np.random.random_integers(low, high, overall_shape).astype("int64")
-    return create_lod_tensor(data, lod, place)
+    return create_lod_tensor(data, recursive_seq_lens, place)
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
index b7e7f5801f..f7a9dd4129 100644
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -19,18 +19,21 @@ import unittest
 
 
 class TestLoDTensor(unittest.TestCase):
-    def test_pybind_lod(self):
+    def test_pybind_recursive_seq_lens(self):
         tensor = fluid.LoDTensor()
-        lod = []
-        tensor.set_recursive_sequence_lengths(lod)
-        lod = [[], [1], [3]]
-        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod)
-        lod = [[0], [2], [3]]
-        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod)
+        recursive_seq_lens = []
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        recursive_seq_lens = [[], [1], [3]]
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
+                          recursive_seq_lens)
+        recursive_seq_lens = [[0], [2], [3]]
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
+                          recursive_seq_lens)
 
-        lod = [[1, 2, 3]]
-        tensor.set_recursive_sequence_lengths(lod)
-        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        recursive_seq_lens = [[1, 2, 3]]
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
         tensor.set(np.random.random([6, 1]), fluid.CPUPlace())
         self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
         tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
@@ -38,13 +41,14 @@ class TestLoDTensor(unittest.TestCase):
 
         # Each level's sum should be equal to the number of items in the next level
         # Moreover, last level's sum should be equal to the tensor height
-        lod = [[2, 3], [1, 3, 1, 2, 2]]
-        tensor.set_recursive_sequence_lengths(lod)
-        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]]
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
         tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
         self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
-        lod = [[2, 3], [1, 3, 1, 2, 1]]
-        tensor.set_recursive_sequence_lengths(lod)
+        recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]]
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
         self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
         tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
         self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
@@ -52,35 +56,42 @@ class TestLoDTensor(unittest.TestCase):
     def test_create_lod_tensor(self):
         # Create LoDTensor from a list
         data = [[1, 2, 3], [3, 4]]
-        wrong_lod = [[2, 2]]
-        correct_lod = [[3, 2]]
-        self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
-                          fluid.CPUPlace())
-        tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
-        self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod)
+        wrong_recursive_seq_lens = [[2, 2]]
+        correct_recursive_seq_lens = [[3, 2]]
+        self.assertRaises(AssertionError, create_lod_tensor, data,
+                          wrong_recursive_seq_lens, fluid.CPUPlace())
+        tensor = create_lod_tensor(data, correct_recursive_seq_lens,
+                                   fluid.CPUPlace())
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         correct_recursive_seq_lens)
 
         # Create LoDTensor from numpy array
         data = np.random.random([10, 1])
-        lod = [[2, 1], [3, 3, 4]]
-        tensor = create_lod_tensor(data, lod, fluid.CPUPlace())
-        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        recursive_seq_lens = [[2, 1], [3, 3, 4]]
+        tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace())
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
 
         # Create LoDTensor from another LoDTensor, they are differnt instances
-        new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]]
-        new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace())
-        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
-        self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod)
+        new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]]
+        new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens,
+                                       fluid.CPUPlace())
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
+        self.assertEqual(new_tensor.recursive_sequence_lengths(),
+                         new_recursive_seq_lens)
 
     def test_create_random_int_lodtensor(self):
         # The shape of a word, commonly used in speech and NLP problem, is [1]
         shape = [1]
-        lod = [[2, 3, 5]]
+        recursive_seq_lens = [[2, 3, 5]]
         dict_size = 10000
         low = 0
         high = dict_size - 1
-        tensor = create_random_int_lodtensor(lod, shape,
+        tensor = create_random_int_lodtensor(recursive_seq_lens, shape,
                                              fluid.CPUPlace(), low, high)
-        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
         self.assertEqual(tensor.shape(), [10, 1])
 
 

From df7a266ae238afbbde27a38b279d5145f188f12b Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 27 Jun 2018 15:56:45 +0800
Subject: [PATCH 13/28] fix adam op for selected rows

---
 paddle/fluid/operators/adam_op.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
index 6ee73c3000..5d670fe3b9 100644
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel {
                       "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of AdamOp should have same dimension");
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dims, ctx->GetInputDim("Grad"),
+          "Param and Grad input of AdamOp should have same dimension");
+    }
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment1"),
         "Param and Moment1 input of AdamOp should have same dimension");

From 7d9c9a013be761f7d9827823fda106670fe1e899 Mon Sep 17 00:00:00 2001
From: Yancey1989 <yancey1989@gmail.com>
Date: Wed, 27 Jun 2018 16:33:28 +0800
Subject: [PATCH 14/28] update by comment

---
 python/paddle/fluid/tests/unittests/test_dist_mnist.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index 450ec414d1..ad2d57f7c5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -15,6 +15,7 @@
 import numpy as np
 import argparse
 import time
+import math
 
 import paddle
 import paddle.fluid as fluid
@@ -145,7 +146,7 @@ class TestDistMnist(unittest.TestCase):
                 retry_times -= 1
 
     def stop_pserver(self, pid):
-        os.kill(pid, signal.SIGKILL)
+        os.kill(pid, signal.SIGTERM)
 
     def test_with_place(self):
         p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
@@ -194,7 +195,7 @@ class TestDistMnist(unittest.TestCase):
                     acc_val = np.array(acc_set).mean()
                     avg_loss_val = np.array(avg_loss_set).mean()
                     if float(acc_val
-                             ) > 0.2:  # Smaller value to increase CI speed
+                             ) > 0.8:  # Smaller value to increase CI speed
                         return
                     else:
                         print(

From 20fae681366de7c799dfbc92a7be53b027d532c2 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Wed, 27 Jun 2018 19:39:15 +0800
Subject: [PATCH 15/28] adam op handle grad.rows().size == 0 condition

---
 paddle/fluid/operators/adam_op.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index f82ff47b52..a7a28b02b6 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto& grad =
           Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
+      if (grad.rows().size() == 0) {
+        VLOG(3) << "grad row size is 0!!";
+        return;
+      }
       // merge duplicated rows if any.
       scatter::MergeAdd<DeviceContext, T> merge_func;
       auto grad_merge =

From 64e44e929c06722a87dfe6989b46f66130cfe27a Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Wed, 27 Jun 2018 19:45:33 +0800
Subject: [PATCH 16/28] add option to compile inference demo

---
 paddle/contrib/inference/demo/CMakeLists.txt | 5 +++++
 paddle/scripts/paddle_build.sh               | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
index 566c7d1a07..ecece6fe34 100644
--- a/paddle/contrib/inference/demo/CMakeLists.txt
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -15,6 +15,11 @@
 
 inference_api_test(simple_on_word2vec ARGS test_word2vec)
 
+option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
+if(NOT WITH_INFERENCE_DEMO)
+  return()
+endif()
+
 set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
 set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 037688bde9..b16c834931 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -106,6 +106,7 @@ function cmake_gen() {
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -133,7 +134,8 @@ EOF
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_ANAKIN=${WITH_ANAKIN:-ON}
+        -DWITH_ANAKIN=${WITH_ANAKIN:-ON} \
+        -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
 }
 
 function abort(){

From bc28cf613f9e41908d6da9a889cbb3242e0589d2 Mon Sep 17 00:00:00 2001
From: Haichao Zhang <hczhang1@gmail.com>
Date: Wed, 27 Jun 2018 16:56:41 -0700
Subject: [PATCH 17/28] Extend fill_zeros_like_op for zero-filling an
 LoDTensorArray (#11496)

* Add fill_zeros_array op. This op is used for zero-filling an LoDTensorArray.

* merge fill_zeros_array_op with fill_zeros_like_op

* add unit_test for fill_zeros_like for array
---
 paddle/fluid/framework/operator.cc            |  4 +
 paddle/fluid/operators/fill_zeros_like_op.cc  | 10 ++-
 paddle/fluid/operators/fill_zeros_like_op.h   | 30 +++++--
 python/paddle/fluid/layers/nn.py              | 38 ++++++++
 .../test_fill_zeros_like_op_for_array.py      | 88 +++++++++++++++++++
 5 files changed, 161 insertions(+), 9 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 122ee1dab3..c1329b06d7 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -713,6 +713,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           t = &var->Get<LoDTensor>();
         } else if (var->IsType<SelectedRows>()) {
           t = &(var->Get<SelectedRows>().value());
+        } else if (var->IsType<LoDTensorArray>()) {
+          const LoDTensorArray& arr = var->Get<LoDTensorArray>();
+          PADDLE_ENFORCE(arr.size() > 0);
+          t = &(arr[0]);
         }
         if (t != nullptr) {
           int tmp = static_cast<int>(ToDataType(t->type()));
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
index d67bec36b3..a9d47c0172 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
@@ -26,8 +26,12 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
                    "Input(X) of FillZerosLikeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillZerosLikeOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
+
+    if (ctx->IsRuntime() &&
+        ctx->GetOutputsVarType("Out")[0] ==
+            framework::proto::VarType::LOD_TENSOR_ARRAY) {
+      return;  // skip runtime infershape when is tensor array;
+    }
   }
 };
 
@@ -39,7 +43,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 FillZerosLike Operator.
 
-Fill up a variable with zeros.
+Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray.
 The output will have the same size as the input.
 
 )DOC");
diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h
index 4bbe0df6b6..daa6521b32 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.h
+++ b/paddle/fluid/operators/fill_zeros_like_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -23,12 +24,29 @@ template <typename DeviceContext, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-
-    math::SetConstant<DeviceContext, T> setter;
-    setter(context.template device_context<DeviceContext>(), out,
-           static_cast<T>(0));
+    auto var = context.InputVar("X");
+    if (var->IsType<framework::LoDTensor>()) {
+      auto& input = *context.Input<framework::LoDTensor>("X");
+      auto& output = *context.Output<framework::LoDTensor>("Out");
+      output.Resize(input.dims());
+      output.set_lod(input.lod());
+      output.mutable_data<T>(context.GetPlace());
+      math::SetConstant<DeviceContext, T> setter;
+      setter(context.template device_context<DeviceContext>(), &(output),
+             static_cast<T>(0));
+    } else if (var->IsType<framework::LoDTensorArray>()) {
+      auto& input = *context.Input<framework::LoDTensorArray>("X");
+      auto& output = *context.Output<framework::LoDTensorArray>("Out");
+      output.resize(input.size());
+      for (auto i = 0; i < input.size(); i++) {
+        output[i].Resize(input[i].dims());
+        output[i].set_lod(input[i].lod());
+        output[i].mutable_data<T>(context.GetPlace());
+        math::SetConstant<DeviceContext, T> setter;
+        setter(context.template device_context<DeviceContext>(), &(output[i]),
+               static_cast<T>(0));
+      }
+    }
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 64f48e259a..bc379da4e3 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -95,6 +95,7 @@ __all__ = [
     'relu',
     'log',
     'crop',
+    'fill_zeros_like',
 ]
 
 
@@ -5184,3 +5185,40 @@ def crop(x, shape=None, offsets=None, name=None):
         outputs={'Out': out},
         attrs=None if len(attrs) == 0 else attrs)
     return out
+
+
+def fill_zeros_like(x):
+    """
+    This layer takes an input and outputs a variable that has the same structure as
+    the input and with all the element values as zero. The variable can be a Tensor
+    or TensorArray.
+
+    .. code-block:: text
+
+
+       Given
+          X = [[0, 1, 2, 0],
+               [0, 3, 4, 0],
+               [0, 0, 0, 0]],
+       output is:
+          Out = [[0, 0, 0, 0],
+                 [0, 0, 0, 0],
+                 [0, 0, 0, 0]].
+
+    Args:
+        x (Variable): The input variable, which could be a tensor or tensor array
+
+    Returns:
+        Variable: The zero-filled variable, which has the same type and shape as
+                  the input variable.
+
+    Examples:
+
+        .. code-block:: python
+            y = fluid.layers.fill_zeros_like(x)
+    """
+    helper = LayerHelper('fill_zeros_like', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py
new file mode 100644
index 0000000000..23871508d8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py
@@ -0,0 +1,88 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid.core as core
+import numpy
+import paddle.fluid.layers as layers
+from paddle.fluid.framework import Program, program_guard
+from paddle.fluid.executor import Executor
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+class TestFillZerosLikeOpForTensorArray(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_zero_filling_lod_tensor_array(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+
+        expect = [
+            numpy.array(
+                [0, 0, 0, 0, 0], dtype='int32'), numpy.array(
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int32'),
+            numpy.array(
+                [0, 0, 0], dtype='int32')
+        ]
+
+        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
+
+    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
+        place = self.place()
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10])
+            x.persistable = True
+            table = layers.lod_rank_table(x, level=level)
+            max_len = layers.max_sequence_len(table)
+            max_len.persistable = True
+            array = layers.lod_tensor_to_array(x, table)
+            array = layers.fill_zeros_like(array)
+            array.persistable = True
+
+            result = layers.array_to_lod_tensor(array, table)
+            result.persistable = True
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program, feed={'x': tensor}, scope=scope)
+        var = scope.find_var(array.name)
+        array = var.get_lod_tensor_array()
+        if expect_array is not None and expect_lod is not None:
+            self.check_array_same(array, expect_array, expect_lod)
+
+        self.assertEqual(
+            numpy.array(scope.find_var(max_len.name).get_tensor())[0],
+            expect_max_len)
+
+    def check_array_same(self, array, expect_tensor, expect_lod):
+        self.assertEqual(len(expect_tensor), len(array))
+        for i, exp in enumerate(zip(expect_tensor, expect_lod)):
+            exp_tensor, exp_lod = exp
+            exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
+            self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
+            self.assertEqual(exp_lod, array[i].lod())
+
+
+if __name__ == '__main__':
+    unittest.main()

From 5082642bdb038ef87f81549a3589724a65c29799 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Thu, 28 Jun 2018 09:13:55 +0800
Subject: [PATCH 18/28] feature/analysis to support sub-graph for TRT engine
 (#11538)

---
 paddle/contrib/inference/CMakeLists.txt       |  10 +-
 .../contrib/inference/paddle_inference_api.h  |  11 +-
 .../inference/paddle_inference_api_impl.cc    |   6 +-
 .../inference/paddle_inference_api_impl.h     |   2 +-
 ..._inference_api_tensorrt_subgraph_engine.cc | 126 ++++++++++++++++++
 ..._inference_api_tensorrt_subgraph_engine.cc |  64 +++++++++
 .../fluid/inference/analysis/CMakeLists.txt   |  12 +-
 paddle/fluid/inference/analysis/analyzer.cc   |  82 ++++++++++++
 paddle/fluid/inference/analysis/analyzer.h    |  66 +++++++++
 .../inference/analysis/analyzer_tester.cc     |  29 ++++
 paddle/fluid/inference/analysis/argument.h    |   3 +
 .../inference/analysis/data_flow_graph.cc     |  21 ++-
 .../inference/analysis/data_flow_graph.h      |  23 +++-
 .../analysis/data_flow_graph_to_fluid_pass.cc | 124 ++++++++++++++---
 .../analysis/data_flow_graph_to_fluid_pass.h  |   6 +-
 .../analysis/dfg_graphviz_draw_pass.cc        |  15 ++-
 .../analysis/dfg_graphviz_draw_pass.h         |  13 +-
 .../analysis/dfg_graphviz_draw_pass_tester.cc |   4 +-
 .../analysis/fluid_to_data_flow_graph_pass.cc |  23 +++-
 .../analysis/fluid_to_data_flow_graph_pass.h  |   3 +-
 paddle/fluid/inference/analysis/helper.cc     |  60 +++++++++
 paddle/fluid/inference/analysis/helper.h      |  22 ++-
 paddle/fluid/inference/analysis/node.cc       |  11 ++
 paddle/fluid/inference/analysis/node.h        |  90 +++++++------
 .../inference/analysis/node_attr_flags.h      |  32 +++++
 paddle/fluid/inference/analysis/pass.h        |   3 +
 .../fluid/inference/analysis/pass_manager.cc  |  12 ++
 .../fluid/inference/analysis/pass_manager.h   |  12 +-
 .../inference/analysis/pass_manager_tester.cc |   1 +
 .../inference/analysis/subgraph_splitter.cc   |  32 +++--
 .../tensorrt_subgraph_node_mark_pass.cc       |  78 +++++++++++
 .../tensorrt_subgraph_node_mark_pass.h        |  53 ++++++++
 ...tensorrt_subgraph_node_mark_pass_tester.cc |  50 +++++++
 .../analysis/tensorrt_subgraph_pass.cc        |   2 +-
 .../analysis/tensorrt_subgraph_pass.h         |   5 +
 .../analysis/tensorrt_subgraph_pass_tester.cc |  51 ++++---
 paddle/fluid/operators/CMakeLists.txt         |   3 +-
 paddle/fluid/operators/tensorrt_engine_op.h   |   1 +
 .../operators/tensorrt_engine_op_test.cc      |  43 +-----
 39 files changed, 1015 insertions(+), 189 deletions(-)
 create mode 100644 paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
 create mode 100644 paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
 create mode 100644 paddle/fluid/inference/analysis/analyzer.cc
 create mode 100644 paddle/fluid/inference/analysis/analyzer.h
 create mode 100644 paddle/fluid/inference/analysis/analyzer_tester.cc
 create mode 100644 paddle/fluid/inference/analysis/helper.cc
 create mode 100644 paddle/fluid/inference/analysis/node_attr_flags.h
 create mode 100644 paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
 create mode 100644 paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 45bbb4b237..153216abb4 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -18,7 +18,7 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api)
+set(inference_deps paddle_inference_api paddle_fluid_api paddle_inference_tensorrt_subgraph_engine)
 
 function(inference_api_test TARGET_NAME)
     if (WITH_TESTING)
@@ -50,6 +50,14 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                     ARGS test_word2vec test_image_classification)
 
+if(WITH_GPU AND TENSORRT_FOUND)
+cc_library(paddle_inference_tensorrt_subgraph_engine
+        SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
+
+inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
+endif()
+
 if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index 238d8c772e..b8ba2d14a5 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -73,12 +73,12 @@ struct PaddleTensor {
 };
 
 enum class PaddleEngineKind {
-  kNative = 0,  // Use the native Fluid facility.
-  kAnakin,      // Use Anakin for inference.
+  kNative = 0,         // Use the native Fluid facility.
+  kAnakin,             // Use Anakin for inference.
+  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
   // TODO(Superjomn) support following engines latter.
   // kTensorRT,           // Use TensorRT for inference.
   // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
-  // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
 };
 
 /*
@@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config {
   int max_batch_size{-1};
 };
 
+struct TensorRTConfig : public NativeConfig {
+  // Determine whether a subgraph will be executed by TRT.
+  int min_subgraph_size{1};
+};
+
 // A factory to help create different predictors.
 //
 // FOR EXTENSION DEVELOPER:
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
index d9129a704b..b1e5b87598 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init(
     LOG(ERROR) << "fail to load inference model.";
     return false;
   }
+
   ctx_ = executor_->Prepare(*inference_program_, 0);
   executor_->CreateVariables(
       *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
@@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    VLOG(4) << "setting " << i << "-th target";
     feed_targets[feed_target_names_[i]] = &feeds[i];
   }
   // get fetch variable
@@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   // Run the inference program
   // if share variables, we need not create variables
+  VLOG(4) << "Run prepared context";
   executor_->RunPreparedContext(
       ctx_.get(),
       sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
       &feed_targets,
       &fetch_targets,
       false /* don't create variable eatch time */);
+  VLOG(4) << "Finish prepared context";
   if (!GetFetch(fetchs, output_data)) {
-    LOG(ERROR) << "fail to get fetchs";
+    LOG(ERROR) << "fail to get fetches";
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
index 86d1db7bcc..ba266b608d 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor {
 
   ~NativePaddlePredictor() override;
 
- private:
+ protected:
   bool SetFeed(const std::vector<PaddleTensor> &input_datas,
                std::vector<framework::LoDTensor> *feeds);
   bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
new file mode 100644
index 0000000000..a11396cee9
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+
+using inference::analysis::Argument;
+using inference::Singleton;
+using inference::analysis::Analyzer;
+using framework::proto::ProgramDesc;
+
+class TensorRTSubgraphPredictor : public NativePaddlePredictor {
+ public:
+  explicit TensorRTSubgraphPredictor(const TensorRTConfig& config)
+      : NativePaddlePredictor(config), config_(config) {}
+
+  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
+    VLOG(3) << "Predictor::init()";
+
+    if (config_.use_gpu) {
+      place_ = paddle::platform::CUDAPlace(config_.device);
+    } else {
+      place_ = paddle::platform::CPUPlace();
+    }
+    if (parent_scope) {
+      scope_ = parent_scope;
+      sub_scope_ = &(parent_scope->NewScope());
+    } else {
+      paddle::framework::InitDevices(false);
+      scope_.reset(new paddle::framework::Scope());
+    }
+
+    executor_.reset(new paddle::framework::Executor(place_));
+
+    // Initialize the inference program
+    if (!config_.model_dir.empty()) {
+      // Parameters are saved in separate files sited in
+      // the specified `dirname`.
+      inference_program_ = paddle::inference::Load(
+          executor_.get(), scope_.get(), config_.model_dir);
+    } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+      // All parameters are saved in a single file.
+      // The file names should be consistent with that used
+      // in Python API `fluid.io.save_inference_model`.
+      inference_program_ = paddle::inference::Load(
+          executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+    } else {
+      LOG(ERROR) << "fail to load inference model.";
+      return false;
+    }
+
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    *inference_program_->Proto() = *argument.transformed_program_desc;
+    ctx_ = executor_->Prepare(*inference_program_, 0);
+
+    VLOG(5) << "to create variables";
+    executor_->CreateVariables(
+        *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
+
+    // Get the feed_target_names and fetch_target_names
+    feed_target_names_ = inference_program_->GetFeedTargetNames();
+    fetch_target_names_ = inference_program_->GetFetchTargetNames();
+    return true;
+  }
+
+ private:
+  TensorRTConfig config_;
+};
+
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
+    const TensorRTConfig& config) {
+  VLOG(3) << "create TensorRTSubgraphPredictor";
+  if (config.use_gpu) {
+    // 1. GPU memeroy
+    PADDLE_ENFORCE_GT(
+        config.fraction_of_gpu_memory,
+        0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    std::vector<std::string> flags;
+    if (config.fraction_of_gpu_memory >= 0.0f ||
+        config.fraction_of_gpu_memory <= 0.95f) {
+      flags.push_back("dummpy");
+      std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                         std::to_string(config.fraction_of_gpu_memory);
+      flags.push_back(flag);
+      VLOG(3) << "set flag: " << flag;
+      framework::InitGflags(flags);
+    }
+  }
+
+  std::unique_ptr<PaddlePredictor> predictor(
+      new TensorRTSubgraphPredictor(config));
+  if (!dynamic_cast<TensorRTSubgraphPredictor*>(predictor.get())
+           ->Init(nullptr)) {
+    return nullptr;
+  }
+  return std::move(predictor);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
new file mode 100644
index 0000000000..b100630dbe
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void Main(bool use_gpu) {
+  //# 1. Create PaddlePredictor with a config.
+  TensorRTConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto predictor =
+      CreatePaddlePredictor<TensorRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    //# 2. Prepare input.
+    int64_t data[4] = {1, 2, 3, 4};
+
+    PaddleTensor tensor{.name = "",
+                        .shape = std::vector<int>({4, 1}),
+                        .data = PaddleBuf(data, sizeof(data)),
+                        .dtype = PaddleDType::INT64};
+
+    // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+    //# 3. Run
+    std::vector<PaddleTensor> outputs;
+    CHECK(predictor->Run(slots, &outputs));
+
+    //# 4. Get output.
+    ASSERT_EQ(outputs.size(), 1UL);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    const size_t num_elements = outputs.front().data.length() / sizeof(float);
+    // The outputs' buffers are in CPU memory.
+    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    }
+  }
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 2bb2c8135d..33b0e3b127 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,10 +1,12 @@
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
   fluid_to_data_flow_graph_pass.cc
   data_flow_graph_to_fluid_pass.cc
-  tensorrt_subgraph_pass.cc
   dfg_graphviz_draw_pass.cc
-  DEPS framework_proto)
+  tensorrt_subgraph_pass.cc
+  tensorrt_subgraph_node_mark_pass.cc
+  analyzer.cc
+  helper.cc
+  DEPS framework_proto proto_desc)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 
@@ -28,5 +30,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_
 inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
 inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
 inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
-#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
+inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
+inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
new file mode 100644
index 0000000000..5d85530969
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+            "Enable subgraph to TensorRT engine for acceleration");
+
+DEFINE_string(inference_analysis_graphviz_log_root, "./",
+              "Graphviz debuger for data flow graphs.");
+
+class DfgPassManagerImpl final : public DfgPassManager {
+ public:
+  DfgPassManagerImpl() {
+    // TODO(Superjomn) set the key with pass reprs.
+    AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
+      auto trt_teller = [](const Node* node) {
+        if (!node->IsFunction()) return false;
+        return static_cast<const Function*>(node)->func_type() == "mul";
+      };
+      AddPass("tensorrt-subgraph-marker",
+              new TensorRTSubgraphNodeMarkPass(trt_teller));
+      AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
+    }
+    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+  }
+
+  std::string repr() const override { return "dfg-pass-manager"; }
+  std::string description() const override { return "DFG pass manager."; }
+
+ private:
+  void AddPass(const std::string& name, Pass* pass) {
+    LOG(INFO) << "Adding pass " << name;
+    Register(name, pass);
+    AddGraphvizDebugerPass(pass);
+  }
+
+  // Add the graphviz debuger pass if the parent pass has one.
+  void AddGraphvizDebugerPass(Pass* pass) {
+    auto* debuger_pass = pass->CreateGraphvizDebugerPass();
+    if (debuger_pass) {
+      LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]";
+      Register(debuger_pass->repr(), debuger_pass);
+    }
+  }
+};
+
+Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
+
+void Analyzer::Run(Argument* argument) {
+  for (auto& x : data_) {
+    PADDLE_ENFORCE(x->Initialize(argument));
+    x->RunAll();
+    PADDLE_ENFORCE(x->Finalize());
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
new file mode 100644
index 0000000000..f290a3777d
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains Analyzer, an class that exposed as a library that analyze
+ * and optimize
+ * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
+ * control whether
+ * an process is applied on the program.
+ *
+ * The processes are called Passes in analysis, the Passes are placed in a
+ * pipeline, the first
+ * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
+ * a data flow
+ * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
+ * graph to a
+ * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
+ * which take a
+ * node or data flow graph as input.
+ *
+ * The Analyzer can be used in two methods, the first is a executable file which
+ * can be used to
+ * pre-process the inference model and can be controlled by passing difference
+ * command flags;
+ * the other way is to compose inside the inference API as a runtime pre-process
+ * phase in the
+ * inference service.
+ */
+
+#include <gflags/gflags.h>
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
+// flag if not available.
+DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
+DECLARE_string(inference_analysis_graphviz_log_root);
+
+class Analyzer : public OrderedRegistry<PassManager> {
+ public:
+  // Register all the pass-managers.
+  Analyzer();
+
+  void Run(Argument* argument);
+
+  DISABLE_COPY_AND_ASSIGN(Analyzer);
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
new file mode 100644
index 0000000000..d7c1a72932
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, main) {
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index f7f4e03968..6d316f20bf 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -41,6 +41,9 @@ struct Argument {
 
   // The original program desc.
   std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
+
+  // The processed program desc.
+  std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
 };
 
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index c30a7c26ce..d09bf3ed16 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-// It is a better idea that the inputs and outputs of this graph is set manully
+// It is a better idea that the inputs and outputs of this graph is set manually
 // before, but there must be a Pass that helps to prune the unnecessary ops that
 // do not contribute to the given targets, so in this pass, analysis and get the
 // inputs and outputs is OK.
@@ -50,6 +50,25 @@ void DataFlowGraph::Build() {
       outputs.push_back(out);
     }
   }
+
+  Clean();
+}
+
+void DataFlowGraph::Clean() {
+  for (auto &node : nodes.nodes()) {
+    std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
+                                           node->inlinks.end());
+    std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
+                                            node->outlinks.end());
+    if (inlinks_set.size() < node->inlinks.size()) {
+      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
+      node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
+    }
+    if (outlinks_set.size() < node->outlinks.size()) {
+      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
+      node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
+    }
+  }
 }
 
 std::string DataFlowGraph::DotString() const {
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 913e344d37..30c60661f3 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -47,6 +47,10 @@ struct DataFlowGraph {
 
   // Output a DOT graph file for debug.
   std::string DotString() const;
+
+ private:
+  // Remove duplicate edges and so on.
+  void Clean();
 };
 
 /*
@@ -133,17 +137,24 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-std::pair<
-    std::vector<Node *>,
-    std::vector<
-        Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
-                                                            &graph) {
+static std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
   std::unordered_set<Node *> nodes(graph.begin(), graph.end());
   std::unordered_set<Node *> inputs;
   std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
   for (auto &node : graph) {
     for (auto *in : node->inlinks) {
-      if (!nodes.count(in) && in->type() == Node::Type::kValue) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
         inputs.insert(in);
       }
     }
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index f7d4cca213..e74efd17b8 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -13,21 +13,34 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
+using framework::proto::ProgramDesc;
+
+std::vector<std::string> ExtractParameters(
+    const std::vector<std::unique_ptr<Node>>& nodes);
+
 bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
-  desc_ = argument->origin_program_desc.get();
-  // Here some logic from program_desc.cc and will not add new interfaces into
-  // framework::ProgramDesc class, use some UT to assure the correctness.
-  auto* block = desc_->mutable_blocks()->Add();
-  block->set_idx(framework::kRootBlockIndex);
-  block->set_parent_idx(framework::kNoneBlockIndex);
+  PADDLE_ENFORCE(!argument->transformed_program_desc);
+  // The transformed_program_desc should inherit all the VarDesc and BlockDesc
+  // from the original program desc. The operators of the main block(the first
+  // block) should rewritten by data flow graph.
+  argument->transformed_program_desc.reset(
+      new ProgramDesc(*argument->origin_program_desc));
+  argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex)
+      ->clear_ops();
+  desc_ = argument->transformed_program_desc.get();
+  argument_ = argument;
   return true;
 }
 
@@ -37,14 +50,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
   auto traits = GraphTraits<DataFlowGraph>(graph);
   for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
     if (it->deleted()) continue;
+
     switch (it->type()) {
-      case Node::Type::kFunction:
-        LOG(INFO) << "add function " << it->name();
+      case Node::Type::kFunction: {
+        LOG(INFO) << "add function " << it->repr();
         AddFluidOp(&(*it));
-        break;
-      case Node::Type::kFunctionBlock:
+      } break;
+      case Node::Type::kFunctionBlock: {
+        LOG(INFO) << "add engine op " << it->repr() << " , "
+                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
         AddEngineOp(&(*it));
-        break;
+      } break;
       default:
         continue;
     }
@@ -52,12 +68,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
 }
 
 void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
-  LOG(INFO) << "processing func " << node->name();
   auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
   // currently only the main block is analyzed.
   auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
   auto* op = main_block->add_ops();
-  LOG(INFO) << "to copy the op";
   *op = *ori_op;  // copy the attributes, by default, these will not be changed
                   // by analysis phrase.
   // The inputs and outputs of the existing ops are not changed by tensorrt
@@ -65,11 +79,89 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
   // NOTE It might be changed by other passes in the long run.
 }
 
+void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
+                       const framework::proto::BlockDesc& block) {
+  static int counter{0};
+  PADDLE_ENFORCE(node->IsFunctionBlock());
+  framework::OpDesc desc;
+  auto* func = static_cast<FunctionBlock*>(node);
+
+  // collect inputs
+  std::vector<std::string> io;
+  for (auto* x : func->inlinks) {
+    io.push_back(x->name());
+  }
+  desc.SetInput("Xs", io);
+
+  // collect outputs
+  io.clear();
+  for (auto* x : func->outlinks) {
+    io.push_back(x->name());
+  }
+  desc.SetOutput("Ys", io);
+
+  desc.SetType("tensorrt_engine");
+  // Set attrs
+  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
+  SetAttr(desc.Proto(), "engine_unique_key",
+          "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "max_workspace",
+          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
+  node->SetPbMsg(desc.Proto()->SerializeAsString());
+}
+
+std::vector<std::string> ExtractParameters(
+    const std::vector<std::unique_ptr<Node>>& nodes) {
+  std::vector<std::string> parameters;
+  for (const auto& node : nodes) {
+    if (!node->IsValue()) continue;
+    PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
+    framework::proto::VarDesc var;
+    var.ParseFromString(node->pb_msg());
+    if (var.persistable()) {
+      parameters.push_back(var.name());
+    }
+  }
+  return parameters;
+}
+
 void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
-  // auto* ori_op = static_cast<framework::proto::OpDesc*>(node->extra_info());
-  // auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  // auto* op = main_block->add_ops();
   // TODO(Superjomn) Here need to expose some arguments for default setting.
+  PADDLE_ENFORCE(node->IsFunctionBlock());
+  auto* block_node = static_cast<FunctionBlock*>(node);
+  framework::proto::BlockDesc proto;
+  framework::BlockDesc block_desc(nullptr, &proto);
+  // copy ops.
+  for (auto* node : block_node->subgraph) {
+    auto* op = block_desc.AppendOp();
+    PADDLE_ENFORCE(!node->pb_msg().empty());
+    op->Proto()->ParseFromString(node->pb_msg());
+  }
+  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
+  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto* op = main_block->add_ops();
+  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
+  op->ParseFromString(node->pb_msg());
+}
+
+namespace {
+class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
+ public:
+  using Config = DFG_GraphvizDrawPass::Config;
+  DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
+
+  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
+
+  bool Finalize() override { return true; }
+};
+}
+
+Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
+      FLAGS_inference_analysis_graphviz_log_root,
+      "data_flow_graph_to_fluid_graphviz_debugger"));
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index cbb05f622c..1726e056ed 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -40,10 +40,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
     return "Transform a DFG to a Fluid ProgramDesc";
   }
 
-  Pass *CreatePrinterPass(std::ostream &os,
-                          const std::string &banner) const override {
-    return nullptr;
-  }
+  Pass *CreateGraphvizDebugerPass() const override;
 
  protected:
   // Add a Fluid Op into the ProgramDesc.
@@ -53,6 +50,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
 
  private:
   framework::proto::ProgramDesc *desc_;
+  Argument *argument_;
 };
 }  // namespace analysis
 }  // namespace inference
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
index afffb3feb0..a6f8548475 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -18,12 +18,19 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+int DFG_GraphvizDrawPass::counter_{0};
+
 void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
   auto content = Draw(graph);
-  std::ofstream file(GenDotPath());
+  auto dot_path = GenDotPath();
+  std::ofstream file(dot_path);
   file.write(content.c_str(), content.size());
   file.close();
-  LOG(INFO) << "draw dot to " << GenDotPath();
+
+  auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
+  std::string message;
+  LOG(INFO) << "draw to " << png_path;
+  ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
 }
 
 std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
@@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
     if (!config_.display_deleted_node && node.deleted()) continue;
     for (auto &in : node.inlinks) {
       if (!config_.display_deleted_node && in->deleted()) continue;
-      for (auto &in : node.inlinks) {
-        dot.AddEdge(in->repr(), node.repr(), {});
-      }
+      dot.AddEdge(in->repr(), node.repr(), {});
     }
   }
   return dot.Build();
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
index 93ebff59ae..b064782586 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -50,20 +50,25 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
 
   bool Initialize(Argument *argument) override { return true; }
   void Run(DataFlowGraph *graph) override;
-  bool Finalize() override { return Pass::Finalize(); }
+  bool Finalize() override { return true; }
 
   std::string repr() const override { return "DFG graphviz drawer"; }
   std::string description() const override {
     return "Debug a DFG by draw with graphviz";
   }
 
- private:
+ protected:
+  // A counter to add a number prefix to the debugger image output so that they
+  // will sort in the triggered order.
+  static int counter_;
+
   // Path of the dot file to output.
   std::string GenDotPath() const {
-    return config_.dir + "/" + "graph_" + config_.id + ".dot";
+    return config_.dir + "/" + std::to_string(counter_++) + "-graph_" +
+           config_.id + ".dot";
   }
 
-  std::string Draw(DataFlowGraph *graph);
+  virtual std::string Draw(DataFlowGraph *graph);
 
   Config config_;
 };
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index f4b5c5fd22..162455b9c4 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
   pass.Run(&dfg);
 
   // test content
-  std::ifstream file("./graph_test.dot");
+  std::ifstream file("./0-graph_test.dot");
   ASSERT_TRUE(file.is_open());
 
   std::string line;
@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
     no++;
   }
   // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 112);
+  ASSERT_EQ(no, 82);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 5f62eef528..5d7eb43b7c 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "analyzer.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
 
 namespace paddle {
@@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   return true;
 }
 
-bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
+bool FluidToDataFlowGraphPass::Finalize() { return true; }
 
 void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   PADDLE_ENFORCE(graph);
@@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     auto *v = graph->nodes.Create(Node::Type::kValue);
     v->SetName(var.name());
     v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
+    v->SetPbMsg(var.SerializeAsString());
     var2id[var.name()] = v->id();
   }
   for (int i = 0; i < main_block.ops_size(); i++) {
@@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     // Link to the original protobuf message's memory, make it easier to
     // generate from a data flow graph to fluid ProgramDesc.
     o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
+    o->SetPbMsg(op.SerializeAsString());
+
     // set inputs and outputs
     // TODO(Superjomn) make sure the InputNames is the real variable name.
     for (int j = 0; j < op.inputs_size(); j++) {
@@ -79,9 +84,19 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   graph->Build();
 }
 
-Pass *FluidToDataFlowGraphPass::CreatePrinterPass(
-    std::ostream &os, const std::string &banner) const {
-  return nullptr;
+namespace {
+class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
+ public:
+  using Config = DFG_GraphvizDrawPass::Config;
+  DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
+  std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
+  bool Finalize() override { return true; }
+};
+}
+
+Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
+  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
+      FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index 176faf0220..da8463b63b 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
     return "transform a fluid ProgramDesc to a data flow graph.";
   }
 
-  Pass *CreatePrinterPass(std::ostream &os,
-                          const std::string &banner) const override;
+  Pass *CreateGraphvizDebugerPass() const override;
 
  private:
   framework::proto::ProgramDesc const *desc_;
diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc
new file mode 100644
index 0000000000..ca40c01fc5
--- /dev/null
+++ b/paddle/fluid/inference/analysis/helper.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+template <>
+void SetAttr<std::string>(framework::proto::OpDesc *op, const std::string &name,
+                          const std::string &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRING);
+  attr->set_s(data);
+}
+template <>
+void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
+                  const int &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(data);
+}
+template <>
+void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
+                      const int64_t &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::LONG);
+  attr->set_l(data);
+}
+template <>
+void SetAttr<std::vector<std::string>>(framework::proto::OpDesc *op,
+                                       const std::string &name,
+                                       const std::vector<std::string> &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
+  for (const auto &s : data) {
+    attr->add_strings(s.c_str());
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index f0039e1131..fff1621d3f 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -14,10 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <cstdio>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -26,6 +28,10 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+template <typename T>
+void SetAttr(framework::proto::OpDesc *op, const std::string &name,
+             const T &data);
+
 template <typename Vec>
 int AccuDims(Vec &&vec, int size) {
   int res = 1;
@@ -93,7 +99,7 @@ template <typename T>
 class OrderedRegistry {
  public:
   T *Register(const std::string &name, T *x) {
-    PADDLE_ENFORCE(!dic_.count(name));
+    PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
     dic_[name] = data_.size();
     data_.emplace_back(std::unique_ptr<T>(x));
     return data_.back().get();
@@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
   return *var->GetMutable<T>();
 }
 
+static void ExecShellCommand(const std::string &cmd, std::string *message) {
+  char buffer[128];
+  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
+  if (!pipe) {
+    LOG(ERROR) << "error running command: " << cmd;
+    return;
+  }
+  while (!feof(pipe.get())) {
+    if (fgets(buffer, 128, pipe.get()) != nullptr) {
+      *message += buffer;
+    }
+  }
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc
index 3339b5044d..d9d265d225 100644
--- a/paddle/fluid/inference/analysis/node.cc
+++ b/paddle/fluid/inference/analysis/node.cc
@@ -20,6 +20,17 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+template <>
+std::string &NodeAttr::As<std::string>() {
+  if (data_.empty()) {
+    type_hash_ = typeid(std::string).hash_code();
+  }
+  PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code());
+  return data_;
+}
+
+std::string &NodeAttr::String() { return As<std::string>(); }
+
 std::vector<Dot::Attr> Value::dot_attrs() const {
   return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
                                  Dot::Attr("shape", "box"),
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
index 8c2e6d88b9..8ecd1ae730 100644
--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -35,6 +35,44 @@ namespace analysis {
 
 class NodeMap;
 
+// A helper class to maintain the status from Pass.
+struct NodeAttr {
+  // NOTE T should be a primary type or a struct combined by several primary
+  // types.
+  // NOTE the STL containers should not use here.
+  // Some usages
+  //   Attr attr;
+  //   attr.Bool() = true;
+
+  bool &Bool() { return As<bool>(); }
+  float &Float() { return As<float>(); }
+  int32_t &Int32() { return As<int32_t>(); }
+  int64_t &Int64() { return As<int64_t>(); }
+  void *&Pointer() { return As<void *>(); }
+  std::string &String();
+
+ private:
+  template <typename T>
+  T &As() {
+    // init storage in the first usage.
+    if (data_.empty()) {
+      VLOG(4) << "resize data to " << sizeof(T);
+      type_hash_ = typeid(T).hash_code();
+      data_.resize(sizeof(T));
+    }
+    PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
+                   "type not matched, origin is %s, want %s",
+                   DataTypeNamer::Global().repr(type_hash_),
+                   DataTypeNamer::Global().repr<T>());
+    PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
+    return *reinterpret_cast<T *>(&data_[0]);
+  }
+
+ private:
+  std::string data_;
+  size_t type_hash_{std::numeric_limits<size_t>::max()};
+};
+
 /*
  * Node Representation.
  *
@@ -50,8 +88,6 @@ class Node {
 
   Node() = default;
 
-  struct Attr;
-
   // Cast to a subclass type, Function for example.
   template <typename Subclass>
   Subclass &As() {
@@ -71,7 +107,7 @@ class Node {
 
   // Get an additional attribute and convert it to T data type. NOTE this will
   // silently create a new attribute if not exists.
-  Attr &attr(const std::string &name) const { return attrs_[name]; }
+  NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
 
   int id() const { return id_; }
 
@@ -80,6 +116,9 @@ class Node {
   void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
   void *pb_desc() const { return attr("pb_desc").Pointer(); }
 
+  void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; }
+  const std::string &pb_msg() const { return attr("pb_msg").String(); }
+
   void SetDeleted() { deleted_ = true; }
   bool deleted() const { return deleted_; }
 
@@ -94,43 +133,6 @@ class Node {
   // Output links.
   std::vector<Node *> outlinks;
 
-  // A helper class to maintain the status from Pass.
-  struct Attr {
-    // NOTE T should be a primary type or a struct combined by several primary
-    // types.
-    // NOTE the STL containers should not use here.
-    // Some usages
-    //   Attr attr;
-    //   attr.Bool() = true;
-
-    bool &Bool() { return As<bool>(); }
-    float &Float() { return As<float>(); }
-    int32_t &Int32() { return As<int32_t>(); }
-    int64_t &Int64() { return As<int64_t>(); }
-    void *&Pointer() { return As<void *>(); }
-
-   private:
-    template <typename T>
-    T &As() {
-      // init storage in the first usage.
-      if (data_.empty()) {
-        VLOG(4) << "resize data to " << sizeof(T);
-        type_hash_ = typeid(T).hash_code();
-        data_.resize(sizeof(T));
-      }
-      PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
-                     "type not matched, origin is %s, want %s",
-                     DataTypeNamer::Global().repr(type_hash_),
-                     DataTypeNamer::Global().repr<T>());
-      PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
-      return *reinterpret_cast<T *>(&data_[0]);
-    }
-
-   private:
-    std::string data_;
-    size_t type_hash_{std::numeric_limits<size_t>::max()};
-  };
-
   // Type checks.
   bool IsFunction() const { return type_ == Node::Type::kFunction; }
   bool IsValue() const { return type_ == Node::Type::kValue; }
@@ -150,7 +152,7 @@ class Node {
   Type type_{Type::kNone};
   // Mark this node is deleted by some pass.
   bool deleted_{false};
-  mutable std::unordered_map<std::string, Attr> attrs_;
+  mutable std::unordered_map<std::string, NodeAttr> attrs_;
 };
 
 class Function;
@@ -213,6 +215,10 @@ class Function : public Node {
 struct FunctionBlock : public Node {
   std::string repr() const override { return "block-" + std::to_string(id()); }
   std::vector<Node *> subgraph;
+
+ protected:
+  FunctionBlock() { SetType(Node::Type::kFunctionBlock); }
+  friend class NodeMap;
 };
 
 class NodeMap {
@@ -227,7 +233,7 @@ class NodeMap {
 
   void Delete(size_t id);
 
-  const std::vector<std::unique_ptr<Node>> &nodes() { return nodes_; }
+  const std::vector<std::unique_ptr<Node>> &nodes() const { return nodes_; }
 
   size_t size() const { return nodes_.size(); }
 
diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/node_attr_flags.h
new file mode 100644
index 0000000000..a3f70e5419
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node_attr_flags.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file contains all the flags that declared in Node::Attr.
+ *
+ * The Node::Attr is designed to share information between different passes, one
+ * can get other's attributes in a Node by the flags in this file.
+ */
+#pragma once
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;
+
+DECLARE_NODE_ATTR(supported_by_tensorrt)  // bool
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 65632b7491..25c566ebfa 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -60,6 +60,9 @@ class Pass {
     return nullptr;
   }
 
+  // Create a debugger Pass that draw the DFG by graphviz toolkit.
+  virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
+
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
   // Run on a single Function.
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
index b17c0e0d72..b428bb22b1 100644
--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -19,6 +19,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+bool PassManager::Initialize(Argument* argument) {
+  argument_ = argument;
+  for (auto& pass : data_) {
+    LOG(INFO) << "Initializing pass " << pass->repr();
+    if (!pass->Initialize(argument)) {
+      LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
+      return false;
+    }
+  }
+  return true;
+}
+
 void DfgPassManager::RunAll() {
   PADDLE_ENFORCE(argument_);
   for (auto& pass : data_) {
diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h
index 7841c4b9d0..81a17e0287 100644
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry<Pass> {
   // globally shared, so pass them as the arguemnts for all the pass managers.
   virtual bool Initialize(const Argument& argument) { return false; }
 
-  virtual bool Initialize(Argument* argument) {
-    argument_ = argument;
-    for (auto& pass : data_) {
-      LOG(INFO) << "Initializing pass " << pass->repr();
-      if (!pass->Initialize(argument)) {
-        LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
-        return false;
-      }
-    }
-    return true;
-  }
+  virtual bool Initialize(Argument* argument);
 
   // Call all the passes' Finalize methods.
   virtual bool Finalize() {
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index 7af6a19951..6caba8f042 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
   manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
   manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
 
+  ASSERT_TRUE(&argument);
   ASSERT_TRUE(manager.Initialize(&argument));
   manager.RunAll();
 }
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 43ccac96c8..389f9e1a91 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
 void SubGraphFuse::ReplaceNodesWithSubGraphs() {
   auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
   for (auto &subgraph : subgraphs) {
+    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
     // replace this sub-graph with the first node. Two steps: 1. Create a Block
     // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
     // as deleted. 3. Replace the deleted node with the new Block Node.
-    auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock);
+    auto *block_node = static_cast<FunctionBlock *>(
+        graph_->nodes.Create(Node::Type::kFunctionBlock));
     auto io = ExtractInputAndOutputOfSubGraph(subgraph);
     block_node->inlinks = std::move(io.first);
     block_node->outlinks = std::move(io.second);
@@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
       // TODO(Superjomn) need a unified mechanism to treat deleted node in each
       // pass.
       node->SetDeleted();
+      block_node->subgraph.push_back(node);
     }
 
-    std::unordered_map<Node *, Node *>
-        delelte_node_map;  // deleted node to BlockNode
-    for (auto *n : block_node->inlinks) {
-      n->inlinks.clear();
-    }
-    for (auto *n : block_node->outlinks) {
-      n->outlinks.clear();
-    }
-    for (auto *n : block_node->inlinks) {
-      n->outlinks.push_back(block_node);
+    // Change all the sub-graph's inputs and outputs corresponding inlink and
+    // outlink to this sub-graph node.
+    auto inlink_or_outlink_cleaner = [&](std::vector<Node *> &nodes) {
+      for (auto *&n : nodes) {
+        if (subgraph_uniq.count(n)) {
+          n = block_node;
+        }
+      }
+      std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
+      nodes.assign(uniq.begin(), uniq.end());
+    };
+    for (auto *i : block_node->inlinks) {
+      inlink_or_outlink_cleaner(i->outlinks);
     }
-    for (auto *n : block_node->outlinks) {
-      n->inlinks.push_back(n);
+    for (auto *&o : block_node->outlinks) {
+      inlink_or_outlink_cleaner(o->inlinks);
     }
   }
 }
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
new file mode 100644
index 0000000000..5ad092a9ed
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/node_attr_flags.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
+  for (auto &node : graph->nodes.nodes()) {
+    node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get());
+  }
+}
+
+class DfgDebuggerPass : public DFG_GraphvizDrawPass {
+ public:
+  DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
+      : DFG_GraphvizDrawPass(config) {}
+
+  std::string repr() const override {
+    return "tensorrt-subgraph-node-mark-debugger";
+  }
+
+  bool Finalize() override { return true; }
+
+ protected:
+  std::string Draw(DataFlowGraph *graph) override {
+    Dot dot;
+    // Add nodes
+    for (size_t i = 0; i < graph->nodes.size(); i++) {
+      const Node &node = graph->nodes.Get(i);
+      if (config_.display_deleted_node || !node.deleted()) {
+        auto dot_attr = node.dot_attrs();
+        if (node.attr(ATTR_supported_by_tensorrt).Bool()) {
+          dot_attr.assign(
+              {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}});
+        }
+        dot.AddNode(node.repr(), dot_attr);
+      }
+    }
+    // Add edges
+    for (size_t i = 0; i < graph->nodes.size(); i++) {
+      const Node &node = graph->nodes.Get(i);
+      if (!config_.display_deleted_node && node.deleted()) continue;
+      for (auto &in : node.inlinks) {
+        if (!config_.display_deleted_node && in->deleted()) continue;
+        dot.AddEdge(in->repr(), node.repr(), {});
+      }
+    }
+    return dot.Build();
+  }
+};
+
+Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
+  DFG_GraphvizDrawPass::Config config(
+      FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
+  return new DfgDebuggerPass(config);
+}
+bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
new file mode 100644
index 0000000000..6cfac55d3b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
+ * that supported by TensorRT engine.
+ */
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Mark the operators that TensorRT engine supports.
+ */
+class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
+ public:
+  using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
+
+  TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {}
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  // This class get a sub-graph as input and determine whether to transform this
+  // sub-graph into TensorRT.
+  void Run(DataFlowGraph* graph) override;
+
+  std::string repr() const { return "tensorrt-sub-subgraph-mark"; }
+  std::string description() const { return "tensorrt sub-graph mark pass"; }
+
+  Pass* CreateGraphvizDebugerPass() const override;
+  bool Finalize() override;
+
+ private:
+  teller_t teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
new file mode 100644
index 0000000000..a6c15e848b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/node_attr_flags.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+  // init
+  FluidToDataFlowGraphPass pass;
+  ASSERT_TRUE(pass.Initialize(&argument));
+  argument.main_dfg.reset(new DataFlowGraph);
+  pass.Run(argument.main_dfg.get());
+
+  TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
+    return node->IsFunction() &&
+           static_cast<const Function*>(node)->func_type() == "mul";
+  };
+  TensorRTSubgraphNodeMarkPass pass1(teller);
+  ASSERT_TRUE(pass1.Initialize(&argument));
+  pass1.Run(argument.main_dfg.get());
+
+  int counter{0};
+  for (auto& node : argument.main_dfg->nodes.nodes()) {
+    counter += node->attr(ATTR_supported_by_tensorrt).Bool();
+  }
+
+  LOG(INFO) << counter << " nodes marked";
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
index c7f40d43c9..9993de2280 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
     : node_inside_subgraph_teller_(teller) {}
 
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
-  SubGraphFuse(graph, node_inside_subgraph_teller_);
+  SubGraphFuse(graph, node_inside_subgraph_teller_)();
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
index 79e9e2bcc9..11e0880695 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -38,6 +38,11 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {
   // sub-graph into TensorRT.
   void Run(DataFlowGraph* graph) override;
 
+  bool Finalize() override { return true; }
+
+  std::string repr() const { return "tensorrt-sub-graph"; }
+  std::string description() const { return "tensorrt sub graph pass"; }
+
  private:
   NodeInsideSubgraphTeller node_inside_subgraph_teller_;
 };
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
index d12dcf0d0f..1d749d3fa3 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -23,49 +23,48 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-DEFINE_string(model_dir, "", "inference test model dir");
+DEFINE_string(dot_dir, "./", "");
 
-TEST(TensorRTSubGraph, single_pass) {
-  auto desc = LoadProgramDesc();
-  auto dfg = ProgramDescToDFG(desc);
-
-  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+TEST_F(DFG_Tester, tensorrt_single_pass) {
+  std::unordered_set<std::string> teller_set(
+      {"elementwise_add", "mul", "sigmoid"});
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
     if (node->type() != Node::Type::kFunction) return false;
     const auto* func = static_cast<const Function*>(node);
-    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
-        func->func_type() == "conv2d" || func->func_type() == "mul" ||
-        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
-      LOG(INFO) << "sub-graph marked " << node->repr();
-      return true;
-    }
+    if (teller_set.count(func->func_type())) return true;
     return false;
   };
 
-  DFG_GraphvizDrawPass::Config config{"./", "test"};
-  DFG_GraphvizDrawPass dfg_pass(config);
-  dfg_pass.Initialize();
-
-  DFG_GraphvizDrawPass dfg_pass1(config);
-  dfg_pass1.Initialize();
-
-  dfg_pass.Run(&dfg);
+  LOG(INFO) << "init";
+  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
+  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
 
+  DFG_GraphvizDrawPass dfg_pass(config);
+  DFG_GraphvizDrawPass dfg_pass1(config1);
+  FluidToDataFlowGraphPass pass0;
   TensorRTSubGraphPass trt_pass(std::move(teller));
-  trt_pass.Initialize();
 
-  trt_pass.Run(&dfg);
+  LOG(INFO) << "Initialize";
+  dfg_pass.Initialize(&argument);
+  dfg_pass1.Initialize(&argument);
+  pass0.Initialize(&argument);
+  trt_pass.Initialize(&argument);
 
-  dfg_pass1.Run(&dfg);
+  LOG(INFO) << "Run";
+  argument.main_dfg.reset(new DataFlowGraph);
+  pass0.Run(argument.main_dfg.get());
+  dfg_pass.Run(argument.main_dfg.get());
+  trt_pass.Run(argument.main_dfg.get());
+  dfg_pass1.Run(argument.main_dfg.get());
 
   // Check the TRT op's block desc
-  for (auto node : dfg.nodes.nodes()) {
+  for (auto& node : argument.main_dfg->nodes.nodes()) {
     if (node->IsFunctionBlock()) {
+      LOG(INFO) << "get function block";
     }
   }
 }
 
-TEST(TensorRTSubGraph, pass_manager) {}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4c338c67d3..9dc39ad0dd 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
     op_library(tensorrt_engine_op DEPS tensorrt_engine)
     nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
+      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+      analysis)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 295d6ba039..1602a913ae 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -53,6 +53,7 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
+    VLOG(4) << "TensorRTEngineKernel executing";
     auto engine_name = context.Attr<std::string>("engine_uniq_key");
     if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 358e2d151b..82a16361e4 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
@@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
   *var = *desc.Proto();
 }
 
-template <typename T>
-void SetAttr(framework::proto::OpDesc* op, const std::string& name,
-             const T& data);
-
-template <>
-void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
-                          const std::string& data) {
-  auto* attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::STRING);
-  attr->set_s(data);
-}
-template <>
-void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
-                  const int& data) {
-  auto* attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::INT);
-  attr->set_i(data);
-}
-template <>
-void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
-                      const int64_t& data) {
-  auto* attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::LONG);
-  attr->set_l(data);
-}
-template <>
-void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
-                                       const std::string& name,
-                                       const std::vector<std::string>& data) {
-  auto* attr = op->add_attrs();
-  attr->set_name(name);
-  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
-  for (const auto& s : data) {
-    attr->add_strings(s.c_str());
-  }
-}
-
 }  // namespace
 
+using inference::analysis::SetAttr;
+
 TEST(TensorRTEngineOp, manual) {
   framework::ProgramDesc program;
   auto* block_ = program.Proto()->add_blocks();

From 921182484103b1b5e6256cb59e77cac8ed1c0272 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Thu, 28 Jun 2018 10:17:33 +0800
Subject: [PATCH 19/28] fix tensorrt compiler bug

---
 paddle/contrib/inference/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 153216abb4..ef768d989a 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -18,7 +18,10 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api paddle_inference_tensorrt_subgraph_engine)
+set(inference_deps paddle_inference_api paddle_fluid_api)
+if(WITH_GPU AND TENSORRT_FOUND)
+    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
+endif()
 
 function(inference_api_test TARGET_NAME)
     if (WITH_TESTING)

From 2ecc56226d4d4b9151fdcfce9ffe5af6aa58eb5b Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Thu, 28 Jun 2018 12:40:51 +0800
Subject: [PATCH 20/28] small AverageOptimizer enhance. (#11761)

* small AverageOptimizer enhance.

* clean

* clean
---
 .../fluid/operators/average_accumulates_op.cc | 22 +++++++++----------
 .../fluid/operators/average_accumulates_op.h  |  5 +++--
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index 25864e95d7..f389eab605 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -19,28 +19,28 @@ namespace operators {
 
 template <>
 void GetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t* num_updates_,
-    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t* num_updates,
+    int64_t* num_accumulates, int64_t* old_num_accumulates) {
   auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
   auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
   auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
 
-  *old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
-  *num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
-  *num_updates_ = in_num_updates->data<int64_t>()[0];
+  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
+  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
+  *num_updates = in_num_updates->data<int64_t>()[0];
 }
 
 template <>
 void SetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx, int64_t num_updates_,
-    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+    const framework::ExecutionContext& ctx, int64_t num_updates,
+    int64_t num_accumulates, int64_t old_num_accumulates) {
   auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
   auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
   auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
 
-  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
-  out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
-  out_num_updates->data<int64_t>()[0] = num_updates_;
+  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
+  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
+  out_num_updates->data<int64_t>()[0] = num_updates;
 }
 
 class AverageAccumulatesOp : public framework::OperatorWithKernel {
@@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddComment(R"DOC(
 AverageAccumulates Operator.
-Accumulate the sum of parameter whtin sliding window. The size of sliding window is
+Accumulate the sum of parameter within sliding window. The size of sliding window is
 determined by 'average_window', 'max_average_window' and 'min_average_window'.
 Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
 'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index 07ac5ced11..3958d3f685 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
     float average_window = ctx.Attr<float>("average_window");
     int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
     int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    min_average_window =
-        std::min<int64_t>(min_average_window, max_average_window);
+    PADDLE_ENFORCE_LE(min_average_window, max_average_window,
+                      "min_average_window shouldn't be larger than "
+                      "max_average_window");
 
     // Get inputs
     auto* param = ctx.Input<Tensor>("param");

From e265e61193560b37673b4dedddca0391d17645b9 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 28 Jun 2018 13:10:12 +0800
Subject: [PATCH 21/28] Add WITH_ANAKIN option log in paddle_build.sh

---
 paddle/scripts/paddle_build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 037688bde9..017ac9d3ec 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -106,6 +106,7 @@ function cmake_gen() {
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_ANAKIN=${WITH_ANAKIN:-ON}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because

From f1224945ba4585e71beb33a30fcd7c88d912f796 Mon Sep 17 00:00:00 2001
From: superjomn <yanchunwei@outlook.com>
Date: Thu, 28 Jun 2018 13:27:30 +0800
Subject: [PATCH 22/28] fix analysis compile bug

---
 paddle/fluid/inference/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index ec16a1c600..7071eea19c 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -28,9 +28,10 @@ endif()
 if(WITH_TESTING)
   # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
-  add_subdirectory(analysis)
 endif()
 
+add_subdirectory(analysis)
+
 if (TENSORRT_FOUND)
   add_subdirectory(tensorrt)
 endif()

From ba99bc238409b3be2e1d42f0d8baaf42b544d774 Mon Sep 17 00:00:00 2001
From: superjomn <yanchunwei@outlook.com>
Date: Thu, 28 Jun 2018 14:02:13 +0800
Subject: [PATCH 23/28] update

---
 .../fluid/inference/analysis/CMakeLists.txt   | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 33b0e3b127..cdd67fdc92 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -13,16 +13,18 @@ cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 
 function (inference_analysis_test TARGET)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS)
-    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    if(WITH_TESTING)
+        set(options "")
+        set(oneValueArgs "")
+        set(multiValueArgs SRCS)
+        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-    cc_test(${TARGET}
-            SRCS "${analysis_test_SRCS}"
-            DEPS analysis
-            ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
-    set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
+        cc_test(${TARGET}
+                SRCS "${analysis_test_SRCS}"
+                DEPS analysis
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
+    endif(WITH_TESTING)
 endfunction(inference_analysis_test)
 
 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)

From 995ead08496ccfe2560fece740be903f50cc7bec Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Thu, 28 Jun 2018 15:50:04 +0800
Subject: [PATCH 24/28] add timeline_cn

---
 doc/fluid/howto/optimization/timeline_cn.md   | 26 +++++++++++++++++++
 .../{timeline.md => timeline_en.md}           |  0
 2 files changed, 26 insertions(+)
 create mode 100644 doc/fluid/howto/optimization/timeline_cn.md
 rename doc/fluid/howto/optimization/{timeline.md => timeline_en.md} (100%)

diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
new file mode 100644
index 0000000000..5d061e1c00
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -0,0 +1,26 @@
+# 如何使用timeline工具做性能分析
+
+1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+	**提示：**
+	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
+
+	```python
+	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+	    for pass_id in range(pass_num):
+	        for batch_id, data in enumerate(train_reader()):
+	            exe.run(fluid.default_main_program(),
+	                    feed=feeder.feed(data),
+	                    fetch_list=[])
+	            ...
+	```
+
+1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+
+1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
+
+	![chrome tracing](./tracing.jpeg)
+
+1. 结果如下图所示，可以放到来查看timetime的细节信息。
+
+	![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline_en.md
similarity index 100%
rename from doc/fluid/howto/optimization/timeline.md
rename to doc/fluid/howto/optimization/timeline_en.md

From 95385de798e9fc6fd3c8b7b47db75dc922028344 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Thu, 28 Jun 2018 22:02:48 +0800
Subject: [PATCH 25/28] fix anakin manylinux build

---
 paddle/contrib/inference/CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index ef768d989a..2cd6ab2bbf 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -61,7 +61,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
 endif()
 
-if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
+if (WITH_ANAKIN) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
     # compile the libinference_anakin_api.a and compile with anakin.so.
@@ -71,10 +71,12 @@ if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
     target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     target_link_libraries(inference_anakin_api anakin anakin_saber_common)
     target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
-    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+    if (WITH_TESTING)
+        cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
                                   ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
                                   DEPS inference_anakin_api)
-    target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+        target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+     endif(WITH_TESTING)
 endif()
 
 if(WITH_TESTING)

From 6711b7b5f174df83a0b06e24167509a15ae1a0bd Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Thu, 28 Jun 2018 22:46:51 +0800
Subject: [PATCH 26/28] fix FeedAndSplitTensorIntoLocalScopes (#11817)

---
 paddle/fluid/framework/parallel_executor.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b53a6f43fb..751b10eeee 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -253,6 +253,9 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
       t->set_lod(lod_tensors[j].lod());
     }
   }
+  for (auto &p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
 }
 
 ParallelExecutor::~ParallelExecutor() {

From 93e25301d75e9be7666af7c206cec67fb2838441 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Fri, 29 Jun 2018 10:39:13 +0800
Subject: [PATCH 27/28] Fix python api of mean iou op. (#11797)

* Fix mean iou op.

* Fix crop layer test.

* Fix unitest

* fix unitest.
---
 python/paddle/fluid/layers/nn.py                   | 10 +++++-----
 python/paddle/fluid/tests/unittests/test_layers.py | 11 ++++++++++-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bc379da4e3..61c01b3b00 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5078,12 +5078,12 @@ def mean_iou(input, label, num_classes):
     out_correct = helper.create_tmp_variable(dtype='int32')
     helper.append_op(
         type="mean_iou",
-        inputs={"predictions": input,
-                "labels": label},
+        inputs={"Predictions": input,
+                "Labels": label},
         outputs={
-            "out_mean_iou": out_mean_iou,
-            "out_wrong": out_wrong,
-            "out_correct": out_correct
+            "OutMeanIou": out_mean_iou,
+            "OutWrong": out_wrong,
+            "OutCorrect": out_correct
         },
         attrs={"num_classes": num_classes})
     return out_mean_iou, out_wrong, out_correct
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 82074955fa..9d4b2d4434 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -401,7 +401,7 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
-    def test_maxout(self):
+    def test_crop(self):
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[3, 5], dtype="float32")
@@ -410,6 +410,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_mean_iou(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[16], dtype='float32')
+            y = layers.data(name='label', shape=[1], dtype='int64')
+            iou = layers.mean_iou(x, y, 2)
+            self.assertIsNotNone(iou)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()

From 02e521e3ac9ec145576e64c1fd52e0e76dbbcb46 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Fri, 29 Jun 2018 10:40:12 +0800
Subject: [PATCH 28/28]  Fix model average on multi-GPUs. (#11814)

* Fix average_accumulate_op for parallel executor.

* Fix model average on multi-GPUs.
---
 python/paddle/fluid/optimizer.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 607a68e256..75ee40fa9c 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1113,7 +1113,6 @@ class ModelAverage(Optimizer):
 
     Args:
         average_window_rate: The rate of average window.
-        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
 
@@ -1122,8 +1121,8 @@ class ModelAverage(Optimizer):
       .. code-block:: python
 
         optimizer = fluid.optimizer.Momentum()
-        _, params_grads = optimizer.minimize(cost)
-        model_average = fluid.optimizer.ModelAverage(params_grads, 0.15,
+        optimizer.minimize(cost)
+        model_average = fluid.optimizer.ModelAverage(0.15,
                                                 min_average_window=10000,
                                                 max_average_window=20000)
         for pass_id in range(args.pass_num):
@@ -1137,7 +1136,6 @@ class ModelAverage(Optimizer):
 
     def __init__(self,
                  average_window_rate,
-                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -1146,21 +1144,16 @@ class ModelAverage(Optimizer):
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
 
-        self.params_grads = [] if params_grads is None else params_grads
-        params = {}
-        for param, grad in self.params_grads:
-            if param.do_model_average != False:
-                params[param.name] = (param, grad)
+        self.params_grads = []
         for param in framework.default_main_program().global_block(
         ).all_parameters():
-            if param.name not in params and param.do_model_average != False:
+            if param.do_model_average != False:
                 grad = param.block.create_var(
                     name=unique_name.generate(".".join([param.name, 'tmp'])),
                     dtype=param.dtype,
                     persistable=False,
                     stop_gradient=True)
-                params[param.name] = (param, grad)
-        self.params_grads = params.values()
+                self.params_grads.append((param, grad))
 
         for param, grad in self.params_grads:
             self._append_average_accumulate_op(param)