From e896926b9c3c8bed544f92ad82d42daac9fac0c0 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 5 Jun 2018 15:42:29 +0800 Subject: [PATCH 01/68] add unit test for dist mnist --- .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/test_dist_mnist.py | 208 ++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c33539f6b5..2f2e9c96c1 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -52,3 +52,4 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL) # since load cudnn libraries, so we use a longer timeout to make this # unit test stability. set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30) +set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py new file mode 100644 index 0000000000..e6bc56cce1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -0,0 +1,208 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal + +SEED = 1 +DTYPE = "float32" + + +# random seed must set before configuring the network. 
+# fluid.default_startup_program().random_seed = SEED +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(batch_size): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def run_pserver(pserver_endpoints, trainers, current_endpoint): + get_model(batch_size=20) + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + exe.run(pserver_prog) + + +class TestDistMnist(unittest.TestCase): + def setUp(self): + self._trainers = 1 + self._pservers = 1 + self._ps_endpoints = "127.0.0.1:9123" + + def start_pserver(self, endpoint): + p = Process( + target=run_pserver, + args=(self._ps_endpoints, self._trainers, endpoint)) + p.start() + return p.pid + + def _wait_ps_ready(self, pid): + retry_times = 5 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(1) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
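                # os.stat raises OSError until the pserver has written the port file,
                # so this loop polls once per second and gives up after 5 retries.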
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def stop_pserver(self, pid): + os.kill(pid, signal.SIGTERM) + + def test_with_place(self): + p = fluid.CUDAPlace() if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + pserver_pid = self.start_pserver(self._ps_endpoints) + self._wait_ps_ready(pserver_pid) + + self.run_trainer(p, 0) + + self.stop_pserver(pserver_pid) + + def run_trainer(self, place, trainer_id): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( + batch_size=20) + t = get_transpiler(trainer_id, + fluid.default_main_program(), self._ps_endpoints, + self._trainers) + + trainer_prog = t.get_trainer_program() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in xrange(10): + for batch_id, data in enumerate(train_reader()): + exe.run(trainer_prog, feed=feeder.feed(data)) + + if (batch_id + 1) % 10 == 0: + acc_set = [] + avg_loss_set = [] + for test_data in test_reader(): + acc_np, avg_loss_np = exe.run( + program=test_program, + feed=feeder.feed(test_data), + fetch_list=[batch_acc, avg_cost]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val = np.array(acc_set).mean() + avg_loss_val = np.array(avg_loss_set).mean() + if float(acc_val + ) > 0.2: # Smaller value to increase CI speed + return + else: + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_val), float(acc_val))) + if math.isnan(float(avg_loss_val)): + assert ("got Nan loss, training failed.") + + +if __name__ == "__main__": + unittest.main() From a158bd9173d745b4969e39fcc1c00681a791d251 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 5 Jun 2018 17:15:18 +0800 Subject: [PATCH 02/68] fix ci --- python/paddle/fluid/tests/unittests/test_dist_mnist.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index e6bc56cce1..e4d39c8b3f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -27,6 +27,7 @@ import signal SEED = 1 DTYPE = "float32" +paddle.dataset.mnist.fetch() # random seed must set before configuring the network. 
@@ -147,7 +148,7 @@ class TestDistMnist(unittest.TestCase): os.kill(pid, signal.SIGTERM) def test_with_place(self): - p = fluid.CUDAPlace() if core.is_compiled_with_cuda( + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() pserver_pid = self.start_pserver(self._ps_endpoints) From df7a1471fd1c2c003a8df1af152cd1da28f7f568 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 5 Jun 2018 19:21:35 +0800 Subject: [PATCH 03/68] fix fetch mnist dataset failed --- python/paddle/v2/dataset/mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 9f675bed89..2b959c48e4 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -112,7 +112,7 @@ def fetch(): paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) def convert(path): From cdb705d19360cdf2ad4de0bbe4f129a70f6ca28c Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 6 Jun 2018 13:27:22 +0800 Subject: [PATCH 04/68] fix mnist dataset md5 --- python/paddle/dataset/mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 6a1b8b5fac..9d05aeeb95 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -111,7 +111,7 @@ def fetch(): paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) def convert(path): From 05b6aa180594e379adfa4d5ba44d3e9ac937f5b2 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 6 Jun 2018 14:02:32 +0800 Subject: [PATCH 05/68] increase dist unit test timeout --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2f2e9c96c1..32176fc7e5 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -51,5 +51,5 @@ py_test_modules(test_dist_train MODULES test_dist_train SERIAL) # FIXME(Yancey1989): this test would cost much more time on CUDAPlace # since load cudnn libraries, so we use a longer timeout to make this # unit test stability. 
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 30) -set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 60) +set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) From 741046e8ea5657d857fae7fd0560af0b86e630c0 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 6 Jun 2018 16:57:21 +0800 Subject: [PATCH 06/68] Fix and enhance beam_search_op and beam_searc_decode_op to be comparable with python beam search --- paddle/fluid/operators/CMakeLists.txt | 4 +- .../fluid/operators/beam_search_decode_op.cc | 42 +++++-- .../fluid/operators/beam_search_decode_op.h | 116 +++++++++++++++++- paddle/fluid/operators/beam_search_op.cc | 83 +++++++++---- paddle/fluid/operators/beam_search_op.h | 46 ++++--- .../operators/tensor_array_read_write_op.cc | 5 +- python/paddle/fluid/layers/nn.py | 9 +- 7 files changed, 243 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7fce138e3f..4c7691b776 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -288,8 +288,8 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) -cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) +# cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +# cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index c3dd22119d..39877cfdc1 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/beam_search_decode_op.h" +#include #include + +#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -22,8 +24,11 @@ namespace operators { struct BeamSearchDecodeFunctor { BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, - LoDTensor* id_tensor, LoDTensor* score_tensor) - : step_ids_origin_(step_ids), + LoDTensor* id_tensor, LoDTensor* score_tensor, + size_t beam_size, int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), step_scores_origin_(step_scores), id_tensor_(id_tensor), score_tensor_(score_tensor) { @@ -67,6 +72,8 @@ struct BeamSearchDecodeFunctor { void operator()() const; bool tensor_on_gpu_; + size_t beam_size_; + int end_id_; const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); @@ -77,14 +84,18 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::operator()() const { - BeamSearchDecoder beam_search_decoder; + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. If so, use the CPU copy instead if (tensor_on_gpu_) { - beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - score_tensor_); + // beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, + // score_tensor_); + beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, + score_tensor_); } else { - beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, - id_tensor_, score_tensor_); + // beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, + // id_tensor_, score_tensor_); + beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); } } @@ -122,13 +133,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase { "Level of LodTensor should be 2"); } + size_t beam_size = ctx.Attr("beam_size"); + int end_id = ctx.Attr("end_id"); + // prepare output LoDTensor* sentenceIds = ctx.Output("SentenceIds"); LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( framework::ToDataType(scores->at(0).type()), - BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, + beam_size, end_id)); } }; @@ -147,6 +162,9 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("SentenceScores", "(LodTensor)" "All possible result sentences of word scores"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); AddComment(R"DOC( Pack the result of Beam search op into SentenceIds and SentenceScores. 
)DOC"); @@ -172,10 +190,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_ids = block->FindRecursiveOrCreateVar(o); + sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_scores = block->FindRecursiveOrCreateVar(o); + sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 3c01f81c83..322838951b 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" @@ -72,6 +74,9 @@ using SentenceVector = std::vector>; template struct BeamSearchDecoder { + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} + /** * make a BeamNode and all it's related prefix BeanNode into a Sentence. */ @@ -103,7 +108,8 @@ struct BeamSearchDecoder { */ void ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; + LoDTensor* score_tensor, bool reverse = false, + bool sort_by_score = true) const; /** * Pack all steps of id/score LodTensor into sentence LoDTensor @@ -121,6 +127,13 @@ struct BeamSearchDecoder { void PackAllSteps(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, LoDTensor* id_tensor, LoDTensor* score_tensor) const; + + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; + + size_t beam_size_; + int end_id_; }; template @@ -200,7 +213,7 @@ std::vector> BeamSearchDecoder::PackTwoSteps( template void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const { + LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { size_t src_num = sentence_vector_list.size(); PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); @@ -211,11 +224,29 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } for (Sentence& sentence : sentence_vector_list[src_idx]) { - id_data.insert(id_data.end(), sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert(score_data.end(), sentence.scores.begin(), - sentence.scores.end()); + if (reverse) { + id_data.insert(id_data.end(), sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert(score_data.end(), sentence.scores.rbegin(), + sentence.scores.rend()); + } else { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + 
sentence.scores.end()); + } + sentence_level_lod.push_back(sentence_level_lod.back() + sentence.word_ids.size()); } @@ -278,5 +309,78 @@ void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, score_tensor); } +template +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); + PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), + "step_ids and step_scores should be the same"); + const size_t step_num = step_ids.size(); + const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list( + src_num, std::vector()); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } + } + } + + ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, + score_tensor, true, true); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index df0b50881f..9b462ef8d0 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,25 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" - #include +#include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto &high_level = abs_lod[lod_level_]; - auto items = SelectTopBeamSizeItems(); + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = ToMap(items, high_level.back()); VLOG(3) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { @@ -39,7 +41,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, VLOG(3) << ItemToString(item); } } - PruneEndidCandidates(pre_ids, &selected_items); + + PruneEndBeams(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( std::begin(selected_items), std::end(selected_items), 0, @@ -61,12 +64,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, size_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); - sort(items.begin(), items.end(), [](const Item &a, const Item &b) { - if (a.offset < b.offset) { - return true; - } - return a.id < b.id; - }); for (auto &item : items) { ids_data[low_offset] = item.id; scores_data[low_offset] = item.score; @@ -86,6 +83,33 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, selected_scores->set_lod(lod); } +void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, + std::vector> *items) { + auto *pre_ids_data = pre_ids.data(); + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + for (size_t src_idx = 0; src_idx < high_level.size(); ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id_) || + pre_ids_data[offset] != end_id_) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) + items->at(offset).clear(); + } + } +} + int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, std::vector> *items) { auto *pre_ids_data = pre_ids.data(); @@ -115,13 +139,14 @@ std::vector> BeamSearch::ToMap( return result; } -std::vector> -BeamSearch::SelectTopBeamSizeItems() { +std::vector> BeamSearch::SelectTopBeamSizeItems( + const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores) { std::vector> result; std::vector items; // for each source sentence, select the top beam_size items across all // candidate sets. - while (NextItemSet(&items)) { + while (NextItemSet(pre_ids, pre_scores, &items)) { std::nth_element(std::begin(items), std::begin(items) + beam_size_, std::end(items), [](const Item &a, const Item &b) { // TODO(superjom) make score's comparation customizable. 
@@ -146,7 +171,9 @@ BeamSearch::SelectTopBeamSizeItems() { } // the candidates of a source -bool BeamSearch::NextItemSet(std::vector *items) { +bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, + std::vector *items) { if (sent_offset_ >= ids_->NumElements(lod_level_)) { return false; } @@ -164,14 +191,25 @@ bool BeamSearch::NextItemSet(std::vector *items) { instance_dim *= ids.dims()[i]; } + auto *pre_ids_data = pre_ids.data(); + auto *pre_scores_data = pre_scores.data(); items->clear(); items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id_) { + // Allocate all probability mass to eos_id for finished branchs and the + // other + // candidate ids can be ignored. + items->emplace_back(offset, end_id_, pre_score); + } else { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } } } @@ -199,7 +237,8 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // inputs and outputs stored in proto - AddInput("pre_ids", "ids in previous step"); + AddInput("pre_ids", "ids in the previous step"); + AddInput("pre_scores", "accumulated scores in the previous step"); AddInput("ids", "a LoDTensor of shape of [None,k]"); AddInput("scores", "a LoDTensor that has the same shape and LoD with `ids`"); @@ -253,10 +292,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("selected_ids")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_ids = block->FindRecursiveOrCreateVar(o); + selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto &o : op_desc.Output("selected_scores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_scores = block->FindRecursiveOrCreateVar(o); + selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 46bc4f6f93..a595726f12 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -132,6 +132,7 @@ class BeamSearch { * that means no candidates is provided, and the task will stop running. */ void operator()(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); /* @@ -152,6 +153,14 @@ class BeamSearch { }; protected: + /* + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing, since the end tokens + * must be writed out. Also the finished branchs with top 1 score can + * be pruned. + */ + void PruneEndBeams(const framework::LoDTensor& pre_ids, + std::vector>* items); /* * Delete all the records that follows the end token. 
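   * NOTE: operator() no longer calls this; PruneEndBeams above now drops whole
   * finished beams instead.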
*/ @@ -160,7 +169,7 @@ class BeamSearch { /* * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance + * NOTE low performance. */ std::vector> ToMap( const std::vector>& inputs, size_t element_num); @@ -168,12 +177,16 @@ class BeamSearch { /* * For each source, select top beam_size records. */ - std::vector> SelectTopBeamSizeItems(); + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores); /* * Get the items of next source sequence, return false if no remaining items. */ - bool NextItemSet(std::vector* items); + bool NextItemSet(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, + std::vector* items); private: size_t beam_size_; @@ -192,24 +205,25 @@ template class BeamSearchOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* ids_var = context.Input("ids"); - auto* scores_var = context.Input("scores"); - auto* pre_ids_var = context.Input("pre_ids"); - PADDLE_ENFORCE_NOT_NULL(ids_var); - PADDLE_ENFORCE_NOT_NULL(scores_var); - PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + auto* ids = context.Input("ids"); + auto* scores = context.Input("scores"); + auto* pre_ids = context.Input("pre_ids"); + auto* pre_scores = context.Input("pre_scores"); + PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); + PADDLE_ENFORCE_NOT_NULL(pre_ids); + PADDLE_ENFORCE_NOT_NULL(pre_scores); size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); - auto selected_ids_var = - context.Output("selected_ids"); - auto selected_scores_var = + BeamSearch alg(*ids, *scores, level, beam_size, end_id); + auto selected_ids = context.Output("selected_ids"); + auto selected_scores = context.Output("selected_scores"); - PADDLE_ENFORCE_NOT_NULL(selected_ids_var); - PADDLE_ENFORCE_NOT_NULL(selected_scores_var); - alg(*pre_ids_var, selected_ids_var, selected_scores_var); + PADDLE_ENFORCE_NOT_NULL(selected_ids); + PADDLE_ENFORCE_NOT_NULL(selected_scores); + alg(*pre_ids, *pre_scores, selected_ids, selected_scores); } }; } // namespace operators diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc index c703d11eec..a2d44284e9 100644 --- a/paddle/fluid/operators/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } + auto *out_tensor = &out->at(offset); + out_tensor->set_lod(x_tensor.lod()); if (x_tensor.memory_size() > 0) { - auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); TensorCopy(x_tensor, place, dev_ctx, out_tensor); - out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 561c8bd42f..c753caa7e9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1686,7 +1686,7 @@ def layer_norm(input, return helper.append_activation(layer_norm_out) -def beam_search_decode(ids, scores, name=None): +def beam_search_decode(ids, scores, beam_size, end_id, name=None): 
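    # beam_size and end_id are forwarded to the beam_search_decode op, which uses
    # them to back-trace the finished hypotheses of each source sentence and to
    # skip redundant end tokens when packing the output LoDTensors.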
helper = LayerHelper('beam_search_decode', **locals()) sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) @@ -1698,7 +1698,9 @@ def beam_search_decode(ids, scores, name=None): outputs={ "SentenceIds": sentence_ids, "SentenceScores": sentence_scores - }) + }, + attrs={"beam_size": beam_size, + "end_id": end_id}) return sentence_ids, sentence_scores @@ -1926,7 +1928,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp -def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): +def beam_search(pre_ids, pre_scores, ids, scores, beam_size, end_id, level=0): ''' This function implements the beam search algorithm. ''' @@ -1941,6 +1943,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): type='beam_search', inputs={ 'pre_ids': pre_ids, + 'pre_scores': pre_scores, 'ids': ids, 'scores': scores, }, From a281e1016ec7bbd5e019a85a4b55bf7cb6107a19 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 7 Jun 2018 19:20:08 +0800 Subject: [PATCH 07/68] Make cc_test of beam_search_op and beam_searc_decode_op run correctly --- paddle/fluid/operators/CMakeLists.txt | 4 +- .../fluid/operators/beam_search_decode_op.cc | 4 - .../fluid/operators/beam_search_decode_op.h | 184 +----------------- .../operators/beam_search_decode_op_test.cc | 148 +++----------- paddle/fluid/operators/beam_search_op.cc | 21 +- paddle/fluid/operators/beam_search_op.h | 10 +- paddle/fluid/operators/beam_search_op_test.cc | 15 +- 7 files changed, 50 insertions(+), 336 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4c7691b776..7fce138e3f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -288,8 +288,8 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) -# cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -# cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) +cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 39877cfdc1..b518c11e8c 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -87,13 +87,9 @@ void BeamSearchDecodeFunctor::operator()() const { BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. 
If so, use the CPU copy instead if (tensor_on_gpu_) { - // beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - // score_tensor_); beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, score_tensor_); } else { - // beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, - // id_tensor_, score_tensor_); beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, id_tensor_, score_tensor_); } diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 322838951b..1da4fe26af 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -28,41 +28,11 @@ using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. // The First is source level, the second is sentence level. -// source level describe how many candidate words for this source. -// sentence level describe these candidates belong to which prefix +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; const size_t kSentenceLevel = 1; -template -struct BeamNode { - BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} - - ~BeamNode() { - if (parent_) { - parent_->DropKid(this); - if (parent_->kids_.size() == 0UL) { - delete parent_; - } - } - VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; - } - - void AppendTo(BeamNode* parent) { - parent_ = parent; - parent->kids_.insert(this); - } - - void DropKid(BeamNode* kid) { kids_.erase(kid); } - - BeamNode* parent_ = nullptr; - std::unordered_set kids_; - int64_t word_id_; - T score_; -}; - -template -using BeamNodeVector = std::vector>>; - template struct Sentence { std::vector word_ids; @@ -77,25 +47,6 @@ struct BeamSearchDecoder { BeamSearchDecoder(size_t beam_size, int end_id) : beam_size_(beam_size), end_id_(end_id) {} - /** - * make a BeamNode and all it's related prefix BeanNode into a Sentence. - */ - Sentence MakeSentence(const BeamNode* node) const; - - /** - * Param: - * cur_ids: LoDTensor of One step for word ID - * cur_scores: LoDTensor of One Step for word score - * prefixes_list: prefixes for each source sentence. - * sentence_vector_list: result sentence_vector for each source sentence. - * Return: - * a new prefixes list for each source of current step - */ - std::vector> PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const; - /** * convert the result sentence_vector for each source sentence into two * LodTensor. @@ -105,29 +56,18 @@ struct BeamSearchDecoder { * sentence_vector_list: sentence_vector for each source sentence. * id_tensor: result LoDTensor for sentences of id. * score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. 
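+   * Backtrace builds each sentence from the last step to the first, so it calls
+   * this with reverse = true to restore the chronological word order.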
*/ void ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor, bool reverse = false, + LoDTensor* score_tensor, bool reverse = true, bool sort_by_score = true) const; /** - * Pack all steps of id/score LodTensor into sentence LoDTensor - * it's main logic is: - * ```python - * prefix - * result_sentence - * result_lod_tensor - * - * for (step in steps): - * prefix = PackTwoSteps(prefix, step, &result_sentence) - * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) - * ``` + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. */ - void PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; - void Backtrace(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, LoDTensor* id_tensor, LoDTensor* score_tensor) const; @@ -136,80 +76,6 @@ struct BeamSearchDecoder { int end_id_; }; -template -Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { - Sentence sentence; - while (node != nullptr) { - sentence.word_ids.emplace_back(node->word_id_); - sentence.scores.emplace_back(node->score_); - node = node->parent_; - } - - std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); - std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); - - return sentence; -} - -template -std::vector> BeamSearchDecoder::PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const { - std::vector> result; - - for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; - ++src_idx) { - size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - - BeamNodeVector beam_nodes; - - // if prefixes size is 0, it means this is the first step. In this step, - // all candidate id is the start of candidate sentences. 
- if (prefixes_list->empty()) { - PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), - cur_ids.lod().at(kSentenceLevel).back(), - "in the first step"); - for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { - beam_nodes.push_back(std::unique_ptr>(new BeamNode( - cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); - } - } else { - BeamNodeVector& prefixes = prefixes_list->at(src_idx); - SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; - - PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), - "prefix and candidate set number should be the same"); - - auto candidate_offset = cur_ids.lod()[kSentenceLevel]; - for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { - std::unique_ptr>& prefix = prefixes[prefix_idx]; - size_t candidate_start = candidate_offset[src_start + prefix_idx]; - size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; - if (candidate_start == candidate_end) { - VLOG(3) << "this sentence has no more candidate, " - "add to result sentence and rm it from beam tree"; - sentence_vector.push_back(MakeSentence(prefix.get())); - prefix.reset(); - } else { - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; ++candidate_idx) { - auto* candidate = - new BeamNode(cur_ids.data()[candidate_idx], - cur_scores.data()[candidate_idx]); - candidate->AppendTo(prefix.get()); - beam_nodes.push_back(std::unique_ptr>(candidate)); - } - prefix.release(); - } - } - } - result.push_back(std::move(beam_nodes)); - } - return result; -} - template void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, @@ -273,42 +139,6 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( framework::TensorFromVector(score_data, cpu_ctx, score_tensor); } -template -void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { - PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); - PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), - "step_ids and step_scores should be the same"); - const size_t step_num = step_ids.size(); - const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; - - PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); - - // previous prefixes for each step, - // the init length is 0, means this is the first step. 
- std::vector> beamnode_vector_list(0); - std::vector> sentence_vector_list(src_num); - - // pack all steps for one batch first, then another batch - for (size_t step_id = 0; step_id < step_num; ++step_id) { - beamnode_vector_list = - PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), - &beamnode_vector_list, &sentence_vector_list); - } - // append last beam_node to result - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - for (auto& beam_node : beamnode_vector_list.at(src_idx)) { - sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); - beam_node.reset(); - } - } - - ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, - score_tensor); -} - template void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index 36f9594969..c6cccafcf4 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD; using LoDTensor = paddle::framework::LoDTensor; using LoDTensorArray = paddle::framework::LoDTensorArray; -template -using BeamNode = paddle::operators::BeamNode; template using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; template using Sentence = paddle::operators::Sentence; template -using BeamNodeVector = paddle::operators::BeamNodeVector; -template using SentenceVector = paddle::operators::SentenceVector; namespace paddle { @@ -77,138 +73,50 @@ void GenerateExample(const std::vector& level_0, } // namespace test } // namespace paddle -TEST(BeamSearchDecodeOp, DeleteBeamNode) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* b2 = new BeamNode(2, 2); - auto* b3 = new BeamNode(3, 3); - - b1->AppendTo(root); - b2->AppendTo(root); - b3->AppendTo(b1); - - delete b3; - delete b2; -} - -TEST(BeamSearchDecodeOp, MakeSentence) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* end = new BeamNode(2, 2); - b1->AppendTo(root); - end->AppendTo(b1); - - BeamSearchDecoder helper; - Sentence sentence = helper.MakeSentence(end); - delete end; - - std::vector expect_ids = {0, 1, 2}; - ASSERT_EQ(sentence.word_ids, expect_ids); - - std::vector expect_scores = {0, 1, 2}; - ASSERT_EQ(sentence.scores, expect_scores); -} - -TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { - CPUPlace place; - - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample( - std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - BeamSearchDecoder helper; - beamnode_vector_list = helper.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - ASSERT_EQ(beamnode_vector_list.size(), 2UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); -} - -TEST(BeamSearchDecodeOp, PackTwoSteps) { - CPUPlace place; - - // first source has three prefix - BeamNodeVector source0_prefixes; - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(1, 1))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(0, 0))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(3, 3))); - - // second source has two prefix - BeamNodeVector source1_prefixes; - source1_prefixes.push_back( - 
std::unique_ptr>(new BeamNode(4, 4))); - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(5, 5))); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - beamnode_vector_list.push_back(std::move(source0_prefixes)); - beamnode_vector_list.push_back(std::move(source1_prefixes)); - - // generate data for one step - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample(std::vector{0, 3, 5}, - std::vector{0, 1, 1, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); - - BeamSearchDecoder helper1; - beamnode_vector_list = helper1.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - - ASSERT_EQ(sentence_vector_list[0].size(), 1UL); - ASSERT_EQ(sentence_vector_list[1].size(), 0UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); -} - -TEST(BeamSearchDecodeOp, PackAllSteps) { +TEST(BeamSearchDecodeOp, Backtrace) { CPUPlace place; - // we will constuct a sample data with 3 steps and 2 source sentences + // we will constuct a sample data with 4 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 LoDTensorArray ids; LoDTensorArray scores; paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + std::vector{0, 1, 2}, std::vector{0, 1, 2}, + std::vector{0, 0}, &ids, &scores); // start with start_id + paddle::test::GenerateExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, - std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); - paddle::test::GenerateExample(std::vector{0, 3, 6}, - std::vector{0, 0, 1, 2, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, + 2}, // the branchs of the first source sentence + // are pruned since finished + std::vector{5, 1}, + &ids, &scores); - ASSERT_EQ(ids.size(), 3UL); - ASSERT_EQ(scores.size(), 3UL); + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); - BeamSearchDecoder helper; + BeamSearchDecoder helper(2, 1); // beam_size = 2, end_id = 1 LoDTensor id_tensor; LoDTensor score_tensor; - helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + helper.Backtrace(ids, scores, &id_tensor, &score_tensor); LoD lod = id_tensor.lod(); - std::vector expect_source_lod = {0, 4, 8}; + std::vector expect_source_lod = {0, 2, 4}; EXPECT_EQ(lod[0], expect_source_lod); - std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; EXPECT_EQ(lod[1], expect_sentence_lod); - // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 - std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, - 4, 3, 2, 4, 4, 3, 6, 5, 4}; + std::vector expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4, + 5, 3, 5, 0, 4, 5, 3, 1}; ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); for (size_t i = 0; i < expect_data.size(); ++i) { ASSERT_EQ(id_tensor.data()[i], diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 9b462ef8d0..6d936a7142 100644 --- 
a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include @@ -110,23 +109,6 @@ void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, } } -int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - - int res = 0; - for (size_t offset = 0; offset < items->size(); offset++) { - auto prefix_id = pre_ids_data[offset]; - if (prefix_id == end_id_) { - items->at(offset).clear(); - } else { - res++; - } - } - - return res; -} - std::vector> BeamSearch::ToMap( const std::vector> &items, size_t element_num) { std::vector> result; @@ -201,8 +183,7 @@ bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, auto pre_score = pre_scores_data[offset]; if (pre_id == end_id_) { // Allocate all probability mass to eos_id for finished branchs and the - // other - // candidate ids can be ignored. + // other candidate ids can be ignored. items->emplace_back(offset, end_id_, pre_score); } else { for (size_t d = 0; d < instance_dim; d++) { diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index a595726f12..b5e2ed0592 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -155,17 +155,11 @@ class BeamSearch { protected: /* * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing, since the end tokens - * must be writed out. Also the finished branchs with top 1 score can - * be pruned. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. */ void PruneEndBeams(const framework::LoDTensor& pre_ids, std::vector>* items); - /* - * Delete all the records that follows the end token. - */ - int PruneEndidCandidates(const framework::LoDTensor& pre_ids, - std::vector>* items); /* * Transform the items into a map whose key is offset, value is the items. 
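A minimal sketch of how one decoding step might wire the updated Python layers together (illustrative only: `probs` is assumed to be the per-step word probability tensor, `pre_ids`/`pre_scores` the selections of the previous step inside the decoder while-loop, `ids_array`/`scores_array` the LoDTensorArrays that collect every step's selections via layers.array_write, and `beam_size`/`end_id` plain Python ints):

import paddle.fluid.layers as layers

# select the top-k candidate words and accumulate their log-probabilities
# onto the scores of the prefixes they extend
topk_scores, topk_indices = layers.topk(probs, k=beam_size)
accu_scores = layers.elementwise_add(
    x=layers.log(topk_scores), y=layers.reshape(pre_scores, shape=[-1]), axis=0)
selected_ids, selected_scores = layers.beam_search(
    pre_ids, pre_scores, topk_indices, accu_scores, beam_size, end_id, level=0)

# after the while-loop finishes, back-trace the beams into whole translations
translation_ids, translation_scores = layers.beam_search_decode(
    ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=end_id)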
diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index ec666359aa..df30c0a54c 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,7 +30,7 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 1, 4}); + vector level0({0, 2, 4}); vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); @@ -64,17 +64,22 @@ TEST(beam_search_op, run) { for (int i = 0; i < 4; i++) { pre_ids.mutable_data(place)[i] = i + 1; } + LoDTensor pre_scores; + pre_scores.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_scores.mutable_data(place)[i] = 0.1; + } - BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); LoDTensor sids, sscores; - beamsearch(pre_ids, &sids, &sscores); + beamsearch(pre_ids, pre_scores, &sids, &sscores); LOG(INFO) << "score: " << sscores << endl; ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids({2, 4, 3, 8}); - vector tscores({0.3, 0.5, 0.9, 0.7}); + vector tids({4, 2, 3, 8}); + vector tscores({0.5, 0.6, 0.9, 0.7}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); From 5e20a8ef931faf11a21a31bce553f8354b2f3958 Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 8 Jun 2018 16:34:22 +0800 Subject: [PATCH 08/68] Make python unit test of beam_search_op and beam_searc_decode_op run correctly --- .../operators/beam_search_decode_op_test.cc | 2 +- paddle/fluid/operators/beam_search_op_test.cc | 2 +- .../unittests/test_beam_search_decode_op.py | 70 +++++++++++-------- .../tests/unittests/test_beam_search_op.py | 24 +++++-- 4 files changed, 63 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index c6cccafcf4..88339e38d8 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -76,7 +76,7 @@ void GenerateExample(const std::vector& level_0, TEST(BeamSearchDecodeOp, Backtrace) { CPUPlace place; - // we will constuct a sample data with 4 steps and 2 source sentences + // Construct sample data with 5 steps and 2 source sentences // beam_size = 2, start_id = 0, end_id = 1 LoDTensorArray ids; LoDTensorArray scores; diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index df30c0a54c..c4f4b478fb 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -67,7 +67,7 @@ TEST(beam_search_op, run) { LoDTensor pre_scores; pre_scores.Resize(framework::make_ddim(vector(4, 1))); for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1; + pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); } BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 7976dd7c3f..877accafb1 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -32,32 +32,44 @@ class TestBeamSearchDecodeOp(unittest.TestCase): def test_get_set(self): ids = self.scope.var("ids").get_lod_tensor_array() - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], - np.array( - [1, 2, 3, 4, 5, 6], 
dtype="int64")) - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="int64")) - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], - np.array( - [0, 1, 2, 3, 4], dtype="int64")) - scores = self.scope.var("scores").get_lod_tensor_array() - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="float64")) - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="float64")) - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], - np.array( - [0, 1, 2, 3, 4], dtype="float64")) + # Construct sample data with 5 steps and 2 source sentences + # beam_size = 2, end_id = 1 + # start with start_id + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 1, 2]], np.array( + [0, 0], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 2, 4]], + np.array( + [2, 3, 4, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 2, 2, 4, 4]], + np.array( + [3, 1, 5, 4], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 1, 2, 3, 4]], + np.array( + [1, 1, 3, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 0, 0, 2, 2]], + np.array( + [5, 1], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor() @@ -69,16 +81,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase): Scores="scores", # outputs SentenceIds="sentence_ids", - SentenceScores="sentence_scores") + SentenceScores="sentence_scores", + beam_size=2, + end_id=1, ) beam_search_decode_op.run(self.scope, self.place) - expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] + expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]] self.assertEqual(sentence_ids.lod(), expected_lod) self.assertEqual(sentence_scores.lod(), expected_lod) expected_data = np.array( - [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64") + [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) self.assertTrue( np.array_equal(np.array(sentence_scores), expected_data)) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index bc708f3aff..6fdf4a3086 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -29,6 +29,7 @@ class BeamSearchOpTester(unittest.TestCase): def setUp(self): self.scope = core.Scope() self._create_ids() + self._create_pre_scores() self._create_scores() self._create_pre_ids() self.scope.var('selected_ids') @@ -37,7 +38,8 @@ class BeamSearchOpTester(unittest.TestCase): def test_run(self): op = Operator( 'beam_search', - pre_ids="pre_ids", + pre_ids='pre_ids', + pre_scores='pre_scores', ids='ids', scores='scores', selected_ids='selected_ids', @@ -47,15 +49,27 @@ class BeamSearchOpTester(unittest.TestCase): end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() - print 'selected_ids', np.array(selected_ids) 
- print 'lod', selected_ids.lod() + selected_scores = self.scope.find_var("selected_scores").get_tensor() + self.assertTrue( + np.allclose( + np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) + self.assertTrue( + np.allclose( + np.array(selected_scores), + np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) + self.assertEqual(selected_ids.lod(), + [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') - tensor = create_tensor(self.scope, "pre_ids", np_data) + tensor = create_tensor(self.scope, 'pre_ids', np_data) + + def _create_pre_scores(self): + np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32') + tensor = create_tensor(self.scope, 'pre_scores', np_data) def _create_ids(self): - self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]] + self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]] np_data = np.array( [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) From 1eeb11ef6190a7697cdce7914646a0d6163e7597 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 12 Jun 2018 02:43:44 +0000 Subject: [PATCH 09/68] refine ZeroGradFunctor in activation_op.h --- paddle/fluid/operators/activation_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 9124151926..497a233338 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -353,7 +353,7 @@ struct ZeroGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = static_cast(0) / out; + dx.device(d) = out.constant(static_cast(0)); } }; From 592f84a4af61985e03153e9e9740ece57ab3c58f Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 12 Jun 2018 23:33:26 +0800 Subject: [PATCH 10/68] Complete the docs of beam_search_op, beam_searc_decode_op and the python wrapper --- .../fluid/operators/beam_search_decode_op.cc | 29 +++-- .../fluid/operators/beam_search_decode_op.h | 2 +- paddle/fluid/operators/beam_search_op.cc | 53 +++++--- python/paddle/fluid/layers/nn.py | 115 +++++++++++++++++- .../tests/book/test_machine_translation.py | 16 ++- 5 files changed, 182 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index b518c11e8c..57496dd2bb 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -148,21 +148,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Ids", "(LodTensorArray)" - "score of the candidate words in each step"); + "The LodTensorArray containing the selected ids of all steps"); AddInput("Scores", "(LodTensorArray)" - "score of the candidate words in each step"); - AddOutput("SentenceIds", - "(LodTensor)" - "All possible result sentences of word ids"); - AddOutput("SentenceScores", - "(LodTensor)" - "All possible result sentences of word scores"); + "The LodTensorArray containing the selected scores of all steps"); + AddOutput( + "SentenceIds", + "(LodTensor)" + "An LodTensor containing all generated id sequences for all source " + "sentences"); + AddOutput( + "SentenceScores", + "(LodTensor)" + "An LodTensor containing scores corresponding to Output(SentenceIds)"); AddAttr("beam_size", "beam size for beam search"); AddAttr("end_id", "the token id which indicates the end of a sequence"); AddComment(R"DOC( 
-Pack the result of Beam search op into SentenceIds and SentenceScores. +Beam Search Decode Operator. This Operator constructs the full hypotheses for +each source sentence by walking back along the LoDTensorArray Input(ids) +whose lods can be used to restore the path in the beam search tree. + +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each +hypothesis has. )DOC"); } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 1da4fe26af..bb5936a095 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -27,7 +27,7 @@ using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. -// The First is source level, the second is sentence level. +// The first is source level, the second is sentence level. // source level describe how many prefixes (branchs) for each source sentece // (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 6d936a7142..89e74e35d8 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -129,12 +129,9 @@ std::vector> BeamSearch::SelectTopBeamSizeItems( // for each source sentence, select the top beam_size items across all // candidate sets. while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element(std::begin(items), std::begin(items) + beam_size_, - std::end(items), [](const Item &a, const Item &b) { - // TODO(superjom) make score's comparation customizable. - // partial sort in descending order - return a.score > b.score; - }); + std::nth_element( + std::begin(items), std::begin(items) + beam_size_, std::end(items), + [](const Item &a, const Item &b) { return a.score > b.score; }); // prune the top beam_size items. if (items.size() > beam_size_) { items.resize(beam_size_); @@ -218,16 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // inputs and outputs stored in proto - AddInput("pre_ids", "ids in the previous step"); - AddInput("pre_scores", "accumulated scores in the previous step"); - AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("pre_ids", + "(LoDTensor) The LoDTensor containing the selected ids at the " + "previous step. It should be a tensor with shape (batch_size, 1) " + "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " + "thefirst step."); + AddInput("pre_scores", + "(LoDTensor) The LoDTensor containing the accumulated " + "scores corresponding to the selected ids at the previous step."); + AddInput("ids", + "(LoDTensor) The LoDTensor containing the candidates ids. 
Its " + "shape should be (batch_size * beam_size, K), where K supposed to " + "be beam_size."); AddInput("scores", - "a LoDTensor that has the same shape and LoD with `ids`"); + "(LoDTensor) The LodTensor containing the accumulated scores " + "corresponding to Input(ids) and its shape is the same as the " + "shape of Input(ids)."); AddOutput("selected_ids", - "a LoDTensor that stores the IDs selected by beam search"); - AddOutput( - "selected_scores", - "a LoDTensor that has the same shape and LoD with `selected_ids`"); + "A LodTensor that stores the IDs selected by beam search."); + AddOutput("selected_scores", + "A LoDTensor containing the accumulated scores corresponding to " + "Output(selected_ids)."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); @@ -235,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("end_id", "the token id which indicates the end of a sequence"); - AddComment( - "This is a beam search operator that help to generate sequences."); + AddComment(R"DOC( +This operator does the search in beams for one time step. +Specifically, it selects the top-K candidate word ids of current step from +Input(ids) according to their Input(scores) for all source sentences, +where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results +from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores) +are the output of beam_search at previous step, they are needed for special use +to handle ended candidate translations. The paths linking prefixes and selected +candidates are organized and reserved in lod. + +Note that the Input(scores) passed in should be accumulated scores, and +length penalty should be done with extra operators before calculating the +accumulated scores if needed, also suggest finding top-K before it and +using the top-K candidates following. +)DOC"); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c753caa7e9..ddf502f08a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1687,6 +1687,40 @@ def layer_norm(input, def beam_search_decode(ids, scores, beam_size, end_id, name=None): + """ + Beam Search Decode Layer. This layer constructs the full hypotheses for + each source sentence by walking back along the LoDTensorArray :attr:`ids` + whose lods can be used to restore the path in the beam search tree. + + Please see the following demo for a fully beam search usage example: + + fluid/tests/book/test_machine_translation.py + + Args: + ids(Variable): The LodTensorArray variable containing the selected ids + of all steps. + scores(Variable): The LodTensorArray variable containing the selected + scores of all steps. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the generated id sequences \ + and the corresponding scores. The shapes and lods of the two \ + LodTensor are same. The lod level is 2 and the two levels \ + separately indicate how many hypotheses each source sentence has \ + and how many ids each hypothesis has. + + Examples: + .. 
code-block:: python + + # Suppose `ids` and `scores` are LodTensorArray variables reserving + # the selected ids and scores of all steps + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=5, end_id=0) + """ helper = LayerHelper('beam_search_decode', **locals()) sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) @@ -1928,10 +1962,83 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp -def beam_search(pre_ids, pre_scores, ids, scores, beam_size, end_id, level=0): - ''' - This function implements the beam search algorithm. - ''' +def beam_search(pre_ids, + pre_scores, + ids, + scores, + beam_size, + end_id, + level=0, + name=None): + """ + Beam Search Layer. This layer does the search in beams for one time step. + Specifically, it selects the top-K candidate word ids of current step from + :attr:`ids` according to their :attr:`scores` for all source sentences, + where K is :attr:`beam_size` and :attr:`ids, scores` are predicted results + from the computation cell. Additionally, :attr:`pre_ids` and + :attr:`pre_scores` are the output of beam_search at previous step, they are + needed for special use to handle ended candidate translations. + + Note that the :attr:`scores` passed in should be accumulated scores, and + length penalty should be done with extra operators before calculating the + accumulated scores if needed, also suggest finding top-K before it and + using the top-K candidates following. + + Please see the following demo for a fully beam search usage example: + + fluid/tests/book/test_machine_translation.py + + Args: + pre_ids(Variable): The LodTensor variable which is the output of + beam_search at previous step. It should be a LodTensor with shape + :math:`(batch_size, 1)` and lod + :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the + first step. + pre_scores(Variable): The LodTensor variable which is the output of + beam_search at previous step. + ids(Variable): The LodTensor variable containing the candidates ids. + Its shape should be :math:`(batch_size \\times beam_size, K)`, + where :math:`K` supposed to be :attr:`beam_size`. + scores(Variable): The LodTensor variable containing the accumulated + scores corresponding to :attr:`ids` and its shape is the same as + the shape of :attr:`ids`. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + level(int, default 0): It can be ignored and mustn't change currently. + It means the source level of lod, which is explained as following. + The lod level of :attr:`ids` should be 2. The first level is source + level which describes how many prefixes (branchs) for each source + sentece (beam), and the second level is sentence level which + describes how these candidates belong to the prefix. The paths + linking prefixes and selected candidates are organized and reserved + in lod. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the selected ids and the \ + corresponding scores. + + Examples: + .. code-block:: python + + # Suppose `probs` contains predicted results from the computation + # cell and `pre_ids` and `pre_scores` is the output of beam_search + # at previous step. 
+ topk_scores, topk_indices = layers.topk(probs, k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores)), + y=layers.reshape( + pre_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=end_id) + """ helper = LayerHelper('beam_search', **locals()) score_type = scores.dtype id_type = ids.dtype diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index e8a75f473f..c4b6519a20 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=50) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -140,7 +150,7 @@ def decoder_decode(context, is_sparse): pd.less_than(x=counter, y=array_len, cond=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores From 6e38cc337dcc8303bc20f35445a25b10e2ab729e Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 13 Jun 2018 14:10:06 +0800 Subject: [PATCH 11/68] Fix the beam_search in test_machine_translation.py --- .../test_machine_translation.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index c4b37df3a0..ccb7a4f9ab 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -127,9 +127,19 @@ def decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -141,7 +151,7 @@ def decode(context, is_sparse): pd.less_than(x=counter, y=array_len, cond=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores From 
49ca424d6e965fc390e013fd5e3843c6136d184b Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 14 Jun 2018 01:19:20 +0800 Subject: [PATCH 12/68] Fix src_idx out of range in beam_search_op --- paddle/fluid/operators/beam_search_op.cc | 2 +- .../machine_translation/test_machine_translation.py | 6 +++++- python/paddle/fluid/tests/book/test_machine_translation.py | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 89e74e35d8..62771d09f1 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -87,7 +87,7 @@ void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, auto *pre_ids_data = pre_ids.data(); auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size(); ++src_idx) { + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { size_t src_prefix_start = high_level[src_idx]; size_t src_prefix_end = high_level[src_idx + 1]; bool finish_flag = true; diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index ccb7a4f9ab..f690a0d233 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -148,7 +148,11 @@ def decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index d8499fa3f7..44e4c62643 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -147,7 +147,11 @@ def decoder_decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. 
+ length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) From e0f883e66bcde3512b43dc907d04c917f8cd37bf Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 14 Jun 2018 05:45:01 +0000 Subject: [PATCH 13/68] commit --- benchmark/fluid/args.pyc | Bin 0 -> 3164 bytes benchmark/fluid/fluid_benchmark.py | 12 ++++++++++-- benchmark/fluid/recordio_converter.pyc | Bin 0 -> 6172 bytes 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 benchmark/fluid/args.pyc create mode 100644 benchmark/fluid/recordio_converter.pyc diff --git a/benchmark/fluid/args.pyc b/benchmark/fluid/args.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cac4bec4c1fc1776b3bf13a2cb913f12c81e8f4e GIT binary patch literal 3164 zcmbVOX>$}s8179-!V&HpLD~zW?ru0#P|lD9ur!bgQdHrGsm@IAZkxHPyC-oi_=JDP ze_{C({3n!OtcRsG@lvV@8IvQ;*#Lk5vdT* zu$U6Vj(FA)$+SpziDX73vm%)j!x`~xN+i3*Pap%-5sw%4fYK#&UZ&^khkv8|VEotejWzmQJE%oU6>lQR;lg zMo$4GCCfzg3Qt~$?_p&}yzGb5c=gpUO9QHGC&@bLgX!SKltKE*KXhwjsy09?djozygg*!rU0_8YisVA8XdzVekw`uk z$tSF$3t*~^X)gf!A+Q(#7enAu09+1%D*^Cn2z(X*pNGKJ0Js(cUj&|g$)4eQT_iWe z5Gg>wZ-ROf@LR2z8R3E9cN_4eZ0@`PFNN^k76#vPXv9ij#43xuH{nCs3h|;No>RNs zN9jLwCvm-(Rwj{Ik?2%z>O3B0%5J&7n*-MC@jj{_Ju)seOcf#?6(-h>*NGmeD)k%G zQA{<`Y3U@Mz8&dCs4t|g^4GHEj`5?un+H*^x2b$Qik$gT8{%@*?96I%!&1ho%dXpS6O{}|yD~3!WRa(1>8i3U zte3>tce`t$dK-!4;dy`k3E!93pK#bHI0?g&^4@neYJmmD6%vS(T^LeA zlnkxrdV#2RlY9e=j%H%?102;72dwpNZ<6r?wWTA<0heU$sw}e1z#LeD7*+_o)hglG z%K{yx5KYv1?GO2mfVj~8W=|?P&^yqnOOV&|pemtEN167cBH`0NEn<~&#!IK|wzj;R z$8mbu-Gpb$qoP1K@p_@ah3>3Hz%dfhm9e5pC0R^YHZsshB%tpOaKsRFSEj~C+jvQ# zMtC{`H)Qdy#I8}Ba6+0KwZ92jTTbqbblxay?3Ko~3Yz>YH_qdTMy|UOils2Oq!22Z zj;AY+siw7;9^YSSG<$}#;bj6qlG*bK&;Das*&gZu1{#T?p%{NfxRjO}MAM@cdKecE2 H_^!VI>TUvR literal 0 HcmV?d00001 diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 902dca209f..a3b81d8205 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -156,15 +156,23 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, start_time = time.time() num_samples = 0 + if arg.profile and pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif args.profile and pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile") + if args.use_reader_op: try: - loss = exe.run(train_prog, fetch_list=[avg_loss]) + loss = exe.run(train_prog, + fetch_list=[avg_loss], + use_program_cache=True) except fluid.core.EnforceNotMet as ex: break else: loss = exe.run(train_prog, feed=feeder.feed(data), - fetch_list=[avg_loss]) + fetch_list=[avg_loss], + use_program_cache=True) iters += 1 batch_id += 1 # FIXME(wuyi): For use_reader_op, if the current diff --git a/benchmark/fluid/recordio_converter.pyc b/benchmark/fluid/recordio_converter.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b603f41c6748bb65da06076d5f69e4754c341c0 GIT binary patch literal 6172 zcmcgwTXP&&5$@TGw5z-2$QNwvRf1u8K)%Ew5KK^FIoKo!c@$8x*bJkc+10FfXV-H^ z$vW)5L?L!w@H>BiAHWOG{DAxes{BYniWdr=sDkh7Gpn^!6rpUFB~9zhIcLu4?$iBs zpU(e%cIu1kU-@}mvY!I}ui`PEg9P|oAGcC1gae};jf<0y=C}?9=f}%F&B$&{~yabclSdgHk zjUy72wXrDSgtU%I>jh~oNiao@gK7CBFSX+m%;=mG?A)x*LZs;5_*t(~%(Mv7i6fJ0 zs~$D{B%MTGquu>5OpNwM9lsgYIkSXk4$oCQ=Jz02b6AvAq~$2h$sQKPt2Xv>^6)cp 
zQdP`AI2Pn&fHB7zPUyU%KyrQn0VXLt#D>6wJSOK3^kWqLm{6kFdy;QL@l*P3i7R9N za8h5((wdTiBLmEzl)XGz+`>8Dd~Z;M3Z}vLdITcr6v(!p)^|J;eH5l;yf(rx@Y+-e z^%MJ@(9Lry#Wy>CFVt4s-wxZ_3etO7Ulga`zN~${wqFl*AAN1KccUPs=6gxEnfM*A zx$7r^>UqQVq!S$VgP?8KSEiJ`H9z&=0#9M0bK3qr9t(L3Fp}~_u5SEvgFB}4cuB%~ zHwmJy_hAyH_~fQJQG`%GBv5uE^(aL5uqIOv2A}W0EUN$GClDfMohkJgyyz^Py3G?Uvf4X2Yb4NutDDM(g~B3O)L_9me&Yj-T8)uSz?= z&u1n=n5^{frHlA*B-5pRS&pOEoB)yBgmcz8;goYT4u4PN=AE*$TFlN8|t050pdwfEEsDiGE|Xq~J2v5+#i)B>SpzX{;{$++`9~5J%NjR6!Rn zsEXW|I8Mf&nQrJjy~OH9S-p5+1zpcWI}6w+1KdOUd4!QK(GShbeG%k&C?@0bWo&0? z+%%h0PMv4O^CW<(;k<(e{OJ~)2@wMRq)tXpQfGos&|XGp53AWLIGBkEEwnKiLx67d zV$sWy!G&p|ET`!{ne)Xhcmza>&4rv%`a}U^^4U}9P`tc zFI8!V7Rae=6Los+(1S(ArqNA0y4<_TZkT=Jr)d}~X!(ZtZrzds8=qeS9>7Sa>5{Ey z;h02`AZ+-%?bPe|yZ~7 zdY<64J9$a26XC3eKXxQmg+lG0Ino5hdD)<@K^cn!|A345_F;{u+27!~v^4Av^9)Io&kLz7Tv_$3@XoL3KeQ?9RlFwVDaiC)Qux25o4%SoFwoxCqHxU&T9Iy zS*5G5TVJOb8nQz~+%WOeEKOmEZg>5_)1>VXplUXviJeXUXhl)bSN*`68SLGo=(A?c zQOz%?3@?Zh-4F9Fjl+HR74xMkrpuGZ^cn8P5SV2k42*`-h;vfyf7Yoo?yE@3F*hYw z@rlbYR_SWJa0@G-9=&wtJq#WWBxJcCX;S z>d#_0KXqRt`5wrV>y)UXb^tL6c2T^_^%yL~nFp?%0WCwjXPmsEOaZ!HaE|9@b4!zT zS~?{u;ZYs`6OHGDH^7SLfj9U6E5s-WwdMr*^p`=5dl5QuFM&KQu(*VgIv&N8eO&n% z9sOTDjw_wRxPpi|Mij(NRUBadByThBuWIH2K#TQJa@e-__73V#dxmKcd?q9V>#RUdg?xV#Grmb-R(~3DY!n4zg?b*!6&hTG7`I)2|{-N9bK%~*OALb9?+!_$2TV-1h8v&HalD_jk4lm^wi zyGZ&H3sOV?O#y5zMIBXRHa1sfhkkv>x+-QqMWC+W>rp`?6Z&wG#`L+v3W$`nsNpQ4 z%)#H&0ISo^Q3UcuvcscFt=n?x$Y%ybT8B|;f>uP=K2nRCQ8++Em zH-NEf;tJ9V^*`n}+Q)BPVx0{_FR*o}G|@tzUMkf-qF8G+FL5G$LW86=*k@rLWN%cO z7|HTm^cfBaF5s+mEjy=B=UQ^AC(&oweTyv$hug8JfgZHS;cFPeQ9SU*qf7NuXI9>s(dnCSKKeuanjP<9jCr^fHAy+!xV7DE{{S4* Date: Wed, 20 Jun 2018 02:55:11 +0000 Subject: [PATCH 14/68] add python_data_feeding.md --- .../design/concepts/python_data_feeding.md | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 doc/fluid/design/concepts/python_data_feeding.md diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md new file mode 100644 index 0000000000..bf88b99013 --- /dev/null +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -0,0 +1,110 @@ +# Python Data Feeding + +In the former implementation of Paddle Fluid, there are two ways to feed data: + +- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor.Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. + +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor.Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. + +In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `PyArrayFeedQueue` is designed to be shared by the Python and C++ side, while Python array is pushed into the queue and `reader_op` in C++ side reads out the data from the queue. 
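Before the class sketches in the next sections, the producer and consumer coupling described above can be modeled in a few lines of plain Python: one thread pushes NumPy arrays while another pops them, blocking when the queue is full or empty. Here `queue.Queue` is only a stand-in for the proposed `PyArrayFeedQueue`, and the batch shapes and thread layout are illustrative assumptions rather than part of the design; the real data transfer goes through `pybind11` and `LoDTensor` as described below.

```Python
import threading
import queue
import numpy as np

feed_queue = queue.Queue(maxsize=4)  # stand-in for PyArrayFeedQueue(capacity=4, ...)

def feeding_thread(num_batches):
    for _ in range(num_batches):
        image = np.random.rand(32, 784).astype("float32")
        label = np.random.randint(0, 10, size=(32, 1)).astype("int64")
        feed_queue.put((image, label))  # push(): blocks while the queue is full

def consumer(num_batches):
    for _ in range(num_batches):
        image, label = feed_queue.get()  # pop(): blocks while the queue is empty
        # In the real design this is where reader_op inside Executor::Run()
        # reads the next mini-batch.
        print(image.shape, label.shape)

t = threading.Thread(target=feeding_thread, args=(8,))
t.start()
consumer(8)
t.join()
```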
+ +## Design of PyArrayFeedQueue +`PyArrayFeedQueue` is a blocking queue with a fixed `capacity` and accepts Python array with shapes indicated by `dims`. +```C++ +class PyArrayFeedQueueHolder; + +class PyArrayFeedQueue { + friend class PyArrayFeedQueueHolder; + private: + PyArrayFeedQueue(size_t capacity, const std::vector& dims, const Place& place); + public: + size_t size() const; // Get the current size of the queue + size_t capacity() const; // Get the capacity of the queue + bool is_full() const; + bool is_empty() const; + + // Convert Python array tuple to std::vector and store it. + // Block if is_full() == true + // Use pybind11::gil_scoped_release to release GIL of Python + void push(const pybind11::tuple& array_tuple); + + // Block if is_empty() == true + // Use pybind11::gil_scoped_release to release GIL of Python + std::vector pop(); + private: + BlockingQueue> queue_; +}; + +class PyArrayFeedQueueHolder { + public: + PyArrayFeedQueueHolder() {} + + // Calls the constructor of PyArrayFeedQueue to create feeder_ + // For each instance of PyArrayFeedQueueHolder, this function can only called once + void init_once(size_t capacity, const std::vector& dims, const Place& place); + + PyArrayFeedQueue& feeder(); // Get feeder_ + const PyArrayFeederQueue& feeder() const; // Get feeder_ + private: + std::unique_ptr feeder_; +}; +``` + +There are some major things that must be concerned: +- `PyArrayFeedQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. Since `PyArrayFeedQueue` does not have a default constructor, it cannot be constructed by `Scope::Var()::GetMutable()`. To solve this problem, `PyArrayFeedQueueHolder` is designed to defer construction of `PyArrayFeedQueue`. +- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor.Run()` so that `Executor.Run()` can get the feeding data when it is called. +- `Create_reader_op` should accept the name or address of `PyArrayFeedQueueHolder` as an input or attribute. + + +## Design of PyArrayReader +`PyArrayReader` is a reader which holds a `PyArrayFeedQueue` object. Notice that `ReInit()` function is not supported because the capacity of the `PyArrayFeedQueue` object is limited. +```C++ +class PyArrayReader : public ReaderBase { + public: + explicit PyArrayReader(PyArrayFeedQueue* queue); + + void ReadNext(std::vector* out) override; + + void ReInit() override { + PADDLE_THROW("PyArrayReader does not support ReInit()"); + } + private: + PyArrayFeedQueue* queue_; +}; +``` + +## Design of CreatePyArrayReaderOp +`CreatePyArrayReaderOp` is used to create `PyArrayReader` object. It requires an attribute of `feeder_name` which indicates the name of the `PyArrayFeedQueueHolder` variable. +```C++ +class CreatePyArrayReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const std::string& feeder_name = Attr("feeder_name"); + auto* feeder_holder_var = scope.FindVar(feeder_name); + PADDLE_ENFORCE(feed_holder_var != nullptr); + auto* feeder_holder = feeder_holder_var + ->template GetMutable(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new PyArrayReader(feeder_holder->feeder()); + } +}; +``` + +## Design of Python codes +The design of Python codes are as follows. 
First, we construct a variable of `PyArrayFeedQueueHolder` and init it with given parameters, returning the `PyArrayFeedQueue` object after initialization. After that, a layer of `CreatePyArrayReaderOp` is constructed and accepts the name of the `PyArrayFeedQueueHolder` variable. The `PyArrayFeedQueue` object and result of the layer are both returned. +```Python +def py_array_reader(capacity, shapes, place): + feeder_name = unique_name.generate("py_array_feed_queue") + var = global_scope().var(feeder_name) # create PyArrayFeedQueueHolder Variable + feed_queue = core.init_py_array_feed_queue(var, capacity, shapes, place) # init PyArrayFeedQueue + out = create_var() + create_reader_op_with_feeder_name( + type='create_py_array_reader', + outputs={'Out':[out]}, + attrs = {'feeder_name': feeder_name}) + return out, feed_queue +``` From 882a9327aee54da09a6e0265ed8387eed48c63ea Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 20 Jun 2018 03:03:11 +0000 Subject: [PATCH 15/68] Revert "commit" This reverts commit e0f883e66bcde3512b43dc907d04c917f8cd37bf. --- benchmark/fluid/args.pyc | Bin 3164 -> 0 bytes benchmark/fluid/fluid_benchmark.py | 12 ++---------- benchmark/fluid/recordio_converter.pyc | Bin 6172 -> 0 bytes 3 files changed, 2 insertions(+), 10 deletions(-) delete mode 100644 benchmark/fluid/args.pyc delete mode 100644 benchmark/fluid/recordio_converter.pyc diff --git a/benchmark/fluid/args.pyc b/benchmark/fluid/args.pyc deleted file mode 100644 index cac4bec4c1fc1776b3bf13a2cb913f12c81e8f4e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3164 zcmbVOX>$}s8179-!V&HpLD~zW?ru0#P|lD9ur!bgQdHrGsm@IAZkxHPyC-oi_=JDP ze_{C({3n!OtcRsG@lvV@8IvQ;*#Lk5vdT* zu$U6Vj(FA)$+SpziDX73vm%)j!x`~xN+i3*Pap%-5sw%4fYK#&UZ&^khkv8|VEotejWzmQJE%oU6>lQR;lg zMo$4GCCfzg3Qt~$?_p&}yzGb5c=gpUO9QHGC&@bLgX!SKltKE*KXhwjsy09?djozygg*!rU0_8YisVA8XdzVekw`uk z$tSF$3t*~^X)gf!A+Q(#7enAu09+1%D*^Cn2z(X*pNGKJ0Js(cUj&|g$)4eQT_iWe z5Gg>wZ-ROf@LR2z8R3E9cN_4eZ0@`PFNN^k76#vPXv9ij#43xuH{nCs3h|;No>RNs zN9jLwCvm-(Rwj{Ik?2%z>O3B0%5J&7n*-MC@jj{_Ju)seOcf#?6(-h>*NGmeD)k%G zQA{<`Y3U@Mz8&dCs4t|g^4GHEj`5?un+H*^x2b$Qik$gT8{%@*?96I%!&1ho%dXpS6O{}|yD~3!WRa(1>8i3U zte3>tce`t$dK-!4;dy`k3E!93pK#bHI0?g&^4@neYJmmD6%vS(T^LeA zlnkxrdV#2RlY9e=j%H%?102;72dwpNZ<6r?wWTA<0heU$sw}e1z#LeD7*+_o)hglG z%K{yx5KYv1?GO2mfVj~8W=|?P&^yqnOOV&|pemtEN167cBH`0NEn<~&#!IK|wzj;R z$8mbu-Gpb$qoP1K@p_@ah3>3Hz%dfhm9e5pC0R^YHZsshB%tpOaKsRFSEj~C+jvQ# zMtC{`H)Qdy#I8}Ba6+0KwZ92jTTbqbblxay?3Ko~3Yz>YH_qdTMy|UOils2Oq!22Z zj;AY+siw7;9^YSSG<$}#;bj6qlG*bK&;Das*&gZu1{#T?p%{NfxRjO}MAM@cdKecE2 H_^!VI>TUvR diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 12edcde148..aa70783ecd 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -156,23 +156,15 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, start_time = time.time() num_samples = 0 - if arg.profile and pass_id == 0 and batch_id == 5: - profiler.start_profiler("All") - elif args.profile and pass_id == 0 and batch_id == 10: - profiler.stop_profiler("total", "/tmp/profile") - if args.use_reader_op: try: - loss = exe.run(train_prog, - fetch_list=[avg_loss], - use_program_cache=True) + loss = exe.run(train_prog, fetch_list=[avg_loss]) except fluid.core.EnforceNotMet as ex: break else: loss = exe.run(train_prog, feed=feeder.feed(data), - fetch_list=[avg_loss], - use_program_cache=True) + fetch_list=[avg_loss]) iters += 1 batch_id += 1 # FIXME(wuyi): For use_reader_op, if the current diff --git 
a/benchmark/fluid/recordio_converter.pyc b/benchmark/fluid/recordio_converter.pyc deleted file mode 100644 index 9b603f41c6748bb65da06076d5f69e4754c341c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6172 zcmcgwTXP&&5$@TGw5z-2$QNwvRf1u8K)%Ew5KK^FIoKo!c@$8x*bJkc+10FfXV-H^ z$vW)5L?L!w@H>BiAHWOG{DAxes{BYniWdr=sDkh7Gpn^!6rpUFB~9zhIcLu4?$iBs zpU(e%cIu1kU-@}mvY!I}ui`PEg9P|oAGcC1gae};jf<0y=C}?9=f}%F&B$&{~yabclSdgHk zjUy72wXrDSgtU%I>jh~oNiao@gK7CBFSX+m%;=mG?A)x*LZs;5_*t(~%(Mv7i6fJ0 zs~$D{B%MTGquu>5OpNwM9lsgYIkSXk4$oCQ=Jz02b6AvAq~$2h$sQKPt2Xv>^6)cp zQdP`AI2Pn&fHB7zPUyU%KyrQn0VXLt#D>6wJSOK3^kWqLm{6kFdy;QL@l*P3i7R9N za8h5((wdTiBLmEzl)XGz+`>8Dd~Z;M3Z}vLdITcr6v(!p)^|J;eH5l;yf(rx@Y+-e z^%MJ@(9Lry#Wy>CFVt4s-wxZ_3etO7Ulga`zN~${wqFl*AAN1KccUPs=6gxEnfM*A zx$7r^>UqQVq!S$VgP?8KSEiJ`H9z&=0#9M0bK3qr9t(L3Fp}~_u5SEvgFB}4cuB%~ zHwmJy_hAyH_~fQJQG`%GBv5uE^(aL5uqIOv2A}W0EUN$GClDfMohkJgyyz^Py3G?Uvf4X2Yb4NutDDM(g~B3O)L_9me&Yj-T8)uSz?= z&u1n=n5^{frHlA*B-5pRS&pOEoB)yBgmcz8;goYT4u4PN=AE*$TFlN8|t050pdwfEEsDiGE|Xq~J2v5+#i)B>SpzX{;{$++`9~5J%NjR6!Rn zsEXW|I8Mf&nQrJjy~OH9S-p5+1zpcWI}6w+1KdOUd4!QK(GShbeG%k&C?@0bWo&0? z+%%h0PMv4O^CW<(;k<(e{OJ~)2@wMRq)tXpQfGos&|XGp53AWLIGBkEEwnKiLx67d zV$sWy!G&p|ET`!{ne)Xhcmza>&4rv%`a}U^^4U}9P`tc zFI8!V7Rae=6Los+(1S(ArqNA0y4<_TZkT=Jr)d}~X!(ZtZrzds8=qeS9>7Sa>5{Ey z;h02`AZ+-%?bPe|yZ~7 zdY<64J9$a26XC3eKXxQmg+lG0Ino5hdD)<@K^cn!|A345_F;{u+27!~v^4Av^9)Io&kLz7Tv_$3@XoL3KeQ?9RlFwVDaiC)Qux25o4%SoFwoxCqHxU&T9Iy zS*5G5TVJOb8nQz~+%WOeEKOmEZg>5_)1>VXplUXviJeXUXhl)bSN*`68SLGo=(A?c zQOz%?3@?Zh-4F9Fjl+HR74xMkrpuGZ^cn8P5SV2k42*`-h;vfyf7Yoo?yE@3F*hYw z@rlbYR_SWJa0@G-9=&wtJq#WWBxJcCX;S z>d#_0KXqRt`5wrV>y)UXb^tL6c2T^_^%yL~nFp?%0WCwjXPmsEOaZ!HaE|9@b4!zT zS~?{u;ZYs`6OHGDH^7SLfj9U6E5s-WwdMr*^p`=5dl5QuFM&KQu(*VgIv&N8eO&n% z9sOTDjw_wRxPpi|Mij(NRUBadByThBuWIH2K#TQJa@e-__73V#dxmKcd?q9V>#RUdg?xV#Grmb-R(~3DY!n4zg?b*!6&hTG7`I)2|{-N9bK%~*OALb9?+!_$2TV-1h8v&HalD_jk4lm^wi zyGZ&H3sOV?O#y5zMIBXRHa1sfhkkv>x+-QqMWC+W>rp`?6Z&wG#`L+v3W$`nsNpQ4 z%)#H&0ISo^Q3UcuvcscFt=n?x$Y%ybT8B|;f>uP=K2nRCQ8++Em zH-NEf;tJ9V^*`n}+Q)BPVx0{_FR*o}G|@tzUMkf-qF8G+FL5G$LW86=*k@rLWN%cO z7|HTm^cfBaF5s+mEjy=B=UQ^AC(&oweTyv$hug8JfgZHS;cFPeQ9SU*qf7NuXI9>s(dnCSKKeuanjP<9jCr^fHAy+!xV7DE{{S4* Date: Thu, 21 Jun 2018 10:05:53 +0800 Subject: [PATCH 16/68] Update python_data_feeding.md --- .../design/concepts/python_data_feeding.md | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md index bf88b99013..e05e2aedea 100644 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -2,9 +2,9 @@ In the former implementation of Paddle Fluid, there are two ways to feed data: -- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor.Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. +- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. 
For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. -- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor.Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `PyArrayFeedQueue` is designed to be shared by the Python and C++ side, while Python array is pushed into the queue and `reader_op` in C++ side reads out the data from the queue. @@ -16,8 +16,16 @@ class PyArrayFeedQueueHolder; class PyArrayFeedQueue { friend class PyArrayFeedQueueHolder; private: - PyArrayFeedQueue(size_t capacity, const std::vector& dims, const Place& place); + // PyArrayFeedQueue can only be constructed by PyArrayFeedQueueHolder + PyArrayFeedQueue(size_t capacity, const std::vector& dims, const platform::Place& place); + public: + // Not copyable and not moveable + PyArrayFeedQueue(const PyArrayFeedQueue&) = delete; + PyArrayFeedQueue(PyArrayFeedQueue&&) = delete; + PyArrayFeedQueue& operator = (const PyArrayFeedQueue&) = delete; + PyArrayFeedQueue& operator = (PyArrayFeedQueue&&) = delete; + size_t size() const; // Get the current size of the queue size_t capacity() const; // Get the capacity of the queue bool is_full() const; @@ -32,7 +40,12 @@ class PyArrayFeedQueue { // Use pybind11::gil_scoped_release to release GIL of Python std::vector pop(); private: - BlockingQueue> queue_; + // CircularQueue is a class like `boost::circular_buffer` + framework::CircularQueue> queue_; + std::vector dims_; + platform::Place place_; + mutable std::mutex mutex_; + mutable std::condition_variable cv_; }; class PyArrayFeedQueueHolder { @@ -40,19 +53,19 @@ class PyArrayFeedQueueHolder { PyArrayFeedQueueHolder() {} // Calls the constructor of PyArrayFeedQueue to create feeder_ - // For each instance of PyArrayFeedQueueHolder, this function can only called once + // `init_once` can only called once, otherwise an exception would raise void init_once(size_t capacity, const std::vector& dims, const Place& place); - PyArrayFeedQueue& feeder(); // Get feeder_ - const PyArrayFeederQueue& feeder() const; // Get feeder_ + PyArrayFeedQueue* feeder(); // feeder_.get() + const PyArrayFeederQueue* feeder() const; // feeder_.get() private: - std::unique_ptr feeder_; + std::shared_ptr feeder_; }; ``` There are some major things that must be concerned: - `PyArrayFeedQueueHolder` should be a `Variable` in global scope, so 
that `reader_op` can find it when reading data. Since `PyArrayFeedQueue` does not have a default constructor, it cannot be constructed by `Scope::Var()::GetMutable()`. To solve this problem, `PyArrayFeedQueueHolder` is designed to defer construction of `PyArrayFeedQueue`. -- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor.Run()` so that `Executor.Run()` can get the feeding data when it is called. +- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. - `Create_reader_op` should accept the name or address of `PyArrayFeedQueueHolder` as an input or attribute. @@ -61,15 +74,18 @@ There are some major things that must be concerned: ```C++ class PyArrayReader : public ReaderBase { public: - explicit PyArrayReader(PyArrayFeedQueue* queue); + explicit PyArrayReader(const std::shared_ptr& queue); void ReadNext(std::vector* out) override; void ReInit() override { PADDLE_THROW("PyArrayReader does not support ReInit()"); } + + PyArrayFeedQueue* feeder(); + const PyArrayFeederQueue* feeder() const; private: - PyArrayFeedQueue* queue_; + std::shared_ptr queue_; }; ``` From 56087ab5267f143292504223efb277fd9b788890 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 21 Jun 2018 02:12:48 +0000 Subject: [PATCH 17/68] update python_data_feeding.md --- .../design/concepts/python_data_feeding.md | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md index bf88b99013..0905950413 100644 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -2,9 +2,9 @@ In the former implementation of Paddle Fluid, there are two ways to feed data: -- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor.Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. +- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. -- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor.Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. 
Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `PyArrayFeedQueue` is designed to be shared by the Python and C++ side, while Python array is pushed into the queue and `reader_op` in C++ side reads out the data from the queue. @@ -16,7 +16,10 @@ class PyArrayFeedQueueHolder; class PyArrayFeedQueue { friend class PyArrayFeedQueueHolder; private: - PyArrayFeedQueue(size_t capacity, const std::vector& dims, const Place& place); + // PyArrayFeedQueue can only be constructed by PyArrayFeedQueueHolder + PyArrayFeedQueue(size_t capacity, const std::vector& dims, + const platform::Place& place); + public: size_t size() const; // Get the current size of the queue size_t capacity() const; // Get the capacity of the queue @@ -32,7 +35,12 @@ class PyArrayFeedQueue { // Use pybind11::gil_scoped_release to release GIL of Python std::vector pop(); private: - BlockingQueue> queue_; + // CircularQueue is a class like `boost::circular_buffer` + framework::CircularQueue> queue_; + std::vector dims_; + platform::Place place_; + mutable std::mutex mutex_; + mutable std::condition_variable cv_; }; class PyArrayFeedQueueHolder { @@ -40,19 +48,19 @@ class PyArrayFeedQueueHolder { PyArrayFeedQueueHolder() {} // Calls the constructor of PyArrayFeedQueue to create feeder_ - // For each instance of PyArrayFeedQueueHolder, this function can only called once + // `init_once` can only called once, otherwise an exception would raise void init_once(size_t capacity, const std::vector& dims, const Place& place); - PyArrayFeedQueue& feeder(); // Get feeder_ - const PyArrayFeederQueue& feeder() const; // Get feeder_ + PyArrayFeedQueue* feeder(); // feeder_.get() + const PyArrayFeederQueue* feeder() const; // feeder_.get() private: - std::unique_ptr feeder_; + std::shared_ptr feeder_; }; ``` There are some major things that must be concerned: - `PyArrayFeedQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. Since `PyArrayFeedQueue` does not have a default constructor, it cannot be constructed by `Scope::Var()::GetMutable()`. To solve this problem, `PyArrayFeedQueueHolder` is designed to defer construction of `PyArrayFeedQueue`. -- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor.Run()` so that `Executor.Run()` can get the feeding data when it is called. +- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. - `Create_reader_op` should accept the name or address of `PyArrayFeedQueueHolder` as an input or attribute. 
@@ -61,15 +69,16 @@ There are some major things that must be concerned: ```C++ class PyArrayReader : public ReaderBase { public: - explicit PyArrayReader(PyArrayFeedQueue* queue); + explicit PyArrayReader(const std::shared_ptr& queue); void ReadNext(std::vector* out) override; void ReInit() override { PADDLE_THROW("PyArrayReader does not support ReInit()"); } + private: - PyArrayFeedQueue* queue_; + std::shared_ptr queue_; }; ``` From 1d0a5d47035ce4573092b5cd5a35fb22f79cf636 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 21 Jun 2018 10:30:39 +0800 Subject: [PATCH 18/68] Delete python_data_feeding.md --- .../design/concepts/python_data_feeding.md | 126 ------------------ 1 file changed, 126 deletions(-) delete mode 100644 doc/fluid/design/concepts/python_data_feeding.md diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md deleted file mode 100644 index e05e2aedea..0000000000 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ /dev/null @@ -1,126 +0,0 @@ -# Python Data Feeding - -In the former implementation of Paddle Fluid, there are two ways to feed data: - -- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. - -- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. - -In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `PyArrayFeedQueue` is designed to be shared by the Python and C++ side, while Python array is pushed into the queue and `reader_op` in C++ side reads out the data from the queue. - -## Design of PyArrayFeedQueue -`PyArrayFeedQueue` is a blocking queue with a fixed `capacity` and accepts Python array with shapes indicated by `dims`. -```C++ -class PyArrayFeedQueueHolder; - -class PyArrayFeedQueue { - friend class PyArrayFeedQueueHolder; - private: - // PyArrayFeedQueue can only be constructed by PyArrayFeedQueueHolder - PyArrayFeedQueue(size_t capacity, const std::vector& dims, const platform::Place& place); - - public: - // Not copyable and not moveable - PyArrayFeedQueue(const PyArrayFeedQueue&) = delete; - PyArrayFeedQueue(PyArrayFeedQueue&&) = delete; - PyArrayFeedQueue& operator = (const PyArrayFeedQueue&) = delete; - PyArrayFeedQueue& operator = (PyArrayFeedQueue&&) = delete; - - size_t size() const; // Get the current size of the queue - size_t capacity() const; // Get the capacity of the queue - bool is_full() const; - bool is_empty() const; - - // Convert Python array tuple to std::vector and store it. 
- // Block if is_full() == true - // Use pybind11::gil_scoped_release to release GIL of Python - void push(const pybind11::tuple& array_tuple); - - // Block if is_empty() == true - // Use pybind11::gil_scoped_release to release GIL of Python - std::vector pop(); - private: - // CircularQueue is a class like `boost::circular_buffer` - framework::CircularQueue> queue_; - std::vector dims_; - platform::Place place_; - mutable std::mutex mutex_; - mutable std::condition_variable cv_; -}; - -class PyArrayFeedQueueHolder { - public: - PyArrayFeedQueueHolder() {} - - // Calls the constructor of PyArrayFeedQueue to create feeder_ - // `init_once` can only called once, otherwise an exception would raise - void init_once(size_t capacity, const std::vector& dims, const Place& place); - - PyArrayFeedQueue* feeder(); // feeder_.get() - const PyArrayFeederQueue* feeder() const; // feeder_.get() - private: - std::shared_ptr feeder_; -}; -``` - -There are some major things that must be concerned: -- `PyArrayFeedQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. Since `PyArrayFeedQueue` does not have a default constructor, it cannot be constructed by `Scope::Var()::GetMutable()`. To solve this problem, `PyArrayFeedQueueHolder` is designed to defer construction of `PyArrayFeedQueue`. -- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. -- `Create_reader_op` should accept the name or address of `PyArrayFeedQueueHolder` as an input or attribute. - - -## Design of PyArrayReader -`PyArrayReader` is a reader which holds a `PyArrayFeedQueue` object. Notice that `ReInit()` function is not supported because the capacity of the `PyArrayFeedQueue` object is limited. -```C++ -class PyArrayReader : public ReaderBase { - public: - explicit PyArrayReader(const std::shared_ptr& queue); - - void ReadNext(std::vector* out) override; - - void ReInit() override { - PADDLE_THROW("PyArrayReader does not support ReInit()"); - } - - PyArrayFeedQueue* feeder(); - const PyArrayFeederQueue* feeder() const; - private: - std::shared_ptr queue_; -}; -``` - -## Design of CreatePyArrayReaderOp -`CreatePyArrayReaderOp` is used to create `PyArrayReader` object. It requires an attribute of `feeder_name` which indicates the name of the `PyArrayFeedQueueHolder` variable. -```C++ -class CreatePyArrayReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& feeder_name = Attr("feeder_name"); - auto* feeder_holder_var = scope.FindVar(feeder_name); - PADDLE_ENFORCE(feed_holder_var != nullptr); - auto* feeder_holder = feeder_holder_var - ->template GetMutable(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new PyArrayReader(feeder_holder->feeder()); - } -}; -``` - -## Design of Python codes -The design of Python codes are as follows. First, we construct a variable of `PyArrayFeedQueueHolder` and init it with given parameters, returning the `PyArrayFeedQueue` object after initialization. After that, a layer of `CreatePyArrayReaderOp` is constructed and accepts the name of the `PyArrayFeedQueueHolder` variable. The `PyArrayFeedQueue` object and result of the layer are both returned. 
-```Python -def py_array_reader(capacity, shapes, place): - feeder_name = unique_name.generate("py_array_feed_queue") - var = global_scope().var(feeder_name) # create PyArrayFeedQueueHolder Variable - feed_queue = core.init_py_array_feed_queue(var, capacity, shapes, place) # init PyArrayFeedQueue - out = create_var() - create_reader_op_with_feeder_name( - type='create_py_array_reader', - outputs={'Out':[out]}, - attrs = {'feeder_name': feeder_name}) - return out, feed_queue -``` From 9f6aa4c898a06f525ddbbc88079a8795a39db606 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 21 Jun 2018 02:35:01 +0000 Subject: [PATCH 19/68] update python_data_feeding.md --- doc/fluid/design/concepts/python_data_feeding.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md index 0905950413..a41d1ad0ca 100644 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -51,8 +51,8 @@ class PyArrayFeedQueueHolder { // `init_once` can only called once, otherwise an exception would raise void init_once(size_t capacity, const std::vector& dims, const Place& place); - PyArrayFeedQueue* feeder(); // feeder_.get() - const PyArrayFeederQueue* feeder() const; // feeder_.get() + PyArrayFeedQueue* feeder() { return feeder_.get(); } + const PyArrayFeederQueue* feeder() const { return feeder_.get(); } private: std::shared_ptr feeder_; }; From 6f74583436a7f2e18114451cabc2606bd1954a57 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 21 Jun 2018 02:55:11 +0000 Subject: [PATCH 20/68] update python_data_feeding.md --- doc/fluid/design/concepts/python_data_feeding.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md index a41d1ad0ca..7966fc27c0 100644 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -51,8 +51,8 @@ class PyArrayFeedQueueHolder { // `init_once` can only called once, otherwise an exception would raise void init_once(size_t capacity, const std::vector& dims, const Place& place); - PyArrayFeedQueue* feeder() { return feeder_.get(); } - const PyArrayFeederQueue* feeder() const { return feeder_.get(); } + std::shared_ptr feeder() { return feeder_; } + const std::shared_ptr& feeder() const { return feeder_; } private: std::shared_ptr feeder_; }; From 697ba4b13d25adc485480ff61536c82c285af193 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 01:40:46 +0000 Subject: [PATCH 21/68] Add Python array reader op --- benchmark/fluid/args.py | 10 + benchmark/fluid/fluid_benchmark.py | 86 ++++++-- benchmark/fluid/models/machine_translation.py | 2 +- benchmark/fluid/models/mnist.py | 29 ++- benchmark/fluid/models/resnet.py | 20 +- .../fluid/models/stacked_dynamic_lstm.py | 2 +- benchmark/fluid/models/vgg.py | 29 ++- paddle/fluid/operators/reader/CMakeLists.txt | 1 + .../reader/create_py_array_reader_op.cc | 80 +++++++ .../operators/reader/py_array_feed_queue.h | 207 ++++++++++++++++++ .../operators/reader/py_blocking_queue.h | 125 +++++++++++ paddle/fluid/pybind/pybind.cc | 46 +++- paddle/fluid/pybind/tensor_py.h | 6 +- python/paddle/fluid/layers/io.py | 58 ++++- 14 files changed, 664 insertions(+), 37 deletions(-) create mode 100644 paddle/fluid/operators/reader/create_py_array_reader_op.cc create mode 100644 paddle/fluid/operators/reader/py_array_feed_queue.h create 
mode 100644 paddle/fluid/operators/reader/py_blocking_queue.h diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 68a3d42d7a..dcd4ee2324 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -122,5 +122,15 @@ def parse_args(): type=str, default="", help='Directory that contains all the training recordio files.') + parser.add_argument( + '--use_py_reader_op', + action='store_true', + help='Whether to use Python reader op, omitted when use_reader_op is true' + ) + parser.add_argument( + '--feed_queue_capacity', + type=int, + default=64, + help='Capacity of feed queue when use_py_reader_op is true') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index ece1102dce..b5acb6549f 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -25,6 +25,9 @@ import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler from args import * +import threading + +feed_queue = None def append_nccl2_prepare(trainer_id): @@ -131,7 +134,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, exe = fluid.Executor(place) exe.run(startup_prog) - if not args.use_reader_op: + if not args.use_reader_op and not args.use_py_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data @@ -141,12 +144,12 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - if not args.use_reader_op: + if not args.use_reader_op and not args.use_py_reader_op: reader_generator = train_reader() batch_id = 0 data = None while True: - if not args.use_reader_op: + if not args.use_reader_op and not args.use_py_reader_op: data = next(reader_generator, None) if data == None: break @@ -156,7 +159,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, start_time = time.time() num_samples = 0 - if args.use_reader_op: + if args.use_reader_op or args.use_py_reader_op: try: loss = exe.run(train_prog, fetch_list=[avg_loss]) except fluid.core.EnforceNotMet as ex: @@ -170,7 +173,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, # FIXME(wuyi): For use_reader_op, if the current # pass is not the last, the last batch of this pass # is also equal to args.batch_size. - if args.use_reader_op: + if args.use_reader_op or args.use_py_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) @@ -180,12 +183,13 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, print_train_time(start_time, time.time(), num_samples) print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), # evaluation - if not args.no_test and batch_acc and not args.use_reader_op: + if not args.no_test and batch_acc and not args.use_reader_op and not args.use_py_reader_op: pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc) print(", Test Accuracy: %f" % pass_test_acc) print("\n") # TODO(wuyi): add warmup passes to get better perf data. 
+ close_feed_queue() exit(0) @@ -195,7 +199,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - if not args.use_reader_op: + if not args.use_reader_op and not args.use_py_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data @@ -238,12 +242,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 iters = 0 start_time = time.time() - if not args.use_reader_op: + if not args.use_reader_op and not args.use_py_reader_op: reader_generator = train_reader() batch_id = 0 data = None while True: - if not args.use_reader_op: + if not args.use_reader_op and not args.use_py_reader_op: data = next(reader_generator, None) if data == None: break @@ -257,14 +261,14 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if args.use_fake_data or args.use_reader_op: + if args.use_fake_data or args.use_reader_op or args.use_py_reader_op: try: loss, = exe.run([avg_loss.name]) except fluid.core.EnforceNotMet as ex: break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) - if args.use_reader_op: + if args.use_reader_op or args.use_py_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) @@ -275,7 +279,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_id += 1 print_train_time(start_time, time.time(), num_samples) - if not args.no_test and batch_acc and not args.use_reader_op: + if not args.no_test and batch_acc and not args.use_reader_op and not args.use_py_reader_op: # we have not implement record io for test # skip test when use args.use_reader_op test_acc = test(startup_exe, infer_prog, test_reader, feeder, @@ -307,7 +311,46 @@ def print_paddle_envs(): print('------------------------------------------------') +def feed_data(feed_queue, train_reader, test_reader, dshapes, args): + train_cnt = 0 + test_cnt = 0 + print_per_train_batch = 1 + train_data_generator = train_reader() + start = time.time() + while True: + next_data = next(train_data_generator, None) + if next_data is None: + break + + next_data = list(next_data) + for i in range(len(next_data)): + if not isinstance(next_data[i], np.ndarray): + next_data[i] = np.array(next_data[i]) + next_data[i] = next_data[i].reshape([-1] + dshapes[i]) + + if not feed_queue.enqueue(next_data): + break + + train_cnt += 1 + ''' + if train_cnt % print_per_train_batch == 0: + end = time.time() + print('Feed queue size: %d, capacity: %d, speed: %.5fsec/batch' + % (feed_queue.size(), feed_queue.capacity(), (end-start)/print_per_train_batch)) + start = end + ''' + feed_queue.close() + + +def close_feed_queue(): + global feed_queue + if feed_queue is not None: + feed_queue.close() + + def main(): + global feed_queue + args = parse_args() print_arguments(args) print_paddle_envs() @@ -321,8 +364,23 @@ def main(): pr = cProfile.Profile() pr.enable() model_def = __import__("models.%s" % args.model, fromlist=["models"]) - train_args = list(model_def.get_model(args)) + model = model_def.get_model(args) + + if not args.use_reader_op and args.use_py_reader_op: + feed_queue = model[-4] + train_reader = model[-3] + test_reader = model[-2] + dshapes = model[-1] + feed_thread = threading.Thread( + target=feed_data, + 
args=(feed_queue, train_reader, test_reader, dshapes, args)) + #feed_thread.setDaemon(True) + feed_thread.start() + model = model[:-4] + + train_args = list(model) train_args.append(args) + # Run optimizer.minimize(avg_loss) train_args[2].minimize(train_args[0]) if args.memory_optimize: @@ -338,6 +396,7 @@ def main(): train_args.extend([nccl_id_var, num_trainers, trainer_id]) train_parallel(*train_args) train(*train_args) + close_feed_queue() exit(0) # for other update methods, use default programs @@ -362,3 +421,4 @@ def main(): if __name__ == "__main__": main() + close_feed_queue() diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 17f6b03826..43f0368cd4 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -182,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): - if args.use_reader_op: + if args.use_reader_op or args.use_py_reader_op: raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index 8e740dc689..fa5e1b6d64 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -66,13 +66,14 @@ def cnn_model(data): def get_model(args): + dshape = [1, 28, 28] if args.use_reader_op: filelist = [ os.path.join(args.data_path, f) for f in os.listdir(args.data_path) ] data_file = fluid.layers.open_files( filenames=filelist, - shapes=[[-1, 1, 28, 28], (-1, 1)], + shapes=[[-1] + dshape, (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], thread_num=args.gpus, @@ -81,8 +82,18 @@ def get_model(args): fluid.layers.batch( data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) + elif args.use_py_reader_op: + data_file, feed_queue = fluid.layers.py_array_reader( + capacity=args.feed_queue_capacity, + shapes=[[-1] + dshape, [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) else: - images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + images = fluid.layers.data(name='pixel', shape=dshape, dtype=DTYPE) label = fluid.layers.data(name='label', shape=[1], dtype='int64') if args.device == 'CPU' and args.cpus > 1: @@ -118,8 +129,16 @@ def get_model(args): learning_rate=0.001, beta1=0.9, beta2=0.999) # Reader + underlying_train_reader = paddle.dataset.mnist.train() + underlying_test_reader = paddle.dataset.mnist.test() train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) + underlying_train_reader, batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=args.batch_size) - return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc + underlying_test_reader, batch_size=args.batch_size) + + if not args.use_reader_op and args.use_py_reader_op: + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc, \ + feed_queue, underlying_train_reader, underlying_test_reader, \ + (dshape, [1]) + else: + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9ed1093c54..7fb81b04fb 100644 --- a/benchmark/fluid/models/resnet.py +++ 
b/benchmark/fluid/models/resnet.py @@ -163,6 +163,16 @@ def get_model(args): fluid.layers.batch( data_file, batch_size=args.batch_size)) input, label = fluid.layers.read_file(data_file) + elif args.use_py_reader_op: + data_file, feed_queue = fluid.layers.py_array_reader( + capacity=args.feed_queue_capacity, + shapes=[[-1] + dshape, [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + input, label = fluid.layers.read_file(data_file) else: input = fluid.layers.data(name='data', shape=dshape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -204,5 +214,11 @@ def get_model(args): batched_test_reader = paddle.batch( train_reader, batch_size=args.batch_size, drop_last=True) - return avg_cost, inference_program, optimizer, batched_train_reader,\ - batched_test_reader, batch_acc + if not args.use_reader_op and args.use_py_reader_op: + return avg_cost, inference_program, optimizer, batched_train_reader,\ + batched_test_reader, batch_acc, \ + feed_queue, train_reader, test_reader, \ + (dshape, [1]) + else: + return avg_cost, inference_program, optimizer, batched_train_reader,\ + batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 3231542a17..64c8cde15f 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,7 +44,7 @@ def crop_sentence(reader, crop_size): def get_model(args): - if args.use_reader_op: + if args.use_reader_op or args.use_py_reader_op: raise Exception( "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 932601302d..739681d4b6 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -54,12 +54,16 @@ def vgg16_bn_drop(input): def get_model(args): if args.data_set == "cifar10": + underlying_train_reader = paddle.dataset.cifar.train10() + underlying_test_reader = paddle.dataset.cifar.test10() classdim = 10 if args.data_format == 'NCHW': data_shape = [3, 32, 32] else: data_shape = [32, 32, 3] else: + underlying_train_reader = paddle.dataset.flowers.train() + underlying_test_reader = paddle.dataset.flowers.test() classdim = 102 if args.data_format == 'NCHW': data_shape = [3, 224, 224] @@ -81,6 +85,16 @@ def get_model(args): fluid.layers.batch( data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) + elif args.use_py_reader_op: + data_file, feed_queue = fluid.layers.py_array_reader( + capacity=args.feed_queue_capacity, + shapes=[[-1] + data_shape, [-1, 1]], + lod_levels=[0, 0], + dtypes=["float32", "int64"]) + data_file = fluid.layers.double_buffer( + fluid.layers.batch( + data_file, batch_size=args.batch_size)) + images, label = fluid.layers.read_file(data_file) else: images = fluid.layers.data( name='data', shape=data_shape, dtype='float32') @@ -109,13 +123,14 @@ def get_model(args): # data reader train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), + underlying_train_reader, buf_size=5120), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=args.batch_size) + 
underlying_test_reader, batch_size=args.batch_size) - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc + if not args.use_reader_op and args.use_py_reader_op: + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc, \ + feed_queue, underlying_train_reader, underlying_test_reader, \ + (data_shape, [1]) + else: + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 62532036f8..b6016e1d20 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -24,6 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) +reader_library(create_py_array_reader_op SRCS create_py_array_reader_op.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent diff --git a/paddle/fluid/operators/reader/create_py_array_reader_op.cc b/paddle/fluid/operators/reader/create_py_array_reader_op.cc new file mode 100644 index 0000000000..ed7ef4affb --- /dev/null +++ b/paddle/fluid/operators/reader/create_py_array_reader_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reader/py_array_feed_queue.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyArrayReader : public framework::ReaderBase { + public: + explicit PyArrayReader(const std::shared_ptr& queue) { + PADDLE_ENFORCE(queue != nullptr, "PyArrayFeedQueue must not be null"); + queue_ = queue; + } + + void ReadNext(std::vector* out) override { + *out = queue_->Dequeue(); + } + + void ReInit() override { + // PADDLE_THROW("PyArrayReader does not support ReInit()"); + } + + private: + std::shared_ptr queue_; +}; + +class CreatePyArrayReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const std::string& feeder_name = Attr("feeder_name"); + auto* feeder_holder_var = scope.FindVar(feeder_name); + PADDLE_ENFORCE(feeder_holder_var != nullptr, + "No PyArrayFeedQueue variable with name %s found", + feeder_name); + auto* feeder_holder = + feeder_holder_var->template GetMutable(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new PyArrayReader(feeder_holder->GetFeeder())); + } +}; + +class CreatePyArrayReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddAttr("feeder_name", + "Name of the `PyArrayFeedQueueHolder` variable"); + + AddComment(R"DOC( + Create PyArrayReader to accept Python data feeding. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_py_array_reader, + reader::CreatePyArrayReaderOp, + reader::CreatePyArrayReaderOpMaker); diff --git a/paddle/fluid/operators/reader/py_array_feed_queue.h b/paddle/fluid/operators/reader/py_array_feed_queue.h new file mode 100644 index 0000000000..f9552f73a6 --- /dev/null +++ b/paddle/fluid/operators/reader/py_array_feed_queue.h @@ -0,0 +1,207 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include //NOLINT +#include +#include // NOLINT +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/py_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" +#include "paddle/fluid/pybind/tensor_py.h" + +namespace paddle { +namespace operators { +namespace reader { + +using PyTuple = ::pybind11::tuple; +using PyArray = ::pybind11::array; + +template +using PyArrayT = ::pybind11::array_t; + +class PyArrayToTensorVisitor : public boost::static_visitor { + public: +#define PY_ARRAY_TO_TENSOR_WITH_TYPE(dtype, func_name) \ + pybind::func_name(tensor_, static_cast&>(py_array_), \ + place) + +#define PY_ARRAY_TO_TENSOR(func_name) \ + if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(size_t, func_name); \ + } else if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(int64_t, func_name); \ + } else if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(int32_t, func_name); \ + } else if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(int16_t, func_name); \ + } else if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(uint8_t, func_name); \ + } else if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(float, func_name); \ + } else if (IsType()) { \ + PY_ARRAY_TO_TENSOR_WITH_TYPE(double, func_name); \ + } else { \ + PADDLE_THROW("unsupported dtype of python array"); \ + } + + PyArrayToTensorVisitor(const PyArray& py_array, framework::Tensor* tensor) + : py_array_(py_array), tensor_(tensor) {} + + void operator()(const platform::CPUPlace& place) { + PY_ARRAY_TO_TENSOR(PyCPUTensorSetFromArray); + } + + void operator()(const platform::CUDAPlace& place) { +#ifdef PADDLE_WITH_CUDA + PY_ARRAY_TO_TENSOR(PyCUDATensorSetFromArray); +#else + PADDLE_THROW("CUDAPlace is not supported in CPU only version"); +#endif + } + + void operator()(const platform::CUDAPinnedPlace& place) { +#ifdef PADDLE_WITH_CUDA + PY_ARRAY_TO_TENSOR(PyCUDAPinnedTensorSetFromArray); +#else + PADDLE_THROW("CUDAPinnedPlace is not supported in CPU only version"); +#endif + } + +#undef PY_ARRAY_TO_TENSOR +#undef PY_ARRAY_TO_TENSOR_WITH_TYPE + + private: + template + inline bool IsType() const { + return ::pybind11::isinstance>(py_array_); + } + + private: + const PyArray& py_array_; + framework::Tensor* tensor_; +}; + +class PyArrayFeedQueueHolder; + +// PyArrayFeedQueue must be thread-safe +class PyArrayFeedQueue { + friend class PyArrayFeedQueueHolder; + + private: + PyArrayFeedQueue(size_t capacity, const std::vector& dims, + const platform::Place& place) + : dims_(dims), place_(place) { + queue_.reset( + new PyBlockingQueue>(capacity)); + } + + public: + ~PyArrayFeedQueue() { Close(); } + + bool Enqueue(const std::vector& py_array_vec) { + auto lod_tensor_vec = PyArrayVecToLoDTensorVec(py_array_vec); + VLOG(5) << "Enqueue at address " << reinterpret_cast(this); + return queue_->Send(std::move(lod_tensor_vec)); + } + + bool Enqueue(const std::vector& tensor_vec) { + VLOG(5) << "Enqueue at address " << reinterpret_cast(this); + return queue_->Send(tensor_vec); + } + + std::vector Dequeue() { + VLOG(5) << "Dequeue at address " << reinterpret_cast(this); + std::vector ret; + return queue_->Receive(&ret) ? 
ret : std::vector(); + } + + inline size_t Size() const { return queue_->Size(); } + + inline size_t Cap() const { return queue_->Cap(); } + + inline bool IsClosed() const { return queue_->IsClosed(); } + + inline void Close() { queue_->Close(); } + + private: + std::vector PyArrayVecToLoDTensorVec( + const std::vector& py_array_vec) { + PADDLE_ENFORCE(dims_.size() == py_array_vec.size(), + "expected input tensor number %d but found %d", dims_.size(), + py_array_vec.size()); + + size_t i = 0; + if (py_array_vec.size() > 1) { + size_t dim0 = py_array_vec[0].shape()[0]; + for (size_t j = 1; j < py_array_vec.size(); ++j) { + PADDLE_ENFORCE(dim0 == py_array_vec[j].shape()[0], + "0-dim of the %d-th input tensor is %d, but 0-dim of " + "the 0-th input tensor is %d", + j, py_array_vec[j].shape()[0], dim0); + } + } + + std::vector lod_tensor_vec; + lod_tensor_vec.reserve(py_array_vec.size()); + + std::for_each( + py_array_vec.begin(), py_array_vec.end(), [&](const PyArray& py_array) { + for (int64_t j = 1; j < dims_[i].size(); ++j) { + PADDLE_ENFORCE( + dims_[i][j] == static_cast(py_array.shape()[j]), + "expected %d-dim of %d-th input tensor is %d but found %d", j, + i, dims_[i][j], py_array.shape()[j]); + } + + lod_tensor_vec.emplace_back(framework::LoDTensor()); + PyArrayToTensorVisitor visitor(py_array, &(lod_tensor_vec.back())); + boost::apply_visitor(visitor, place_); + ++i; + }); + return lod_tensor_vec; + } + + std::unique_ptr>> queue_; + std::vector dims_; + platform::Place place_; +}; + +class PyArrayFeedQueueHolder { + public: + PyArrayFeedQueueHolder() {} + + void InitOnce(size_t capacity, const std::vector& dims, + const platform::Place& place) { + PADDLE_ENFORCE( + feeder_ == nullptr, + "PyArrayFeedQueueHolder::InitOnce() can only be called once"); + feeder_.reset(new PyArrayFeedQueue(capacity, dims, place)); + } + + std::shared_ptr GetFeeder() { return feeder_; } + const std::shared_ptr& GetFeeder() const { return feeder_; } + + private: + std::shared_ptr feeder_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/py_blocking_queue.h b/paddle/fluid/operators/reader/py_blocking_queue.h new file mode 100644 index 0000000000..721767102b --- /dev/null +++ b/paddle/fluid/operators/reader/py_blocking_queue.h @@ -0,0 +1,125 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include + +#include "Python.h" +#include "paddle/fluid/platform/enforce.h" +#include "pybind11/pybind11.h" + +namespace paddle { +namespace operators { +namespace reader { + +// PyBlockingQueue is designed for PyArrayFeedQueue +// PyBlockingQueue would release GIL of Python when +// the queue is full to avoid deadlock. 
+template +class PyBlockingQueue { + public: + explicit PyBlockingQueue(size_t capacity) + : capacity_(capacity), closed_(false) { + PADDLE_ENFORCE_GT( + capacity_, 0, + "The capacity of a reader::PyBlockingQueue must be greater than 0."); + } + + ~PyBlockingQueue() { Close(); } + + bool Send(const T& elem) { + std::unique_lock lock(mutex_); + receive_cv_.notify_one(); + if (queue_.size() >= capacity_ && (!closed_)) { + pybind11::gil_scoped_release release; + send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); + } + if (closed_) { + VLOG(5) + << "WARNING: Sending an element to a closed reader::BlockingQueue."; + return false; + } + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + return true; + } + + bool Send(T&& elem) { + std::unique_lock lock(mutex_); + receive_cv_.notify_one(); + if (queue_.size() >= capacity_ && (!closed_)) { + pybind11::gil_scoped_release release; + send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); + } + if (closed_) { + VLOG(5) + << "WARNING: Sending an element to a closed reader::BlokcingQueue."; + return false; + } + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + return true; + } + + bool Receive(T* elem) { + std::unique_lock lock(mutex_); + send_cv_.notify_one(); + receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); + if (!queue_.empty()) { + PADDLE_ENFORCE_NOT_NULL(elem); + *elem = queue_.front(); + queue_.pop_front(); + return true; + } else { + PADDLE_ENFORCE(closed_); + return false; + } + } + + void Close() { + std::lock_guard lock(mutex_); + closed_ = true; + send_cv_.notify_all(); + receive_cv_.notify_all(); + } + + bool IsClosed() const { + std::lock_guard lock(mutex_); + return closed_; + } + + size_t Cap() const { + std::lock_guard lock(mutex_); + return capacity_; + } + + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + + private: + size_t capacity_; + bool closed_; + std::deque queue_; + + mutable std::mutex mutex_; + mutable std::condition_variable receive_cv_; + mutable std::condition_variable send_cv_; +}; +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5a45e431df..472595f6a8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/reader/py_array_feed_queue.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -297,6 +298,42 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ReInit); + using PyArrayFeedQueue = ::paddle::operators::reader::PyArrayFeedQueue; + using PyArrayFeedQueueHolder = + ::paddle::operators::reader::PyArrayFeedQueueHolder; + using PyArray = ::paddle::operators::reader::PyArray; + py::class_(m, "PyArrayFeedQueue", "") + .def( + "enqueue", + [](PyArrayFeedQueue &self, const std::vector &py_array_vec) { + return self.Enqueue(py_array_vec); + }) + .def("enqueue", + [](PyArrayFeedQueue &self, + const std::vector &lod_tensor_vec) { + return self.Enqueue(lod_tensor_vec); + }) + .def("size", [](const PyArrayFeedQueue &self) { return self.Size(); }) + .def("capacity", [](const PyArrayFeedQueue &self) { return self.Cap(); }) + .def("close", [](PyArrayFeedQueue &self) { return self.Close(); }) + .def("is_closed", + [](const PyArrayFeedQueue &self) { return self.IsClosed(); }); + + m.def("init_py_array_feed_queue", + [](Variable &var, size_t capacity, + const std::vector> &shapes, + const ::paddle::platform::Place &place) -> PyArrayFeedQueue * { + std::vector dims(shapes.size()); + std::transform(shapes.begin(), shapes.end(), dims.begin(), + [](const std::vector &shape) { + return make_ddim(shape); + }); + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, dims, place); + return holder->GetFeeder().get(); + }, + py::return_value_policy::reference); + py::class_(m, "Scope", "") .def("var", [](Scope &self, const std::string &name) -> Variable * { @@ -463,10 +500,11 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_DISTRIBUTE .def("complete", &Executor::Complete) #endif - .def("run", - (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & - Executor::Run); - + .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, + int block_id, bool create_local_scope, bool create_vars) { + pybind11::gil_scoped_release release; + self.Run(prog, scope, block_id, create_local_scope, create_vars); + }); m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("init_devices", diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 6da3846ac6..3e2ea1ef88 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCPUTensorSetFromArray( +inline void PyCPUTensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDATensorSetFromArray( +inline void PyCUDATensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. 
-void PyCUDAPinnedTensorSetFromArray( +inline void PyCUDAPinnedTensorSetFromArray( framework::Tensor *self, pybind11::array_t diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index f3ab47c96b..3773653bcd 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -24,7 +24,8 @@ from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', - 'double_buffer', 'random_data_generator', 'Preprocessor', 'load' + 'double_buffer', 'random_data_generator', 'py_array_reader', 'Preprocessor', + 'load' ] @@ -448,6 +449,61 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): return monkey_patch_reader_methods(main_prog_var) +# UNCHECK(zengjinle) +def py_array_reader(capacity, + shapes, + lod_levels, + dtypes, + place=None, + for_parallel=True): + + if place is None: + place = core.CPUPlace() + + if not isinstance(place, core.Place): + new_place = core.Place() + new_place.set_place(place) + place = new_place + + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) + + feeder_name = unique_name('py_array_feed_queue') + var = global_scope().var(feeder_name) + + #feed_shapes = [shape[1:] for shape in shapes] + feed_queue = core.init_py_array_feed_queue(var, capacity, shapes, place) + + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var( + name=unique_name('create_py_array_reader')) + startup_blk.append_op( + type='create_py_array_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'ranks': ranks, + 'feeder_name': feeder_name + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) + + if for_parallel: + main_prog_var = parallel(reader=main_prog_var) + + return monkey_patch_reader_methods(main_prog_var), feed_queue + + def open_files(filenames, shapes, lod_levels, From 9b63fef32d724d9b1c68383a1483d2a0c1291415 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 02:01:23 +0000 Subject: [PATCH 22/68] delete some redundant comments --- paddle/fluid/operators/reader/create_py_array_reader_op.cc | 4 +--- python/paddle/fluid/layers/io.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reader/create_py_array_reader_op.cc b/paddle/fluid/operators/reader/create_py_array_reader_op.cc index ed7ef4affb..36378c7deb 100644 --- a/paddle/fluid/operators/reader/create_py_array_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_array_reader_op.cc @@ -29,9 +29,7 @@ class PyArrayReader : public framework::ReaderBase { *out = queue_->Dequeue(); } - void ReInit() override { - // PADDLE_THROW("PyArrayReader does not support ReInit()"); - } + void ReInit() override {} private: std::shared_ptr queue_; diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 3773653bcd..811471c5fd 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -449,7 +449,6 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): return monkey_patch_reader_methods(main_prog_var) -# UNCHECK(zengjinle) def py_array_reader(capacity, shapes, lod_levels, 
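The two patches above (21/68 and 22/68) together introduce `fluid.layers.py_array_reader`, which returns a reader variable plus a feed queue whose `enqueue`/`close` methods are meant to be driven from a separate Python thread, as done by the `feed_data` helper and the `mnist.py` changes in patch 21. Below is a minimal usage sketch; the queue capacity, batch size, MNIST shapes, and the `feed` helper name are illustrative assumptions, while `py_array_reader`, `batch`, `double_buffer`, `read_file`, `enqueue`, and `close` are the APIs added above.

```Python
# Minimal sketch of the py_array_reader workflow from patches 21/22.
# Capacity, batch size and the MNIST data source are illustrative choices.
import threading
import numpy as np
import paddle
import paddle.fluid as fluid

reader, feed_queue = fluid.layers.py_array_reader(
    capacity=64,
    shapes=[[-1, 1, 28, 28], [-1, 1]],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'])
reader = fluid.layers.double_buffer(
    fluid.layers.batch(reader, batch_size=32))
images, label = fluid.layers.read_file(reader)


def feed(queue):
    # Push one sample at a time; the C++-side batch reader assembles
    # mini-batches. Dims beyond the first must match the declared shapes.
    for image, label_id in paddle.dataset.mnist.train()():
        arrays = [
            np.array(image, dtype='float32').reshape([-1, 1, 28, 28]),
            np.array(label_id, dtype='int64').reshape([-1, 1]),
        ]
        # enqueue() returns False once the queue has been closed.
        if not queue.enqueue(arrays):
            break
    queue.close()


threading.Thread(target=feed, args=(feed_queue, )).start()
```

Feeding single samples and batching on the C++ side mirrors what `feed_data` in `fluid_benchmark.py` does; whole pre-batched arrays could be enqueued instead, since only the dimensions after the first are checked against the declared shapes.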
From 8536d261ab0bd7dbdb609f6f6280fd49d3fef844 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 25 Jun 2018 12:11:36 +0800 Subject: [PATCH 23/68] Fix doc format of beam_search --- python/paddle/fluid/layers/nn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bfd9a59628..5db0916504 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2745,6 +2745,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): whose lods can be used to restore the path in the beam search tree. Please see the following demo for a fully beam search usage example: fluid/tests/book/test_machine_translation.py + Args: ids(Variable): The LodTensorArray variable containing the selected ids of all steps. @@ -2754,12 +2755,14 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): end_id(int): The id of end token. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + Returns: Variable: The LodTensor pair containing the generated id sequences \ and the corresponding scores. The shapes and lods of the two \ LodTensor are same. The lod level is 2 and the two levels \ separately indicate how many hypotheses each source sentence has \ and how many ids each hypothesis has. + Examples: .. code-block:: python # Suppose `ids` and `scores` are LodTensorArray variables reserving From 64292f07d2f0835f20587e2b20853a1472261f63 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 07:19:41 +0000 Subject: [PATCH 24/68] remove python_data_feeding.md in local branch --- .../design/concepts/python_data_feeding.md | 119 ------------------ 1 file changed, 119 deletions(-) delete mode 100644 doc/fluid/design/concepts/python_data_feeding.md diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md deleted file mode 100644 index 7966fc27c0..0000000000 --- a/doc/fluid/design/concepts/python_data_feeding.md +++ /dev/null @@ -1,119 +0,0 @@ -# Python Data Feeding - -In the former implementation of Paddle Fluid, there are two ways to feed data: - -- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. - -- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. - -In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `PyArrayFeedQueue` is designed to be shared by the Python and C++ side, while Python array is pushed into the queue and `reader_op` in C++ side reads out the data from the queue. 
- -## Design of PyArrayFeedQueue -`PyArrayFeedQueue` is a blocking queue with a fixed `capacity` and accepts Python array with shapes indicated by `dims`. -```C++ -class PyArrayFeedQueueHolder; - -class PyArrayFeedQueue { - friend class PyArrayFeedQueueHolder; - private: - // PyArrayFeedQueue can only be constructed by PyArrayFeedQueueHolder - PyArrayFeedQueue(size_t capacity, const std::vector& dims, - const platform::Place& place); - - public: - size_t size() const; // Get the current size of the queue - size_t capacity() const; // Get the capacity of the queue - bool is_full() const; - bool is_empty() const; - - // Convert Python array tuple to std::vector and store it. - // Block if is_full() == true - // Use pybind11::gil_scoped_release to release GIL of Python - void push(const pybind11::tuple& array_tuple); - - // Block if is_empty() == true - // Use pybind11::gil_scoped_release to release GIL of Python - std::vector pop(); - private: - // CircularQueue is a class like `boost::circular_buffer` - framework::CircularQueue> queue_; - std::vector dims_; - platform::Place place_; - mutable std::mutex mutex_; - mutable std::condition_variable cv_; -}; - -class PyArrayFeedQueueHolder { - public: - PyArrayFeedQueueHolder() {} - - // Calls the constructor of PyArrayFeedQueue to create feeder_ - // `init_once` can only called once, otherwise an exception would raise - void init_once(size_t capacity, const std::vector& dims, const Place& place); - - std::shared_ptr feeder() { return feeder_; } - const std::shared_ptr& feeder() const { return feeder_; } - private: - std::shared_ptr feeder_; -}; -``` - -There are some major things that must be concerned: -- `PyArrayFeedQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. Since `PyArrayFeedQueue` does not have a default constructor, it cannot be constructed by `Scope::Var()::GetMutable()`. To solve this problem, `PyArrayFeedQueueHolder` is designed to defer construction of `PyArrayFeedQueue`. -- A `Variable` of `PyArrayFeedQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. -- `Create_reader_op` should accept the name or address of `PyArrayFeedQueueHolder` as an input or attribute. - - -## Design of PyArrayReader -`PyArrayReader` is a reader which holds a `PyArrayFeedQueue` object. Notice that `ReInit()` function is not supported because the capacity of the `PyArrayFeedQueue` object is limited. -```C++ -class PyArrayReader : public ReaderBase { - public: - explicit PyArrayReader(const std::shared_ptr& queue); - - void ReadNext(std::vector* out) override; - - void ReInit() override { - PADDLE_THROW("PyArrayReader does not support ReInit()"); - } - - private: - std::shared_ptr queue_; -}; -``` - -## Design of CreatePyArrayReaderOp -`CreatePyArrayReaderOp` is used to create `PyArrayReader` object. It requires an attribute of `feeder_name` which indicates the name of the `PyArrayFeedQueueHolder` variable. 
-```C++ -class CreatePyArrayReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& feeder_name = Attr("feeder_name"); - auto* feeder_holder_var = scope.FindVar(feeder_name); - PADDLE_ENFORCE(feed_holder_var != nullptr); - auto* feeder_holder = feeder_holder_var - ->template GetMutable(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new PyArrayReader(feeder_holder->feeder()); - } -}; -``` - -## Design of Python codes -The design of Python codes are as follows. First, we construct a variable of `PyArrayFeedQueueHolder` and init it with given parameters, returning the `PyArrayFeedQueue` object after initialization. After that, a layer of `CreatePyArrayReaderOp` is constructed and accepts the name of the `PyArrayFeedQueueHolder` variable. The `PyArrayFeedQueue` object and result of the layer are both returned. -```Python -def py_array_reader(capacity, shapes, place): - feeder_name = unique_name.generate("py_array_feed_queue") - var = global_scope().var(feeder_name) # create PyArrayFeedQueueHolder Variable - feed_queue = core.init_py_array_feed_queue(var, capacity, shapes, place) # init PyArrayFeedQueue - out = create_var() - create_reader_op_with_feeder_name( - type='create_py_array_reader', - outputs={'Out':[out]}, - attrs = {'feeder_name': feeder_name}) - return out, feed_queue -``` From 748e204effe86559cc7db7ead2260a0a15915f05 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 07:26:42 +0000 Subject: [PATCH 25/68] Revert "refine ZeroGradFunctor in activation_op.h" This reverts commit 1eeb11ef6190a7697cdce7914646a0d6163e7597. --- paddle/fluid/operators/activation_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 497a233338..9124151926 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -353,7 +353,7 @@ struct ZeroGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = out.constant(static_cast(0)); + dx.device(d) = static_cast(0) / out; } }; From 2dcf0e4e665fb553494fa144dc59129dc0acb309 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 09:09:27 +0000 Subject: [PATCH 26/68] delete py_array_feed_queue.h --- .../operators/reader/py_array_feed_queue.h | 207 ------------------ 1 file changed, 207 deletions(-) delete mode 100644 paddle/fluid/operators/reader/py_array_feed_queue.h diff --git a/paddle/fluid/operators/reader/py_array_feed_queue.h b/paddle/fluid/operators/reader/py_array_feed_queue.h deleted file mode 100644 index f9552f73a6..0000000000 --- a/paddle/fluid/operators/reader/py_array_feed_queue.h +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include //NOLINT -#include -#include // NOLINT -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/operators/reader/py_blocking_queue.h" -#include "paddle/fluid/operators/reader/reader_op_registry.h" -#include "paddle/fluid/pybind/tensor_py.h" - -namespace paddle { -namespace operators { -namespace reader { - -using PyTuple = ::pybind11::tuple; -using PyArray = ::pybind11::array; - -template -using PyArrayT = ::pybind11::array_t; - -class PyArrayToTensorVisitor : public boost::static_visitor { - public: -#define PY_ARRAY_TO_TENSOR_WITH_TYPE(dtype, func_name) \ - pybind::func_name(tensor_, static_cast&>(py_array_), \ - place) - -#define PY_ARRAY_TO_TENSOR(func_name) \ - if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(size_t, func_name); \ - } else if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(int64_t, func_name); \ - } else if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(int32_t, func_name); \ - } else if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(int16_t, func_name); \ - } else if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(uint8_t, func_name); \ - } else if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(float, func_name); \ - } else if (IsType()) { \ - PY_ARRAY_TO_TENSOR_WITH_TYPE(double, func_name); \ - } else { \ - PADDLE_THROW("unsupported dtype of python array"); \ - } - - PyArrayToTensorVisitor(const PyArray& py_array, framework::Tensor* tensor) - : py_array_(py_array), tensor_(tensor) {} - - void operator()(const platform::CPUPlace& place) { - PY_ARRAY_TO_TENSOR(PyCPUTensorSetFromArray); - } - - void operator()(const platform::CUDAPlace& place) { -#ifdef PADDLE_WITH_CUDA - PY_ARRAY_TO_TENSOR(PyCUDATensorSetFromArray); -#else - PADDLE_THROW("CUDAPlace is not supported in CPU only version"); -#endif - } - - void operator()(const platform::CUDAPinnedPlace& place) { -#ifdef PADDLE_WITH_CUDA - PY_ARRAY_TO_TENSOR(PyCUDAPinnedTensorSetFromArray); -#else - PADDLE_THROW("CUDAPinnedPlace is not supported in CPU only version"); -#endif - } - -#undef PY_ARRAY_TO_TENSOR -#undef PY_ARRAY_TO_TENSOR_WITH_TYPE - - private: - template - inline bool IsType() const { - return ::pybind11::isinstance>(py_array_); - } - - private: - const PyArray& py_array_; - framework::Tensor* tensor_; -}; - -class PyArrayFeedQueueHolder; - -// PyArrayFeedQueue must be thread-safe -class PyArrayFeedQueue { - friend class PyArrayFeedQueueHolder; - - private: - PyArrayFeedQueue(size_t capacity, const std::vector& dims, - const platform::Place& place) - : dims_(dims), place_(place) { - queue_.reset( - new PyBlockingQueue>(capacity)); - } - - public: - ~PyArrayFeedQueue() { Close(); } - - bool Enqueue(const std::vector& py_array_vec) { - auto lod_tensor_vec = PyArrayVecToLoDTensorVec(py_array_vec); - VLOG(5) << "Enqueue at address " << reinterpret_cast(this); - return queue_->Send(std::move(lod_tensor_vec)); - } - - bool Enqueue(const std::vector& tensor_vec) { - VLOG(5) << "Enqueue at address " << reinterpret_cast(this); - return queue_->Send(tensor_vec); - } - - std::vector Dequeue() { - VLOG(5) << "Dequeue at address " << reinterpret_cast(this); - std::vector ret; - return queue_->Receive(&ret) ? 
ret : std::vector(); - } - - inline size_t Size() const { return queue_->Size(); } - - inline size_t Cap() const { return queue_->Cap(); } - - inline bool IsClosed() const { return queue_->IsClosed(); } - - inline void Close() { queue_->Close(); } - - private: - std::vector PyArrayVecToLoDTensorVec( - const std::vector& py_array_vec) { - PADDLE_ENFORCE(dims_.size() == py_array_vec.size(), - "expected input tensor number %d but found %d", dims_.size(), - py_array_vec.size()); - - size_t i = 0; - if (py_array_vec.size() > 1) { - size_t dim0 = py_array_vec[0].shape()[0]; - for (size_t j = 1; j < py_array_vec.size(); ++j) { - PADDLE_ENFORCE(dim0 == py_array_vec[j].shape()[0], - "0-dim of the %d-th input tensor is %d, but 0-dim of " - "the 0-th input tensor is %d", - j, py_array_vec[j].shape()[0], dim0); - } - } - - std::vector lod_tensor_vec; - lod_tensor_vec.reserve(py_array_vec.size()); - - std::for_each( - py_array_vec.begin(), py_array_vec.end(), [&](const PyArray& py_array) { - for (int64_t j = 1; j < dims_[i].size(); ++j) { - PADDLE_ENFORCE( - dims_[i][j] == static_cast(py_array.shape()[j]), - "expected %d-dim of %d-th input tensor is %d but found %d", j, - i, dims_[i][j], py_array.shape()[j]); - } - - lod_tensor_vec.emplace_back(framework::LoDTensor()); - PyArrayToTensorVisitor visitor(py_array, &(lod_tensor_vec.back())); - boost::apply_visitor(visitor, place_); - ++i; - }); - return lod_tensor_vec; - } - - std::unique_ptr>> queue_; - std::vector dims_; - platform::Place place_; -}; - -class PyArrayFeedQueueHolder { - public: - PyArrayFeedQueueHolder() {} - - void InitOnce(size_t capacity, const std::vector& dims, - const platform::Place& place) { - PADDLE_ENFORCE( - feeder_ == nullptr, - "PyArrayFeedQueueHolder::InitOnce() can only be called once"); - feeder_.reset(new PyArrayFeedQueue(capacity, dims, place)); - } - - std::shared_ptr GetFeeder() { return feeder_; } - const std::shared_ptr& GetFeeder() const { return feeder_; } - - private: - std::shared_ptr feeder_; -}; - -} // namespace reader -} // namespace operators -} // namespace paddle From 7b2339d7c5c60cb4d3364601fb1b44564f392de3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 09:09:58 +0000 Subject: [PATCH 27/68] delete create_py_array_reader_op.cc --- .../reader/create_py_array_reader_op.cc | 78 ------------------- 1 file changed, 78 deletions(-) delete mode 100644 paddle/fluid/operators/reader/create_py_array_reader_op.cc diff --git a/paddle/fluid/operators/reader/create_py_array_reader_op.cc b/paddle/fluid/operators/reader/create_py_array_reader_op.cc deleted file mode 100644 index 36378c7deb..0000000000 --- a/paddle/fluid/operators/reader/create_py_array_reader_op.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reader/py_array_feed_queue.h" - -namespace paddle { -namespace operators { -namespace reader { - -class PyArrayReader : public framework::ReaderBase { - public: - explicit PyArrayReader(const std::shared_ptr& queue) { - PADDLE_ENFORCE(queue != nullptr, "PyArrayFeedQueue must not be null"); - queue_ = queue; - } - - void ReadNext(std::vector* out) override { - *out = queue_->Dequeue(); - } - - void ReInit() override {} - - private: - std::shared_ptr queue_; -}; - -class CreatePyArrayReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const std::string& feeder_name = Attr("feeder_name"); - auto* feeder_holder_var = scope.FindVar(feeder_name); - PADDLE_ENFORCE(feeder_holder_var != nullptr, - "No PyArrayFeedQueue variable with name %s found", - feeder_name); - auto* feeder_holder = - feeder_holder_var->template GetMutable(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new PyArrayReader(feeder_holder->GetFeeder())); - } -}; - -class CreatePyArrayReaderOpMaker : public FileReaderMakerBase { - protected: - void Apply() override { - AddAttr("feeder_name", - "Name of the `PyArrayFeedQueueHolder` variable"); - - AddComment(R"DOC( - Create PyArrayReader to accept Python data feeding. - )DOC"); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace reader = ::paddle::operators::reader; - -REGISTER_FILE_READER_OPERATOR(create_py_array_reader, - reader::CreatePyArrayReaderOp, - reader::CreatePyArrayReaderOpMaker); From 502faf62a991d0421969de114393b95f73f0bcae Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 11:07:06 +0000 Subject: [PATCH 28/68] complete_py_reader_cpp --- benchmark/fluid/args.py | 10 -- benchmark/fluid/fluid_benchmark.py | 86 +++----------- benchmark/fluid/models/machine_translation.py | 2 +- benchmark/fluid/models/mnist.py | 29 +---- benchmark/fluid/models/resnet.py | 20 +--- .../fluid/models/stacked_dynamic_lstm.py | 2 +- benchmark/fluid/models/vgg.py | 29 ++--- paddle/fluid/operators/reader/CMakeLists.txt | 2 +- .../fluid/operators/reader/blocking_queue.h | 17 ++- .../operators/reader/create_py_reader_op.cc | 81 +++++++++++++ .../reader/lod_tensor_blocking_queue.h | 107 ++++++++++++++++++ paddle/fluid/pybind/pybind.cc | 62 +++++----- python/paddle/fluid/layers/io.py | 57 +--------- 13 files changed, 264 insertions(+), 240 deletions(-) create mode 100644 paddle/fluid/operators/reader/create_py_reader_op.cc create mode 100644 paddle/fluid/operators/reader/lod_tensor_blocking_queue.h diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index dcd4ee2324..68a3d42d7a 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -122,15 +122,5 @@ def parse_args(): type=str, default="", help='Directory that contains all the training recordio files.') - parser.add_argument( - '--use_py_reader_op', - action='store_true', - help='Whether to use Python reader op, omitted when use_reader_op is true' - ) - parser.add_argument( - '--feed_queue_capacity', - type=int, - default=64, - help='Capacity of feed queue when use_py_reader_op is true') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index b5acb6549f..ece1102dce 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -25,9 +25,6 @@ 
import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler from args import * -import threading - -feed_queue = None def append_nccl2_prepare(trainer_id): @@ -134,7 +131,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, exe = fluid.Executor(place) exe.run(startup_prog) - if not args.use_reader_op and not args.use_py_reader_op: + if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data @@ -144,12 +141,12 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): train_losses = [] - if not args.use_reader_op and not args.use_py_reader_op: + if not args.use_reader_op: reader_generator = train_reader() batch_id = 0 data = None while True: - if not args.use_reader_op and not args.use_py_reader_op: + if not args.use_reader_op: data = next(reader_generator, None) if data == None: break @@ -159,7 +156,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, start_time = time.time() num_samples = 0 - if args.use_reader_op or args.use_py_reader_op: + if args.use_reader_op: try: loss = exe.run(train_prog, fetch_list=[avg_loss]) except fluid.core.EnforceNotMet as ex: @@ -173,7 +170,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, # FIXME(wuyi): For use_reader_op, if the current # pass is not the last, the last batch of this pass # is also equal to args.batch_size. - if args.use_reader_op or args.use_py_reader_op: + if args.use_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) @@ -183,13 +180,12 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, print_train_time(start_time, time.time(), num_samples) print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), # evaluation - if not args.no_test and batch_acc and not args.use_reader_op and not args.use_py_reader_op: + if not args.no_test and batch_acc and not args.use_reader_op: pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc) print(", Test Accuracy: %f" % pass_test_acc) print("\n") # TODO(wuyi): add warmup passes to get better perf data. 
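With the feed-queue thread removed, the benchmark relies on a single end-of-data convention for reader ops: `exe.run` raises once the reader is exhausted. A minimal sketch of that loop pattern, assuming `exe`, `train_prog` and `avg_loss` are built as in the benchmark above (names reused here purely for illustration):

```python
import paddle.fluid as fluid

def run_one_pass(exe, train_prog, avg_loss):
    """Sketch of the --use_reader_op loop: no feed dict, stop on EnforceNotMet."""
    losses = []
    while True:
        try:
            # The reader op pulls the next batch itself, so nothing is fed.
            loss, = exe.run(train_prog, fetch_list=[avg_loss])
        except fluid.core.EnforceNotMet:
            # Raised once the underlying data source is exhausted; the
            # benchmark treats this as the end of the pass.
            break
        losses.append(loss)
    return losses
```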
- close_feed_queue() exit(0) @@ -199,7 +195,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - if not args.use_reader_op and not args.use_py_reader_op: + if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data @@ -242,12 +238,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, num_samples = 0 iters = 0 start_time = time.time() - if not args.use_reader_op and not args.use_py_reader_op: + if not args.use_reader_op: reader_generator = train_reader() batch_id = 0 data = None while True: - if not args.use_reader_op and not args.use_py_reader_op: + if not args.use_reader_op: data = next(reader_generator, None) if data == None: break @@ -261,14 +257,14 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 - if args.use_fake_data or args.use_reader_op or args.use_py_reader_op: + if args.use_fake_data or args.use_reader_op: try: loss, = exe.run([avg_loss.name]) except fluid.core.EnforceNotMet as ex: break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) - if args.use_reader_op or args.use_py_reader_op: + if args.use_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) @@ -279,7 +275,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_id += 1 print_train_time(start_time, time.time(), num_samples) - if not args.no_test and batch_acc and not args.use_reader_op and not args.use_py_reader_op: + if not args.no_test and batch_acc and not args.use_reader_op: # we have not implement record io for test # skip test when use args.use_reader_op test_acc = test(startup_exe, infer_prog, test_reader, feeder, @@ -311,46 +307,7 @@ def print_paddle_envs(): print('------------------------------------------------') -def feed_data(feed_queue, train_reader, test_reader, dshapes, args): - train_cnt = 0 - test_cnt = 0 - print_per_train_batch = 1 - train_data_generator = train_reader() - start = time.time() - while True: - next_data = next(train_data_generator, None) - if next_data is None: - break - - next_data = list(next_data) - for i in range(len(next_data)): - if not isinstance(next_data[i], np.ndarray): - next_data[i] = np.array(next_data[i]) - next_data[i] = next_data[i].reshape([-1] + dshapes[i]) - - if not feed_queue.enqueue(next_data): - break - - train_cnt += 1 - ''' - if train_cnt % print_per_train_batch == 0: - end = time.time() - print('Feed queue size: %d, capacity: %d, speed: %.5fsec/batch' - % (feed_queue.size(), feed_queue.capacity(), (end-start)/print_per_train_batch)) - start = end - ''' - feed_queue.close() - - -def close_feed_queue(): - global feed_queue - if feed_queue is not None: - feed_queue.close() - - def main(): - global feed_queue - args = parse_args() print_arguments(args) print_paddle_envs() @@ -364,23 +321,8 @@ def main(): pr = cProfile.Profile() pr.enable() model_def = __import__("models.%s" % args.model, fromlist=["models"]) - model = model_def.get_model(args) - - if not args.use_reader_op and args.use_py_reader_op: - feed_queue = model[-4] - train_reader = model[-3] - test_reader = model[-2] - dshapes = model[-1] - feed_thread = threading.Thread( - target=feed_data, - args=(feed_queue, train_reader, test_reader, dshapes, 
args)) - #feed_thread.setDaemon(True) - feed_thread.start() - model = model[:-4] - - train_args = list(model) + train_args = list(model_def.get_model(args)) train_args.append(args) - # Run optimizer.minimize(avg_loss) train_args[2].minimize(train_args[0]) if args.memory_optimize: @@ -396,7 +338,6 @@ def main(): train_args.extend([nccl_id_var, num_trainers, trainer_id]) train_parallel(*train_args) train(*train_args) - close_feed_queue() exit(0) # for other update methods, use default programs @@ -421,4 +362,3 @@ def main(): if __name__ == "__main__": main() - close_feed_queue() diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 43f0368cd4..17f6b03826 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -182,7 +182,7 @@ def lodtensor_to_ndarray(lod_tensor): def get_model(args): - if args.use_reader_op or args.use_py_reader_op: + if args.use_reader_op: raise Exception("machine_translation do not support reader op for now.") embedding_dim = 512 encoder_size = 512 diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index fa5e1b6d64..8e740dc689 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -66,14 +66,13 @@ def cnn_model(data): def get_model(args): - dshape = [1, 28, 28] if args.use_reader_op: filelist = [ os.path.join(args.data_path, f) for f in os.listdir(args.data_path) ] data_file = fluid.layers.open_files( filenames=filelist, - shapes=[[-1] + dshape, (-1, 1)], + shapes=[[-1, 1, 28, 28], (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"], thread_num=args.gpus, @@ -82,18 +81,8 @@ def get_model(args): fluid.layers.batch( data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) - elif args.use_py_reader_op: - data_file, feed_queue = fluid.layers.py_array_reader( - capacity=args.feed_queue_capacity, - shapes=[[-1] + dshape, [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file, batch_size=args.batch_size)) - images, label = fluid.layers.read_file(data_file) else: - images = fluid.layers.data(name='pixel', shape=dshape, dtype=DTYPE) + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) label = fluid.layers.data(name='label', shape=[1], dtype='int64') if args.device == 'CPU' and args.cpus > 1: @@ -129,16 +118,8 @@ def get_model(args): learning_rate=0.001, beta1=0.9, beta2=0.999) # Reader - underlying_train_reader = paddle.dataset.mnist.train() - underlying_test_reader = paddle.dataset.mnist.test() train_reader = paddle.batch( - underlying_train_reader, batch_size=args.batch_size * args.gpus) + paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( - underlying_test_reader, batch_size=args.batch_size) - - if not args.use_reader_op and args.use_py_reader_op: - return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc, \ - feed_queue, underlying_train_reader, underlying_test_reader, \ - (dshape, [1]) - else: - return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc + paddle.dataset.mnist.test(), batch_size=args.batch_size) + return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 7fb81b04fb..9ed1093c54 100644 --- a/benchmark/fluid/models/resnet.py +++ 
b/benchmark/fluid/models/resnet.py @@ -163,16 +163,6 @@ def get_model(args): fluid.layers.batch( data_file, batch_size=args.batch_size)) input, label = fluid.layers.read_file(data_file) - elif args.use_py_reader_op: - data_file, feed_queue = fluid.layers.py_array_reader( - capacity=args.feed_queue_capacity, - shapes=[[-1] + dshape, [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file, batch_size=args.batch_size)) - input, label = fluid.layers.read_file(data_file) else: input = fluid.layers.data(name='data', shape=dshape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -214,11 +204,5 @@ def get_model(args): batched_test_reader = paddle.batch( train_reader, batch_size=args.batch_size, drop_last=True) - if not args.use_reader_op and args.use_py_reader_op: - return avg_cost, inference_program, optimizer, batched_train_reader,\ - batched_test_reader, batch_acc, \ - feed_queue, train_reader, test_reader, \ - (dshape, [1]) - else: - return avg_cost, inference_program, optimizer, batched_train_reader,\ - batched_test_reader, batch_acc + return avg_cost, inference_program, optimizer, batched_train_reader,\ + batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 64c8cde15f..3231542a17 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -44,7 +44,7 @@ def crop_sentence(reader, crop_size): def get_model(args): - if args.use_reader_op or args.use_py_reader_op: + if args.use_reader_op: raise Exception( "stacked_dynamic_lstm do not support reader op for now.") lstm_size = 512 diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 739681d4b6..932601302d 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -54,16 +54,12 @@ def vgg16_bn_drop(input): def get_model(args): if args.data_set == "cifar10": - underlying_train_reader = paddle.dataset.cifar.train10() - underlying_test_reader = paddle.dataset.cifar.test10() classdim = 10 if args.data_format == 'NCHW': data_shape = [3, 32, 32] else: data_shape = [32, 32, 3] else: - underlying_train_reader = paddle.dataset.flowers.train() - underlying_test_reader = paddle.dataset.flowers.test() classdim = 102 if args.data_format == 'NCHW': data_shape = [3, 224, 224] @@ -85,16 +81,6 @@ def get_model(args): fluid.layers.batch( data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) - elif args.use_py_reader_op: - data_file, feed_queue = fluid.layers.py_array_reader( - capacity=args.feed_queue_capacity, - shapes=[[-1] + data_shape, [-1, 1]], - lod_levels=[0, 0], - dtypes=["float32", "int64"]) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file, batch_size=args.batch_size)) - images, label = fluid.layers.read_file(data_file) else: images = fluid.layers.data( name='data', shape=data_shape, dtype='float32') @@ -123,14 +109,13 @@ def get_model(args): # data reader train_reader = paddle.batch( paddle.reader.shuffle( - underlying_train_reader, buf_size=5120), + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), batch_size=args.batch_size * args.gpus) test_reader = paddle.batch( - underlying_test_reader, batch_size=args.batch_size) + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else 
paddle.dataset.flowers.test(), + batch_size=args.batch_size) - if not args.use_reader_op and args.use_py_reader_op: - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc, \ - feed_queue, underlying_train_reader, underlying_test_reader, \ - (data_shape, [1]) - else: - return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc + return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index b6016e1d20..a39c8a0053 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -24,7 +24,7 @@ reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_o reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) -reader_library(create_py_array_reader_op SRCS create_py_array_reader_op.cc) +reader_library(create_py_reader_op SRCS create_py_reader_op.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 71684b1417..6befc868a7 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -38,6 +38,8 @@ class BlockingQueue { "The capacity of a reader::BlockingQueue must be greater than 0."); } + ~BlockingQueue() { Close(); } + bool Send(const T& elem) { std::unique_lock lock(mutex_); send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); @@ -88,24 +90,29 @@ class BlockingQueue { receive_cv_.notify_all(); } - bool IsClosed() { + bool IsClosed() const { std::lock_guard lock(mutex_); return closed_; } - size_t Cap() { + size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; } + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + private: size_t capacity_; bool closed_; std::deque queue_; - std::mutex mutex_; - std::condition_variable receive_cv_; - std::condition_variable send_cv_; + mutable std::mutex mutex_; + mutable std::condition_variable receive_cv_; + mutable std::condition_variable send_cv_; }; } // namespace reader } // namespace operators diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc new file mode 100644 index 0000000000..aac81d1813 --- /dev/null +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyReader : public framework::ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue) { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; + } + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Dequeue(&success); + if (!success) out->clear(); + } + + void ReInit() override {} + + private: + std::shared_ptr queue_; +}; + +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE( + queue_holder_var != nullptr, + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name); + auto* queue_holder = + queue_holder_var->template GetMutable(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; + +class CreatePyReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddInput("blocking_queue", + "Name of the `LoDTensorBlockingQueueHolder` variable"); + + AddComment(R"DOC( + Create PyReader to support LoDTensor data feeding in Python side. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp, + reader::CreatePyReaderOpMaker); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h new file mode 100644 index 0000000000..a2129f6af4 --- /dev/null +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -0,0 +1,107 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace reader { + +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + + private: + LoDTensorBlockingQueue(size_t capacity, + const std::vector& dims) + : dims_(dims) { + queue_.reset( + new BlockingQueue>(capacity)); + } + + public: + bool Enqueue(const std::vector& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_->Send(lod_tensor_vec); + } + + bool Enqueue(std::vector&& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_->Send(std::move(lod_tensor_vec)); + } + + std::vector Dequeue(bool* ok = nullptr) { + std::vector lod_tensor_vec; + bool success = queue_->Receive(&lod_tensor_vec); + if (ok != nullptr) *ok = success; + return lod_tensor_vec; + } + + inline size_t Cap() const { return queue_->Cap(); } + + inline size_t Size() const { return queue_->Size(); } + + inline void Close() { return queue_->Close(); } + + inline bool IsClosed() const { return queue_->IsClosed(); } + + private: + void CheckDims(const std::vector& lod_tensor_vec) { + PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), + "Expect input size is %d but found %s", dims_.size(), + lod_tensor_vec.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + const auto& in_dims = lod_tensor_vec[i].dims(); + const auto& expect_dims = + framework::slice_ddim(dims_[i], 1, dims_[i].size()); + PADDLE_ENFORCE(in_dims == expect_dims, + "Dims of the %d-th input tensor does not match", i); + } + } + + std::unique_ptr>> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE( + queue_ == nullptr, + "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + inline std::shared_ptr GetQueue() { return queue_; } + + inline const std::shared_ptr& GetQueue() const { + return queue_; + } + + private: + std::shared_ptr queue_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 472595f6a8..6963a0c101 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/reader/py_array_feed_queue.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -298,40 +298,38 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ReInit); - using PyArrayFeedQueue = ::paddle::operators::reader::PyArrayFeedQueue; - using PyArrayFeedQueueHolder = - ::paddle::operators::reader::PyArrayFeedQueueHolder; - using PyArray = ::paddle::operators::reader::PyArray; - py::class_(m, "PyArrayFeedQueue", "") - .def( - "enqueue", - [](PyArrayFeedQueue &self, const std::vector &py_array_vec) { - return self.Enqueue(py_array_vec); - }) + using LoDTensorBlockingQueue = + ::paddle::operators::reader::LoDTensorBlockingQueue; + using LoDTensorBlockingQueueHolder = + ::paddle::operators::reader::LoDTensorBlockingQueueHolder; + py::class_(m, "LoDTensorBlockingQueue", "") .def("enqueue", - [](PyArrayFeedQueue &self, + [](LoDTensorBlockingQueue &self, const std::vector &lod_tensor_vec) { + pybind11::gil_scoped_release release; return self.Enqueue(lod_tensor_vec); }) - .def("size", [](const PyArrayFeedQueue &self) { return self.Size(); }) - .def("capacity", [](const PyArrayFeedQueue &self) { return self.Cap(); }) - .def("close", [](PyArrayFeedQueue &self) { return self.Close(); }) + .def("size", + [](const LoDTensorBlockingQueue &self) { return self.Size(); }) + .def("capacity", + [](const LoDTensorBlockingQueue &self) { return self.Cap(); }) + .def("close", [](LoDTensorBlockingQueue &self) { return self.Close(); }) .def("is_closed", - [](const PyArrayFeedQueue &self) { return self.IsClosed(); }); + [](const LoDTensorBlockingQueue &self) { return self.IsClosed(); }); - m.def("init_py_array_feed_queue", + m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity, - const std::vector> &shapes, - const ::paddle::platform::Place &place) -> PyArrayFeedQueue * { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, place); - return holder->GetFeeder().get(); - }, + const std::vector> &shapes) + -> LoDTensorBlockingQueue * { + std::vector dims(shapes.size()); + std::transform(shapes.begin(), shapes.end(), dims.begin(), + [](const std::vector &shape) { + return make_ddim(shape); + }); + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, dims); + return holder->GetQueue().get(); + }, py::return_value_policy::reference); py::class_(m, "Scope", "") @@ -505,6 +503,7 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(prog, scope, block_id, create_local_scope, create_vars); }); + m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("init_devices", @@ -669,7 +668,12 @@ All parameter, weight, gradient are variables in Paddle. 
&ParallelExecutor::FeedTensorsIntoLocalScopes) .def("feed_and_split_tensor_into_local_scopes", &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", &ParallelExecutor::Run); + .def("run", [](ParallelExecutor &self, + const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + pybind11::gil_scoped_release release; + self.Run(fetch_tensors, fetched_var_name); + }); BindRecordIOWriter(&m); return m.ptr(); diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 811471c5fd..f3ab47c96b 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -24,8 +24,7 @@ from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', - 'double_buffer', 'random_data_generator', 'py_array_reader', 'Preprocessor', - 'load' + 'double_buffer', 'random_data_generator', 'Preprocessor', 'load' ] @@ -449,60 +448,6 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): return monkey_patch_reader_methods(main_prog_var) -def py_array_reader(capacity, - shapes, - lod_levels, - dtypes, - place=None, - for_parallel=True): - - if place is None: - place = core.CPUPlace() - - if not isinstance(place, core.Place): - new_place = core.Place() - new_place.set_place(place) - place = new_place - - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] - - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) - - feeder_name = unique_name('py_array_feed_queue') - var = global_scope().var(feeder_name) - - #feed_shapes = [shape[1:] for shape in shapes] - feed_queue = core.init_py_array_feed_queue(var, capacity, shapes, place) - - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var( - name=unique_name('create_py_array_reader')) - startup_blk.append_op( - type='create_py_array_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'ranks': ranks, - 'feeder_name': feeder_name - }) - - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - - main_prog_var = _copy_reader_var_(default_main_program().current_block(), - startup_var) - - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - - return monkey_patch_reader_methods(main_prog_var), feed_queue - - def open_files(filenames, shapes, lod_levels, From 67556e4aa4b5064fc699f9f10937dc21b2f19726 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 11:14:27 +0000 Subject: [PATCH 29/68] update blocking queue --- .../fluid/operators/reader/blocking_queue.h | 2 - .../operators/reader/py_blocking_queue.h | 125 ------------------ 2 files changed, 127 deletions(-) delete mode 100644 paddle/fluid/operators/reader/py_blocking_queue.h diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 6befc868a7..db8cf3b605 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -38,8 +38,6 @@ class BlockingQueue { "The capacity of a reader::BlockingQueue must be greater than 0."); } - ~BlockingQueue() { Close(); } - bool Send(const T& elem) { std::unique_lock lock(mutex_); send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); diff --git a/paddle/fluid/operators/reader/py_blocking_queue.h 
b/paddle/fluid/operators/reader/py_blocking_queue.h deleted file mode 100644 index 721767102b..0000000000 --- a/paddle/fluid/operators/reader/py_blocking_queue.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT -#include - -#include "Python.h" -#include "paddle/fluid/platform/enforce.h" -#include "pybind11/pybind11.h" - -namespace paddle { -namespace operators { -namespace reader { - -// PyBlockingQueue is designed for PyArrayFeedQueue -// PyBlockingQueue would release GIL of Python when -// the queue is full to avoid deadlock. -template -class PyBlockingQueue { - public: - explicit PyBlockingQueue(size_t capacity) - : capacity_(capacity), closed_(false) { - PADDLE_ENFORCE_GT( - capacity_, 0, - "The capacity of a reader::PyBlockingQueue must be greater than 0."); - } - - ~PyBlockingQueue() { Close(); } - - bool Send(const T& elem) { - std::unique_lock lock(mutex_); - receive_cv_.notify_one(); - if (queue_.size() >= capacity_ && (!closed_)) { - pybind11::gil_scoped_release release; - send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); - } - if (closed_) { - VLOG(5) - << "WARNING: Sending an element to a closed reader::BlockingQueue."; - return false; - } - PADDLE_ENFORCE_LT(queue_.size(), capacity_); - queue_.push_back(elem); - return true; - } - - bool Send(T&& elem) { - std::unique_lock lock(mutex_); - receive_cv_.notify_one(); - if (queue_.size() >= capacity_ && (!closed_)) { - pybind11::gil_scoped_release release; - send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); - } - if (closed_) { - VLOG(5) - << "WARNING: Sending an element to a closed reader::BlokcingQueue."; - return false; - } - PADDLE_ENFORCE_LT(queue_.size(), capacity_); - queue_.emplace_back(std::move(elem)); - return true; - } - - bool Receive(T* elem) { - std::unique_lock lock(mutex_); - send_cv_.notify_one(); - receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; }); - if (!queue_.empty()) { - PADDLE_ENFORCE_NOT_NULL(elem); - *elem = queue_.front(); - queue_.pop_front(); - return true; - } else { - PADDLE_ENFORCE(closed_); - return false; - } - } - - void Close() { - std::lock_guard lock(mutex_); - closed_ = true; - send_cv_.notify_all(); - receive_cv_.notify_all(); - } - - bool IsClosed() const { - std::lock_guard lock(mutex_); - return closed_; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - size_t capacity_; - bool closed_; - std::deque queue_; - - mutable std::mutex mutex_; - mutable std::condition_variable receive_cv_; - mutable std::condition_variable send_cv_; -}; -} // namespace reader -} // namespace operators -} // namespace paddle From e9ed62bfed91fa86906b805f6cf22c3c7e51490d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 26 Jun 2018 11:10:08 +0800 Subject: [PATCH 30/68] 
make framework.Parameter public --- python/paddle/fluid/framework.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db21b1f3c0..6c6f90a0cf 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -27,6 +27,7 @@ __all__ = [ 'Variable', 'Program', 'Operator', + 'Parameter', 'default_startup_program', 'default_main_program', 'program_guard', @@ -1905,7 +1906,7 @@ def program_guard(main_program, startup_program=None): def get_var(name, program=None): """ Get a variable by name from the global block of a program. - + Args: name(str): name of the variable program(Program|None): program object. From e09ac3df188a8a9ce68845b606ffe301f0eaed7b Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 25 Jun 2018 22:01:48 -0700 Subject: [PATCH 31/68] replace lod name with recur_seq_lens --- doc/fluid/design/concepts/lod_tensor.md | 20 ++++++ .../test_label_semantic_roles_newapi.py | 28 ++++----- .../test_machine_translation.py | 12 ++-- .../test_recommender_system_newapi.py | 12 ++-- .../test_understand_sentiment_conv.py | 10 +-- .../test_understand_sentiment_dynamic_rnn.py | 10 +-- .../test_understand_sentiment_stacked_lstm.py | 10 +-- .../word2vec/test_word2vec_new_api.py | 19 +++--- .../tests/book/notest_understand_sentiment.py | 16 +++-- .../tests/book/test_label_semantic_roles.py | 62 ++++++++++++++----- .../tests/book/test_machine_translation.py | 14 +++-- .../tests/book/test_recommender_system.py | 12 ++-- .../tests/book/test_rnn_encoder_decoder.py | 14 ++--- .../paddle/fluid/tests/book/test_word2vec.py | 21 ++++--- 14 files changed, 163 insertions(+), 97 deletions(-) diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md index d606d7a790..748488f6d5 100644 --- a/doc/fluid/design/concepts/lod_tensor.md +++ b/doc/fluid/design/concepts/lod_tensor.md @@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows: ## Slicing of LoD Tensors + When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch as the **-slice**. For example, the <2>-slice of above example is @@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is 10 12 || ``` + +## Length Representation vs Offset Representation + +The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult. +Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. +Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python: +```Python +# length representation of lod called recursive_sequence_lengths +recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]] +# Create a LoDTensor that has the above recursive_sequence_lengths info. +# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood. 
+tensor = fluid.LoDTensor(lod) + +# Set/Change the recursive_sequence_lengths info of LoDTensor +tensor.set_recursive_sequence_lengths([[3, 1, 2]]) +# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted +# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]] +new_recursive_seq_lens = tensor.recursive_sequence_lengths() +``` diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index 0ccb3a39e0..67aa21e8c5 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -206,35 +206,35 @@ def infer(use_cuda, inference_program, params_dirname): inferencer = fluid.Inferencer( inference_program, param_path=params_dirname, place=place) - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. 
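To make the relationship between the two LoD representations in the lod_tensor.md change above concrete, this small pure-Python sketch converts the length-based form exposed to users into the offset-based form kept in C++; the numbers are taken from the document's example:

```python
def lengths_to_offsets(recursive_seq_lens):
    """Convert recursive_sequence_lengths to the offset-based LoD form."""
    offsets = []
    for lens in recursive_seq_lens:
        level = [0]
        for length in lens:
            level.append(level[-1] + length)
        offsets.append(level)
    return offsets

print(lengths_to_offsets([[3, 1, 2], [2, 2, 1, 3, 1, 2]]))
# [[0, 3, 4, 6], [0, 2, 4, 5, 8, 9, 11]]
```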
+ recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1) mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1) results = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index c4b37df3a0..31d756503a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -215,11 +215,13 @@ def decode_main(use_cuda, is_sparse): [1. 
for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + init_ids = fluid.create_lod_tensor(init_ids_data, + init_init_recursive_seq_lens, place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( @@ -243,7 +245,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print result_ids.recursive_sequence_lengths() break diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index 090c11ce1e..c860f16417 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -209,13 +209,15 @@ def infer(use_cuda, inference_program, params_dirname): inference_program, param_path=params_dirname, place=place) # Use the first data from paddle.dataset.movielens.test() as input. - # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor, - # where `data` is a list of sequences of index numbers, `lod` is - # the level of detail (lod) info associated with `data`. + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail + # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, lod = [[3, 2]] contains one level of detail info, - # indicating that `data` consists of two sequences of length 3 and 2. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. user_id = fluid.create_lod_tensor([[1]], [[1]], place) gender_id = fluid.create_lod_tensor([[1]], [[1]], place) age_id = fluid.create_lod_tensor([[0]], [[1]], place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 9b61f7a00c..1668ae83d3 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -128,17 +128,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. 
Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index aa7c567b4d..8da89d82cb 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -143,17 +143,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 8c74be0f08..74faa2e8aa 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -138,17 +138,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. 
Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index ba44f72d9b..02e65cf56c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -124,21 +124,22 @@ def infer(use_cuda, inference_program, params_dirname=None): # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - lod = [[1]] + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of + # one word on this level. + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[1]] base_shape = [1] # The range of random integers is [low, high] first_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) second_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) third_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) fourth_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) result = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 5d9a47c9ba..1df7b99aad 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -238,17 +238,21 @@ def infer(word_dict, use_cuda, save_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. 
- # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -257,7 +261,7 @@ def infer(word_dict, use_cuda, save_dirname=None): feed={feed_target_names[0]: tensor_words}, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) print("Inference results: ", np_data) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index e214ced0b5..d489feae9c 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -247,35 +247,67 @@ def infer(use_cuda, save_dirname=None): [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. 
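The rewritten comments in these inference tests all describe the same construction; a short sketch of what it yields (the integer values are random, only the shape and LoD are deterministic, and `high=99` is an arbitrary illustrative bound):

```python
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
recursive_seq_lens = [[3, 4, 2]]   # three "sentences" of 3, 4 and 2 words
base_shape = [1]                   # each word is a single int64 index

words = fluid.create_random_int_lodtensor(
    recursive_seq_lens, base_shape, place, low=0, high=99)

print(words.recursive_sequence_lengths())  # [[3, 4, 2]]
print(np.array(words).shape)               # (9, 1): 3 + 4 + 2 words in total
```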
+ recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=pred_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=pred_dict_len - 1) ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=mark_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=mark_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -301,7 +333,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 372d6ec822..a68eae5bcb 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse): pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) - # expand the lod of pre_state to be the same with pre_score + # expand the recursive_sequence_lengths of pre_state to be the same with pre_score pre_state_expanded = pd.sequence_expand(pre_state, pre_score) pre_ids_emb = pd.embedding( @@ -238,11 +238,13 @@ def decode_main(use_cuda, is_sparse): [1. 
for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, + place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( @@ -266,7 +268,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print result_ids.recursive_sequence_lengths() break diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 937d8dd5b0..6548766ef5 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -260,13 +260,15 @@ def infer(use_cuda, save_dirname=None): # Use the first data from paddle.dataset.movielens.test() as input assert feed_target_names[0] == "user_id" - # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor - # where `data` is a list of sequences of index numbers, `lod` is - # the level of detail (lod) info associated with `data`. + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail + # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, lod = [[3, 2]] contains one level of detail info, - # indicating that `data` consists of two sequences of length 3 and 2. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. user_id = fluid.create_lod_tensor([[1]], [[1]], place) assert feed_target_names[1] == "gender_id" diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 7ada57def6..4672826241 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -216,19 +216,19 @@ def infer(use_cuda, save_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[4, 6]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[4, 6]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for two sentences of # length 4 and 6, respectively. - # Note that lod info should be a list of lists. 
- lod = [[4, 6]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[4, 6]] base_shape = [1] # The range of random integers is [low, high] word_data = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=1) + recursive_seq_lens, base_shape, place, low=0, high=1) trg_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=1) + recursive_seq_lens, base_shape, place, low=0, high=1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -241,7 +241,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference shape: ", np_data.shape) print("Inference results: ", np_data) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 75bed06bd7..49bd72c7a5 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -168,21 +168,22 @@ def infer(use_cuda, save_dirname=None): # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - lod = [[1]] + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of + # one word on this level. + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[1]] base_shape = [1] # The range of random integers is [low, high] first_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) second_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) third_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) fourth_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) assert feed_target_names[0] == 'firstw' assert feed_target_names[1] == 'secondw' @@ -200,7 +201,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) From 3d69a82b837e9dbfb907c36cd4a029e10d682db8 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 26 Jun 2018 16:23:31 +0800 Subject: [PATCH 32/68] fix dist train broadcasting bug --- .../framework/details/multi_devices_graph_builder.cc | 3 +++ paddle/fluid/framework/details/ssa_graph_builder.h | 2 +- paddle/fluid/framework/details/ssa_graph_checker.h | 6 ++++++ paddle/fluid/framework/details/ssa_graph_printer.h | 5 +++++ paddle/fluid/framework/parallel_executor.cc | 12 +++++++----- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 74f3687c0d..8765af52b0 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -483,6 +483,9 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, } } else if (op.Type() == "concat") { op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } } else { PADDLE_ENFORCE( "the distribute training related op should be in [split_byref, " diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 9eb23c4626..18612c3c1b 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -30,7 +30,7 @@ class SSAGraphBuilder { SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; - virtual int GetVarDeviceID(const std::string &var_name) const { return -1; } + virtual int GetVarDeviceID(const std::string &var_name) const = 0; DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 304b221e7e..331aa9d2b5 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -16,6 +16,8 @@ #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { return graph; } + int GetVarDeviceID(const std::string& var_name) const override { + return 
builder_->GetVarDeviceID(var_name); + } + bool IsValidGraph(const SSAGraph* graph) const; private: diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index b4c9001378..09b0333ef2 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include "paddle/fluid/framework/details/ssa_graph_builder.h" namespace paddle { @@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { return graph; } + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + private: std::unique_ptr printer_; std::unique_ptr builder_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a6788cb6d5..d06e4c89ea 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -133,17 +133,18 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { - // the the initialize bcast, all vars would be bcast from device(0), otherwise + // the the initializing bcast, all vars would be bcast from device(0), + // otherwise // bcast from the specified device. - bool initialize = builder_.get() == nullptr ? true : false; + bool initializing = builder_.get() == nullptr ? false : true; for (auto &var : vars) { int var_dev_id = builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); - if (!initialize && var_dev_id == -1) continue; + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; - if (initialize) { + if (initializing) { main_var = member_->local_scopes_[0]->FindVar(var); } else { main_var = member_->local_scopes_[var_dev_id]->FindVar(var); @@ -164,7 +165,8 @@ void ParallelExecutor::BCastParamsToGPUs( auto place = member_->places_[i]; void *buffer; - if ((initialize && i == 0) || (!initialize && i == var_dev_id)) { + if ((initializing && i == 0) || + (!initializing && i == static_cast(var_dev_id))) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; From b0f98849fdc37b3c7ccb78815143db9c86a1d845 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 26 Jun 2018 16:58:28 +0800 Subject: [PATCH 33/68] inference doc fix grammer (#11718) --- paddle/contrib/inference/high_level_api.md | 27 +++++++++++----------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/contrib/inference/high_level_api.md index 563b696143..eb92885052 100644 --- a/paddle/contrib/inference/high_level_api.md +++ b/paddle/contrib/inference/high_level_api.md @@ -1,10 +1,10 @@ # Inference High-level APIs -This document describes the high-level inference APIs one can use to easily deploy a Paddle model for an application. +This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly. -The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed. +The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment. ## PaddleTensor -We provide the `PaddleTensor` data structure is to give a general tensor interface. 
+We provide the `PaddleTensor` data structure to give a general tensor interface. The definition is @@ -17,18 +17,19 @@ struct PaddleTensor { }; ``` -The data is stored in a continuous memory `PaddleBuf`, and tensor's data type is specified by a `PaddleDType`. -The `name` field is used to specify the name of input variable, -that is important when there are multiple inputs and need to distiuish which variable to set. +The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type. +The `name` field is used to specify the name of an input variable, +that is important when there are multiple inputs and need to distinguish which variable to set. ## engine -The inference APIs has two different underlying implementation, currently there are two valid engines: +The inference APIs has two different underlying engines - the native engine, which is consists of the native operators and framework, -- the Anakin engine, which is a Anakin library embeded. +- the Anakin engine, which has an Anakin library embedded. The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, -but the Anakin engine can only take the Anakin model as input(user need to manully transform the format first) and currently not all Paddle models are supported. +the Anakin engine is faster for some model, +but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported. ```c++ enum class PaddleEngineKind { @@ -38,10 +39,10 @@ enum class PaddleEngineKind { ``` ## PaddlePredictor and how to create one -The main interface is `PaddlePredictor`, there are following methods +The main interface is `PaddlePredictor,` there are following methods - `bool Run(const std::vector& inputs, std::vector* output_data)` - - take inputs and output `output_data` + - take inputs and output `output_data.` - `Clone` to clone a predictor from an existing one, with model parameter shared. There is a factory method to help create a predictor, and the user takes the ownership of this object. @@ -51,9 +52,9 @@ template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); ``` -By specifying the engine kind and config, one can get an specific implementation. +By specifying the engine kind and config, one can get a specific implementation. ## Reference - [paddle_inference_api.h](./paddle_inference_api.h) -- [demos](./demo) +- [some demos](./demo) From 8e48c77b549f6b18389c32d96788026b2abbec31 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 26 Jun 2018 17:21:01 +0800 Subject: [PATCH 34/68] wip --- paddle/fluid/framework/parallel_executor.cc | 3 ++- .../paddle/fluid/transpiler/distribute_transpiler.py | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d06e4c89ea..dfebf36d04 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -136,7 +136,7 @@ void ParallelExecutor::BCastParamsToGPUs( // the the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. - bool initializing = builder_.get() == nullptr ? false : true; + bool initializing = builder_.get() == nullptr ? 
true : false; for (auto &var : vars) { int var_dev_id = @@ -153,6 +153,7 @@ void ParallelExecutor::BCastParamsToGPUs( if (main_var == nullptr || !main_var->IsType()) { continue; } + VLOG(3) << "run broadcast " << var << " " << var_dev_id; auto &main_tensor = main_var->Get(); auto &dims = main_tensor.dims(); diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index bb61f82a9c..930cdabf11 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -302,7 +302,6 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program delete_ops(self.origin_program.global_block(), self.optimize_ops) - # FIXME(typhoonzero): serialize once will fix error occurs when clone. self.origin_program.__str__() return self.origin_program @@ -383,11 +382,12 @@ class DistributeTranspiler(object): if self._is_adam_connected_op(op): global_ops.append(op) - def __append_optimize_op__(op, block, grad_to_block_id, merged_var): + def __append_optimize_op__(op, block, grad_to_block_id, merged_var, + lr_ops): if self._is_optimizer_op(op): self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self.origin_program, merged_var) - else: + elif op not in lr_ops: self._append_pserver_non_opt_ops(block, op) def __op_have_grad_input__(op): @@ -452,7 +452,7 @@ class DistributeTranspiler(object): # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: __append_optimize_op__(op, per_opt_block, grad_to_block_id, - merged_var) + merged_var, lr_ops) # append global ops if global_ops: @@ -461,7 +461,7 @@ class DistributeTranspiler(object): optimize_blocks.append(opt_state_block) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, - grad_to_block_id, None) + grad_to_block_id, None, lr_ops) # process distributed lookup_table prefetch_var_name_to_block_id = [] From a2e43ae5ce691fef18c0e6600dde7c8c4e5d1c27 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Mon, 25 Jun 2018 17:16:00 +0800 Subject: [PATCH 35/68] fix trainer nccl2 env --- python/paddle/fluid/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 45ab889bea..fc4d7cba71 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -315,7 +315,7 @@ class Trainer(object): for ip in worker_ips.split(","): worker_endpoints.append(':'.join([ip, port])) self.num_trainers = len(worker_endpoints) - current_endpoint = os.getenv("POD_IP") + ":" + port + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port worker_endpoints.remove(current_endpoint) # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id # in ParallelExecutor to start From c6e36e773812029077187190ec6c3d3b67c576bc Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 26 Jun 2018 20:02:19 +0800 Subject: [PATCH 36/68] Change return_numpy [ParallelExecutor] default value (#11713) * change return_numpy[PE] default value * Remove convert to numpy in unit test --- python/paddle/fluid/parallel_executor.py | 4 ++-- .../fluid/tests/unittests/parallel_executor_test_base.py | 3 --- .../fluid/tests/unittests/test_parallel_executor_crf.py | 5 ++--- .../tests/unittests/test_parallel_executor_fetch_feed.py | 2 +- .../unittests/test_parallel_executor_test_while_train.py | 3 +-- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git 
a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index bb7b7d82f0..6baf648198 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -160,7 +160,7 @@ class ParallelExecutor(object): build_strategy, num_trainers, trainer_id) self.scope = scope - def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=False): + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ Run a parallel executor with fetch_list. @@ -197,7 +197,7 @@ class ParallelExecutor(object): feed_dict: Alias for feed parameter, for backward compatibility. This parameter has been deprecated. Default None. return_numpy(bool): Whether converts the fetched tensor to numpy. - Default: False. + Default: True. Returns: List: The fetched result list. diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 829c5a1a5f..21f2037ad4 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -81,7 +81,6 @@ class TestParallelExecutorBase(unittest.TestCase): begin = time.time() first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) - first_loss = np.array(first_loss) for i in xrange(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) @@ -94,8 +93,6 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) - last_loss = np.array(last_loss) - print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 1ea7a6a568..63fb58c692 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -169,9 +169,8 @@ class TestCRFModel(unittest.TestCase): data = train_data() for i in xrange(10): cur_batch = next(data) - print map(np.array, - pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name]))[0] + print pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])[0] @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_all_reduce(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 3b18072c7b..1f5d2f1677 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -130,7 +130,7 @@ class TestFeedParallel(unittest.TestCase): use_cuda=use_cuda, loss_name=loss.name, main_program=main) for batch_id, data in enumerate(reader()): - loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + loss_np = pe.run(feed=data, fetch_list=[loss.name])[0] print batch_id, loss_np if batch_id == 2: break diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 31ba8c1d60..2527939444 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -70,10 +70,9 @@ class 
ParallelExecutorTestingDuringTraining(unittest.TestCase): for i in xrange(5): test_loss, = test_exe.run([loss.name], feed=feed_dict) - test_loss = np.array(test_loss) train_loss, = train_exe.run([loss.name], feed=feed_dict) - train_loss = np.array(train_loss) + self.assertTrue( np.allclose( train_loss, test_loss, atol=1e-8), From 6f0107126a21b2e5e5df3be131531eeba33d7ef3 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 26 Jun 2018 20:16:24 +0800 Subject: [PATCH 37/68] fix broadcast bug --- paddle/fluid/framework/parallel_executor.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dfebf36d04..485d89aa56 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -153,7 +153,6 @@ void ParallelExecutor::BCastParamsToGPUs( if (main_var == nullptr || !main_var->IsType()) { continue; } - VLOG(3) << "run broadcast " << var << " " << var_dev_id; auto &main_tensor = main_var->Get(); auto &dims = main_tensor.dims(); @@ -184,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + if (initializing) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } else { + if (static_cast(var_dev_id)) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, + var_dev_id, nccl_ctx.comm_, + nccl_ctx.stream()); + } + } } member_->nccl_ctxs_->WaitAll(); } From fa18f2a1a9099316dc173a372e4a963bbfb7a5da Mon Sep 17 00:00:00 2001 From: captainwoon Date: Tue, 26 Jun 2018 20:18:15 +0800 Subject: [PATCH 38/68] create about_us.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加【关于我们】文案 --- doc/about/about_us.rst | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 doc/about/about_us.rst diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst new file mode 100644 index 0000000000..4dec448055 --- /dev/null +++ b/doc/about/about_us.rst @@ -0,0 +1,44 @@ +========= +关于我们 +========= + +什么是PaddlePaddle +-------------------- + +PaddlePaddle是百度提供的开源深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 +项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 +框架本身具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 + +PaddlePaddle的技术特色 +------------------------- + +新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在兼具性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 +对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 +提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 + +提供基于PaddlePaddle的教育体系 +-------------------------------- + +深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 +深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 +线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 + + +提供基于PaddlePaddle的AI服务 +------------------------------ + +EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 +AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 +深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 + +你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 +----------------------------------------------------------- + 
+学习/使用问题:可以在PaddlePaddle开源社区(https://github.com/PaddlePaddle/Paddle/issues),以及PaddlePaddle中文社区(http://ai.baidu.com/forum/topic/list/168)向我们反馈 +对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com + +我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 + + + +PaddlePaddle团队 From 8d04d0e2a372e2914530f852b1dafa8f1adc093d Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 26 Jun 2018 20:30:16 +0800 Subject: [PATCH 39/68] update --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 485d89aa56..b53a6f43fb 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -166,7 +166,7 @@ void ParallelExecutor::BCastParamsToGPUs( void *buffer; if ((initializing && i == 0) || - (!initializing && i == static_cast(var_dev_id))) { + (!initializing && static_cast(i) == var_dev_id)) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -187,7 +187,7 @@ void ParallelExecutor::BCastParamsToGPUs( platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, nccl_ctx.comm_, nccl_ctx.stream()); } else { - if (static_cast(var_dev_id)) { + if (var_dev_id >= 0) { platform::dynload::ncclBcast(buffers[i], numel, data_type, var_dev_id, nccl_ctx.comm_, nccl_ctx.stream()); From d4d946db5a58d1439d71eb17e69fc79a1d869b32 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 25 Jun 2018 11:18:22 +0000 Subject: [PATCH 40/68] update blocking queue --- .../operators/reader/create_py_reader_op.cc | 9 +++-- .../reader/lod_tensor_blocking_queue.h | 34 ++++++++----------- paddle/fluid/pybind/pybind.cc | 15 ++++---- 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index aac81d1813..36587360f7 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -28,7 +28,7 @@ class PyReader : public framework::ReaderBase { void ReadNext(std::vector* out) override { bool success; - *out = queue_->Dequeue(&success); + *out = queue_->Pop(&success); if (!success) out->clear(); } @@ -45,6 +45,10 @@ class CreatePyReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + const std::string& queue_name = Input("blocking_queue"); auto* queue_holder_var = scope.FindVar(queue_name); PADDLE_ENFORCE( @@ -53,8 +57,7 @@ class CreatePyReaderOp : public framework::OperatorBase { queue_name); auto* queue_holder = queue_holder_var->template GetMutable(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); + out->Reset(new PyReader(queue_holder->GetQueue())); } }; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index a2129f6af4..30d962ba10 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -34,36 +34,33 @@ class LoDTensorBlockingQueue { private: LoDTensorBlockingQueue(size_t capacity, const std::vector& dims) - : dims_(dims) { - queue_.reset( - new BlockingQueue>(capacity)); - } + : queue_(capacity), dims_(dims) {} public: - bool Enqueue(const std::vector& lod_tensor_vec) 
{ + bool Push(const std::vector& lod_tensor_vec) { CheckDims(lod_tensor_vec); - return queue_->Send(lod_tensor_vec); + return queue_.Send(lod_tensor_vec); } - bool Enqueue(std::vector&& lod_tensor_vec) { + bool Push(std::vector&& lod_tensor_vec) { CheckDims(lod_tensor_vec); - return queue_->Send(std::move(lod_tensor_vec)); + return queue_.Send(std::move(lod_tensor_vec)); } - std::vector Dequeue(bool* ok = nullptr) { + std::vector Pop(bool* ok = nullptr) { std::vector lod_tensor_vec; - bool success = queue_->Receive(&lod_tensor_vec); + bool success = queue_.Receive(&lod_tensor_vec); if (ok != nullptr) *ok = success; return lod_tensor_vec; } - inline size_t Cap() const { return queue_->Cap(); } + inline size_t Cap() const { return queue_.Cap(); } - inline size_t Size() const { return queue_->Size(); } + inline size_t Size() const { return queue_.Size(); } - inline void Close() { return queue_->Close(); } + inline void Close() { return queue_.Close(); } - inline bool IsClosed() const { return queue_->IsClosed(); } + inline bool IsClosed() const { return queue_.IsClosed(); } private: void CheckDims(const std::vector& lod_tensor_vec) { @@ -71,15 +68,16 @@ class LoDTensorBlockingQueue { "Expect input size is %d but found %s", dims_.size(), lod_tensor_vec.size()); for (size_t i = 0; i < dims_.size(); ++i) { - const auto& in_dims = lod_tensor_vec[i].dims(); + const auto& in_dims = framework::slice_ddim( + lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size()); const auto& expect_dims = framework::slice_ddim(dims_[i], 1, dims_[i].size()); PADDLE_ENFORCE(in_dims == expect_dims, - "Dims of the %d-th input tensor does not match", i); + "Dims of the %d-th input tensor do not match", i); } } - std::unique_ptr>> queue_; + BlockingQueue> queue_; std::vector dims_; }; @@ -92,8 +90,6 @@ class LoDTensorBlockingQueueHolder { queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); } - inline std::shared_ptr GetQueue() { return queue_; } - inline const std::shared_ptr& GetQueue() const { return queue_; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6963a0c101..36d0809968 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -303,19 +303,16 @@ All parameter, weight, gradient are variables in Paddle. 
using LoDTensorBlockingQueueHolder = ::paddle::operators::reader::LoDTensorBlockingQueueHolder; py::class_(m, "LoDTensorBlockingQueue", "") - .def("enqueue", + .def("push", [](LoDTensorBlockingQueue &self, const std::vector &lod_tensor_vec) { pybind11::gil_scoped_release release; - return self.Enqueue(lod_tensor_vec); + return self.Push(lod_tensor_vec); }) - .def("size", - [](const LoDTensorBlockingQueue &self) { return self.Size(); }) - .def("capacity", - [](const LoDTensorBlockingQueue &self) { return self.Cap(); }) - .def("close", [](LoDTensorBlockingQueue &self) { return self.Close(); }) - .def("is_closed", - [](const LoDTensorBlockingQueue &self) { return self.IsClosed(); }); + .def("size", &LoDTensorBlockingQueue::Size) + .def("capacity", &LoDTensorBlockingQueue::Cap) + .def("close", &LoDTensorBlockingQueue::Close) + .def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity, From 480d848eb989d4decd68cc76cf2e34c27dee5763 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 26 Jun 2018 22:56:21 +0800 Subject: [PATCH 41/68] optimize doc for host_memory_profiling_cn --- doc/fluid/howto/optimization/host_memory_profiling_cn.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md index 9b55a66ded..7fb0883dd9 100644 --- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md +++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md @@ -1,4 +1,4 @@ -## 堆内存分析和优化 +# 堆内存分析和优化 计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 @@ -20,11 +20,11 @@ Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/P 对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 -## 使用流程 -#### 环境 +## 环境 + 本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 -#### 使用流程 +## 使用流程 - 安装google-perftools From d15b2e02c8a1fbfdac4b8a1ee6b7f9809eb0abdd Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 27 Jun 2018 02:25:42 +0800 Subject: [PATCH 42/68] Fix copying empty tensor in beam_search_decode_op --- paddle/fluid/operators/beam_search_decode_op.cc | 17 +++++++++++------ paddle/fluid/operators/beam_search_decode_op.h | 3 +-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 57496dd2bb..10d678111f 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -42,9 +42,11 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_id : step_ids_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_id.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } out.set_lod(step_id.lod()); step_ids_.push_back(out); @@ -58,9 +60,12 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_score : step_scores_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_score.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, + &out); + 
dev_ctx->Wait(); + } out.set_lod(step_score.lod()); step_scores_.push_back(out); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index bb5936a095..6aefc5446f 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -151,8 +151,7 @@ void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; std::vector> sentence_vector_list( src_num, SentenceVector(beam_size_)); - std::vector> prefix_idx_vector_list( - src_num, std::vector()); + std::vector> prefix_idx_vector_list(src_num); for (int step_id = step_num - 1; step_id >= 0; --step_id) { auto& cur_ids = step_ids.at(step_id); auto& cur_scores = step_scores.at(step_id); From 7c8b49d3cee0d5c0feda2f3f5fb62b3e3729cd87 Mon Sep 17 00:00:00 2001 From: Shan Yi <35982308+shanyi15@users.noreply.github.com> Date: Wed, 27 Jun 2018 10:22:41 +0800 Subject: [PATCH 43/68] fix typo --- doc/about/about_us.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst index 4dec448055..62469dc9e7 100644 --- a/doc/about/about_us.rst +++ b/doc/about/about_us.rst @@ -34,8 +34,9 @@ AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快 你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 ----------------------------------------------------------- -学习/使用问题:可以在PaddlePaddle开源社区(https://github.com/PaddlePaddle/Paddle/issues),以及PaddlePaddle中文社区(http://ai.baidu.com/forum/topic/list/168)向我们反馈 -对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com +* 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 + +* 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com 我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 From 52993878a4e316d2cd4d782304bed017b6dc1c30 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 27 Jun 2018 11:12:48 +0800 Subject: [PATCH 44/68] add feature/vis infer demos (#11708) --- paddle/contrib/inference/demo/CMakeLists.txt | 40 +++++ paddle/contrib/inference/demo/README.md | 36 +++++ .../inference/demo/simple_on_word2vec.cc | 1 + paddle/contrib/inference/demo/utils.h | 68 ++++++++ paddle/contrib/inference/demo/vis_demo.cc | 149 ++++++++++++++++++ .../contrib/inference/paddle_inference_api.cc | 15 +- .../contrib/inference/paddle_inference_api.h | 5 +- 7 files changed, 312 insertions(+), 2 deletions(-) create mode 100644 paddle/contrib/inference/demo/README.md create mode 100644 paddle/contrib/inference/demo/utils.h create mode 100644 paddle/contrib/inference/demo/vis_demo.cc diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt index 7b0fa77ad1..566c7d1a07 100644 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ b/paddle/contrib/inference/demo/CMakeLists.txt @@ -14,3 +14,43 @@ # inference_api_test(simple_on_word2vec ARGS test_word2vec) + +set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") +set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) + +function(inference_download_test_demo TARGET) + if (NOT WITH_TESTING) + return() + endif() + set(options "") + set(oneValueArgs URL) + set(multiValueArgs SRCS) + cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}") + message(STATUS "inference demo ${test_dir}") + + if(NOT EXISTS "${test_dir}") + message(STATUS "Download ${TARGET} model from ${tests_URL}") + execute_process(COMMAND bash -c "mkdir -p ${test_dir}") + execute_process(COMMAND bash -c "cd 
${test_dir}; wget -q ${tests_URL}") + execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz") + endif() + + cc_test(${TARGET} SRCS "${tests_SRCS}" + DEPS paddle_inference_api paddle_fluid + ARGS --data=${test_dir}/data.txt + --modeldir=${test_dir}/model + --refer=${test_dir}/result.txt) +endfunction() + +# disable mobilenet test +#inference_download_test_demo(mobilenet_inference_demo +# SRCS vis_demo.cc +# URL ${URL_ROOT}mobilenet.tar.gz) +inference_download_test_demo(se_resnext50_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}se_resnext50.tar.gz) +inference_download_test_demo(ocr_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}ocr.tar.gz) diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md new file mode 100644 index 0000000000..f1d2566602 --- /dev/null +++ b/paddle/contrib/inference/demo/README.md @@ -0,0 +1,36 @@ +# Infernce Demos + +Input data format: + +- Each line contains a single record +- Each record's format is + +``` +\t +``` + +Follow the C++ codes in `vis_demo.cc`. + +## MobileNet + +To execute the demo, simply run + +```sh +./mobilenet_inference_demo --modeldir --data +``` + +## SE-ResNeXt-50 + +To execute the demo, simply run + +```sh +./se_resnext50_inference_demo --modeldir --data +``` + +## OCR + +To execute the demo, simply run + +```sh +./ocr_inference_demo --modeldir --data +``` diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 2a4bfc8706..c253014642 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include "paddle/contrib/inference/paddle_inference_api.h" + namespace paddle { namespace demo { diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/contrib/inference/demo/utils.h new file mode 100644 index 0000000000..b5330d8d9d --- /dev/null +++ b/paddle/contrib/inference/demo/utils.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { +namespace demo { + +static void split(const std::string& str, + char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +/* + * Get a summary of a PaddleTensor content. 
+ */ +static std::string SummaryTensor(const PaddleTensor& tensor) { + std::stringstream ss; + int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype); + + ss << "data[:10]\t"; + switch (tensor.dtype) { + case PaddleDType::INT64: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + case PaddleDType::FLOAT32: + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + return ss.str(); +} + +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/contrib/inference/demo/vis_demo.cc new file mode 100644 index 0000000000..45575f9a86 --- /dev/null +++ b/paddle/contrib/inference/demo/vis_demo.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo for mobilenet, se-resnext50 and ocr. + */ + +#include +#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. +#include +#include +#include +#include "paddle/contrib/inference/demo/utils.h" +#include "paddle/contrib/inference/paddle_inference_api.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif + +namespace paddle { +namespace demo { + +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, + "", + "path of data; each line is a record, format is " + "'\t data; + std::vector shape; +}; + +void split(const std::string& str, char sep, std::vector* pieces); + +Record ProcessALine(const std::string& line) { + LOG(INFO) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + LOG(INFO) << "data size " << record.data.size(); + LOG(INFO) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + LOG(INFO) << "predictor output numel " << numel; + LOG(INFO) << "reference output numel " << refer.data.size(); + EXPECT_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + EXPECT_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + EXPECT_NEAR( + 
static_cast(output.data.data())[i], refer.data[i], 1e-5); + } + break; + } +} + +/* + * Use the native fluid engine to inference the demo. + */ +void Main(bool use_gpu) { + NativeConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = use_gpu; + config.device = 0; +#ifdef PADDLE_WITH_CUDA + config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; +#endif + + LOG(INFO) << "init predictor"; + auto predictor = + CreatePaddlePredictor(config); + + LOG(INFO) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input{ + .name = "xx", + .shape = record.shape, + .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), + .dtype = PaddleDType::FLOAT32}; + + LOG(INFO) << "run executor"; + std::vector output; + predictor->Run({input}, &output); + + LOG(INFO) << "output.size " << output.size(); + auto& tensor = output.front(); + LOG(INFO) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); } +#ifdef PADDLE_WITH_CUDA +TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); } +#endif +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc index dc2842ae0e..ea46b3006f 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ b/paddle/contrib/inference/paddle_inference_api.cc @@ -16,6 +16,19 @@ limitations under the License. */ namespace paddle { +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + default: + // + assert(false); + return -1; + } +} + PaddleBuf::PaddleBuf(PaddleBuf&& other) : data_(other.data_), length_(other.length_), @@ -62,4 +75,4 @@ void PaddleBuf::Free() { } } -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 38e3cc2141..238d8c772e 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -15,7 +15,7 @@ limitations under the License. */ /* * This file contains the definition of a simple Inference API for Paddle. * - * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we * might release another API. */ @@ -140,4 +140,7 @@ struct AnakinConfig : public PaddlePredictor::Config { // Similarly, each engine kind should map to a unique predictor implementation. 
template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + } // namespace paddle From 99bd7e38377a9f50b2a4cac237797dc0615e6bfe Mon Sep 17 00:00:00 2001 From: Shan Yi <35982308+shanyi15@users.noreply.github.com> Date: Wed, 27 Jun 2018 11:39:14 +0800 Subject: [PATCH 45/68] fix list --- doc/about/about_us.rst | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst index 62469dc9e7..f1b1a12216 100644 --- a/doc/about/about_us.rst +++ b/doc/about/about_us.rst @@ -5,38 +5,46 @@ 什么是PaddlePaddle -------------------- -PaddlePaddle是百度提供的开源深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 -项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 -框架本身具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 +- PaddlePaddle是百度提供的开源深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 + +- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 + +- 框架本身具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 PaddlePaddle的技术特色 ------------------------- -新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在兼具性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 -对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 -提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 +- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在兼具性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 + +- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 + +- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 提供基于PaddlePaddle的教育体系 -------------------------------- -深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 -深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 -线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 +- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 + +- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 + +- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 提供基于PaddlePaddle的AI服务 ------------------------------ -EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 -AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 -深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 +- EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 + +- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 + +- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 ----------------------------------------------------------- -* 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 +- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 -* 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com +- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com 我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 From 429a140ce1e252e9ca87e53340dbb237393d1595 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 27 Jun 2018 03:49:04 +0000 Subject: [PATCH 46/68] add python_data_feeding.md --- .../design/concepts/python_data_feeding.md | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 doc/fluid/design/concepts/python_data_feeding.md diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md new file mode 100644 index 0000000000..dffee8e02b --- /dev/null +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -0,0 +1,130 @@ +# Python Data Feeding + +In the former implementation of Paddle Fluid, there are two ways to feed data: + +- Use `reader_op` in backend C++ side. 
This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. + +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. + +In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue. + + +## Design of LoDTensorBlockingQueue +`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`. + +```C++ +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + private: + // `LoDTensorBlockingQueue` can only be constructed by + // `LoDTensorBlockingQueueHolder::InitOnce()` + LoDTensorBlockingQueue(size_t capacity, const std::vector& dims); + + public: + size_t Size() const { return queue_.Size(); } // Get the current size of the queue + + size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue + + void Close() { return queue_.Close(); } + + bool IsClosed() const { return queue_.IsClosed(); } + + // Block if Size() == Cap() + // Return false only when queue_.IsClosed() == true + bool Push(const std::vector &lod_tensor_vec); + + // Block if Size() == 0. + // *Success == false when queue_.IsClosed() == true + std::vector Pop(bool *success = nullptr); + + private: + // Use reader::BlockingQueue as the inner data structure + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + // Call the constructor of `LoDTensorBlockingQueue` to create queue_ + // `InitOnce` can only called once, otherwise an exception would raise + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE(queue_ == nullptr); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + const std::shared_ptr& GetQueue() const { return queue_; } + + private: + std::shared_ptr queue_; +}; +``` + +There are some major things that must be concerned: +- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data. +- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. 
+- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input. + + +## Release of the GIL in pybind +`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel. + + +## Design of PyReader +`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object. +```C++ +class PyReader : public ReaderBase { + public: + explicit PyReader(const std::shared_ptr& queue); + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override { return; } + + private: + std::shared_ptr queue_; +}; +``` + + +## Design of CreatePyReaderOp +`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable. +```C++ +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE(queue_holder_var != nullptr); + auto* queue_holder = queue_holder_var + ->template GetMutable(); + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; +``` + +## Design of Python codes +The design of Python codes are as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned. 
+```Python +def py_reader(capacity, shapes): + queue_name = unique_name.generate("lod_tensor_blocking_queue") + var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue + out = create_var() + create_py_reader_op_with_queue_name( + inputs={'blocking_queue': queue_name}, + outputs={'Out':[out]}) + return out, feed_queue +``` From 29bdeb1e587092a4bb9560e31af5b0596ce12a3e Mon Sep 17 00:00:00 2001 From: captainwoon Date: Wed, 27 Jun 2018 11:59:52 +0800 Subject: [PATCH 47/68] =?UTF-8?q?=E6=8C=89review=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.PaddlePaddle是百度提供的开源深度学习框架->PaddlePaddle是百度自主研发并开源的深度学习框架 2.框架本身具有易学,去掉本身 3.在兼具性能的同时->在保证性能的同时 --- doc/about/about_us.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst index f1b1a12216..f67d8b8130 100644 --- a/doc/about/about_us.rst +++ b/doc/about/about_us.rst @@ -5,16 +5,16 @@ 什么是PaddlePaddle -------------------- -- PaddlePaddle是百度提供的开源深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 +- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 - 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 -- 框架本身具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 +- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 PaddlePaddle的技术特色 ------------------------- -- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在兼具性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 +- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 - 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 From 8630ba2eb135dc417fb31de554939fe8918132d0 Mon Sep 17 00:00:00 2001 From: Qingsheng Li Date: Wed, 27 Jun 2018 13:55:30 +0800 Subject: [PATCH 48/68] Fix sequence expand op (#11618) * Set zero outside functor --- paddle/fluid/operators/sequence_expand_op.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index d62c387c3e..39301e1ac0 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* dx) { - math::SetConstant set_zero; - set_zero(context, dx, static_cast(0)); - int dout_offset = 0; for (size_t i = 1; i < ref_lod.size(); ++i) { int repeat_num = ref_lod[i] - ref_lod[i - 1]; @@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->mutable_data(context.GetPlace()); g_x->set_lod(x->lod()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, g_x, static_cast(0)); + auto& y_lod = y->lod(); if (ref_level == -1) ref_level = y_lod.size() - 1; // just copy the gradient From 6b95a8a89cc79da73a9d3c461083025e298638a7 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 26 Jun 2018 22:57:25 -0700 Subject: [PATCH 49/68] fix error --- .../machine_translation/test_machine_translation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index 31d756503a..12c4134dc9 100644 
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -218,10 +218,10 @@ def decode_main(use_cuda, is_sparse): init_recursive_seq_lens = [1] * batch_size init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, - init_init_recursive_seq_lens, place) + init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, + place) init_scores = fluid.create_lod_tensor(init_scores_data, - init_init_recursive_seq_lens, place) + init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( From c45a4b8567ee6b7b30bbe4a5733775970e831a88 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 27 Jun 2018 14:15:35 +0800 Subject: [PATCH 50/68] use sigkill to stop pserver --- python/paddle/fluid/tests/unittests/test_dist_mnist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index e4d39c8b3f..450ec414d1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -145,7 +145,7 @@ class TestDistMnist(unittest.TestCase): retry_times -= 1 def stop_pserver(self, pid): - os.kill(pid, signal.SIGTERM) + os.kill(pid, signal.SIGKILL) def test_with_place(self): p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( From b7c179a87fc370c4a4b176621b5176c0aff5a7d1 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 26 Jun 2018 23:26:10 -0700 Subject: [PATCH 51/68] fix lodtensor.py --- python/paddle/fluid/lod_tensor.py | 57 ++++++++------- python/paddle/fluid/tests/test_lod_tensor.py | 73 +++++++++++--------- 2 files changed, 73 insertions(+), 57 deletions(-) diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index c417ab393f..b2b3186c1e 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -18,15 +18,16 @@ import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] -def create_lod_tensor(data, lod, place): +def create_lod_tensor(data, recursive_seq_lens, place): """ Create a lod tensor from a numpy array, a list, or an existing lod tensor. Create a lod tensor by doing the following: - 1. Check that the length-based input lod is valid. + 1. Check that the length-based level of detail (LoD) also known as + recursive_sequence_lengths of the input is valid. - 2. Convert the length-based lod to a offset-based LoD. + 2. Convert recursive_sequence_lengths to a offset-based LoD. 3. Copy the data from a numpy array, a list or a existing lod tensor to CPU or GPU device (based on input place). @@ -37,45 +38,47 @@ def create_lod_tensor(data, lod, place): Suppose we want LoDTensor to hold data for sequences of word, where each word is represented by an integer. If we want to create a LoDTensor to - represent two sentences, one of 2 words, and one of 3 words. + represent two sentences, one of 2 words, and one of 3 words. Then :code:`data` can be a numpy array of integers with shape (5, 1). - :code:`lod` will be [[2, 3]], indicating the length(# of words) in each - sentence. This length-based input lod [[2, 3]] will be converted to - offset-based lod [[0, 2, 5]] inside the function call. + :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each + sentence. 
This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to + offset-based LoD [[0, 2, 5]] inside the function call. Please reference :ref:`api_guide_low_level_lod_tensor` for more details regarding LoD. Args: data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a - list holding the data to be copied. - lod(list): a list of lists indicating the length-based LoD info - specified by the user. + list holding the data to be copied. + recursive_seq_lens(list): a list of lists indicating the length-based level of detail + info specified by the user. place(Place): CPU or GPU place indicating where the data in the new LoDTensor will be stored. Returns: - A fluid LoDTensor object with tensor data and lod info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. """ if isinstance(data, core.LoDTensor): - return create_lod_tensor(np.array(data), lod, place) + return create_lod_tensor(np.array(data), recursive_seq_lens, place) elif isinstance(data, list): # When input data is a list, it only deal with the case where the base element # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number # of words or other indexes in the sequence. - new_lod = [] + new_recursive_seq_lens = [] for seq in data: - new_lod.append(len(seq)) - assert [new_lod] == lod, "data and lod do not match" + new_recursive_seq_lens.append(len(seq)) + assert [ + new_recursive_seq_lens + ] == recursive_seq_lens, "data and recursive_seq_lens do not match" flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - return create_lod_tensor(flattened_data, lod, place) + return create_lod_tensor(flattened_data, recursive_seq_lens, place) elif isinstance(data, np.ndarray): tensor = core.LoDTensor() tensor.set(data, place) - tensor.set_recursive_sequence_lengths(lod) + tensor.set_recursive_sequence_lengths(recursive_seq_lens) assert tensor.has_valid_recursive_sequence_lengths( ), "the provided lod info is invalid" return tensor @@ -84,7 +87,8 @@ def create_lod_tensor(data, lod, place): "data should be either a LoDTensor, a Numpy array or a list") -def create_random_int_lodtensor(lod, base_shape, place, low, high): +def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, + high): """ Create a LoDTensor containing random integers. @@ -95,7 +99,7 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): The function does the following: 1. Calculate the overall shape of the LoDTensor based on the length-based - :code:`lod` input and the shape of the basic element in + :code:`recursive_seq_lens` input and the shape of the basic element in :code:`base_shape`. 2. Create a numpy array of this shape. @@ -105,12 +109,13 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): Suppose we want LoDTensor to hold data for sequences of word, where each word is represented by an integer. If we want to create a LoDTensor to represent two sentences, one of 2 words, and one of 3 words. Then - 'base_shape' is [1], input length-based 'lod' is [[2, 3]]. Then the overall - shape of the LoDTensor would be [5, 1], holding 5 words for two sentences. + 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. + Then the overall shape of the LoDTensor would be [5, 1], holding 5 words + for two sentences. 
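(Editor's illustration of the two-sentence example described above: a minimal sketch of calling `create_lod_tensor` with a length-based `recursive_seq_lens`, where the concrete numbers simply mirror the docstring.)

```Python
import numpy as np
import paddle.fluid as fluid

# Five words in total: the first sentence has 2 words, the second has 3.
data = np.arange(5).reshape(5, 1).astype("int64")
tensor = fluid.create_lod_tensor(data, [[2, 3]], fluid.CPUPlace())

print(tensor.recursive_sequence_lengths())  # [[2, 3]]
# Inside the function, the length-based [[2, 3]] is converted to the
# offset-based LoD [[0, 2, 5]].
```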
Args: - lod(list): a list of lists indicating the length-based LoD info - specified by the user. + recursive_seq_lens(list): a list of lists indicating the length-based + level of detail info specified by the user. base_shape(list): the shape of the basic element to be held by the LoDTensor. place(Place): CPU or GPU place indicating where the data in the new @@ -119,11 +124,11 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high): high(int): the upper bound of the random integers. Returns: - A fluid LoDTensor object with tensor data and lod info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. """ assert isinstance(base_shape, list), "base_shape should be a list" # append the total number of basic elements to the front of its shape - overall_shape = [sum(lod[-1])] + base_shape + overall_shape = [sum(recursive_seq_lens[-1])] + base_shape # the range of integer data elements is [low, high] data = np.random.random_integers(low, high, overall_shape).astype("int64") - return create_lod_tensor(data, lod, place) + return create_lod_tensor(data, recursive_seq_lens, place) diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index b7e7f5801f..f7a9dd4129 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -19,18 +19,21 @@ import unittest class TestLoDTensor(unittest.TestCase): - def test_pybind_lod(self): + def test_pybind_recursive_seq_lens(self): tensor = fluid.LoDTensor() - lod = [] - tensor.set_recursive_sequence_lengths(lod) - lod = [[], [1], [3]] - self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) - lod = [[0], [2], [3]] - self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod) + recursive_seq_lens = [] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + recursive_seq_lens = [[], [1], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, + recursive_seq_lens) + recursive_seq_lens = [[0], [2], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, + recursive_seq_lens) - lod = [[1, 2, 3]] - tensor.set_recursive_sequence_lengths(lod) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + recursive_seq_lens = [[1, 2, 3]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) tensor.set(np.random.random([6, 1]), fluid.CPUPlace()) self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) @@ -38,13 +41,14 @@ class TestLoDTensor(unittest.TestCase): # Each level's sum should be equal to the number of items in the next level # Moreover, last level's sum should be equal to the tensor height - lod = [[2, 3], [1, 3, 1, 2, 2]] - tensor.set_recursive_sequence_lengths(lod) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) tensor.set(np.random.random([8, 1]), fluid.CPUPlace()) self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) - lod = [[2, 3], [1, 3, 1, 2, 1]] - tensor.set_recursive_sequence_lengths(lod) + recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) tensor.set(np.random.random([9, 1]), 
fluid.CPUPlace()) self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) @@ -52,35 +56,42 @@ class TestLoDTensor(unittest.TestCase): def test_create_lod_tensor(self): # Create LoDTensor from a list data = [[1, 2, 3], [3, 4]] - wrong_lod = [[2, 2]] - correct_lod = [[3, 2]] - self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod, - fluid.CPUPlace()) - tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace()) - self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod) + wrong_recursive_seq_lens = [[2, 2]] + correct_recursive_seq_lens = [[3, 2]] + self.assertRaises(AssertionError, create_lod_tensor, data, + wrong_recursive_seq_lens, fluid.CPUPlace()) + tensor = create_lod_tensor(data, correct_recursive_seq_lens, + fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + correct_recursive_seq_lens) # Create LoDTensor from numpy array data = np.random.random([10, 1]) - lod = [[2, 1], [3, 3, 4]] - tensor = create_lod_tensor(data, lod, fluid.CPUPlace()) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + recursive_seq_lens = [[2, 1], [3, 3, 4]] + tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) # Create LoDTensor from another LoDTensor, they are differnt instances - new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]] - new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace()) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) - self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod) + new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]] + new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens, + fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) + self.assertEqual(new_tensor.recursive_sequence_lengths(), + new_recursive_seq_lens) def test_create_random_int_lodtensor(self): # The shape of a word, commonly used in speech and NLP problem, is [1] shape = [1] - lod = [[2, 3, 5]] + recursive_seq_lens = [[2, 3, 5]] dict_size = 10000 low = 0 high = dict_size - 1 - tensor = create_random_int_lodtensor(lod, shape, + tensor = create_random_int_lodtensor(recursive_seq_lens, shape, fluid.CPUPlace(), low, high) - self.assertEqual(tensor.recursive_sequence_lengths(), lod) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) self.assertEqual(tensor.shape(), [10, 1]) From b756063ce7e71528d57c67caa94871bd924729d9 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Wed, 27 Jun 2018 15:03:29 +0800 Subject: [PATCH 52/68] Speed depthwise transposed conv2d. (#11740) * Speed depthwise transposed conv2d. 
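(Editor's note on the change below: the Python layer now routes `conv2d_transpose` to the new `depthwise_conv2d_transpose` kernel when every input channel forms its own group and cuDNN is disabled. A hedged sketch of a call that would take that path, with purely illustrative shapes:)

```Python
import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[8, 16, 16], dtype='float32')
# input channels == groups == num_filters and use_cudnn is False, so this
# call should be dispatched to the depthwise_conv2d_transpose operator.
out = fluid.layers.conv2d_transpose(
    input=data, num_filters=8, filter_size=4, stride=2, padding=1,
    groups=8, use_cudnn=False)
```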
--- paddle/fluid/operators/conv_transpose_op.cc | 18 +++++ .../fluid/operators/conv_transpose_op.cu.cc | 45 ++++++------ paddle/fluid/operators/conv_transpose_op.h | 70 +++++++++++++++++++ python/paddle/fluid/layers/nn.py | 13 +++- .../unittests/test_conv2d_transpose_op.py | 13 ++++ 5 files changed, 135 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 2e9e957ebd..eeb98ee44f 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( namespace ops = paddle::operators; +// conv2d_transpose REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel); +// conv3d_transpose REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel, ops::GemmConvTransposeGradKernel); + +// depthwise conv2d_transpose +REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, + ops::Conv2DTransposeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc index 640fa7d14a..a6d5665df8 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -15,25 +15,28 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_transpose_op.h" namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL( - conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); +// conv2d +REGISTER_OP_CUDA_KERNEL(conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// conv3d +REGISTER_OP_CUDA_KERNEL(conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// depthwise conv2d +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, + ops::DepthwiseConvTransposeKernel, + ops::DepthwiseConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, + ops::DepthwiseConvTransposeGradKernel, + ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 1dcfc651fd..0d9c6a62fe 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" @@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } } }; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ(groups, filter.dims()[0]); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1); + } + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, + output); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if 
(!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, + input_grad); + } + + if (filter_grad) { + math::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, + filter_grad); + } + } +}; } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f5700ed562..02ea2af325 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2334,10 +2334,17 @@ def conv2d_transpose(input, data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ - helper = LayerHelper("conv2d_transpose", **locals()) + + input_channel = input.shape[1] + + op_type = 'conv2d_transpose' + if (input_channel == groups and num_filters == input_channel and + not use_cudnn): + op_type = 'depthwise_conv2d_transpose' + + helper = LayerHelper(op_type, **locals()) if not isinstance(input, Variable): raise TypeError("Input of conv2d_transpose must be Variable") - input_channel = input.shape[1] padding = utils.convert_to_list(padding, 2, 'padding') stride = utils.convert_to_list(stride, 2, 'stride') @@ -2371,7 +2378,7 @@ def conv2d_transpose(input, pre_bias = helper.create_tmp_variable(dtype=input.dtype) helper.append_op( - type='conv2d_transpose', + type=op_type, inputs={'Input': [input], 'Filter': [img_filter]}, outputs={'Output': pre_bias}, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index ded2f13028..07545e7feb 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -242,6 +242,19 @@ class TestCUDNNWithGroups(TestWithGroups): self.op_type = "conv2d_transpose" +class TestDepthwiseConvTranspose(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 8, 16, 16] # NCHW + self.groups = 8 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [self.input_size[1], f_c, 4, 4] + self.op_type = "depthwise_conv2d_transpose" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. # class TestCUDNNWithDilation(TestWithDilation): From 008e0c9bc1efe552cfbb349765364beca2c4c5a9 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 26 Jun 2018 16:01:41 +0800 Subject: [PATCH 53/68] small clean --- python/paddle/fluid/transpiler/distribute_transpiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 4a3bd3bef2..343901cda3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -301,6 +301,7 @@ class DistributeTranspiler(object): Program: trainer side program. 
""" # remove optimize ops and add a send op to main_program + # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? delete_ops(self.origin_program.global_block(), self.optimize_ops) self.origin_program.__str__() return self.origin_program @@ -537,7 +538,6 @@ class DistributeTranspiler(object): # 2. rename op outputs for op in orig_s_prog.global_block().ops: - new_inputs = dict() new_outputs = dict() # do not append startup op if var is not on this pserver op_on_pserver = False From f2459aafd263b0504fc5c2fa0c7e5d7a58a717a1 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Wed, 27 Jun 2018 15:48:03 +0800 Subject: [PATCH 54/68] inference API init cn (#11731) --- paddle/contrib/inference/high_level_api_cn.md | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 paddle/contrib/inference/high_level_api_cn.md diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/contrib/inference/high_level_api_cn.md new file mode 100644 index 0000000000..a57f015a4e --- /dev/null +++ b/paddle/contrib/inference/high_level_api_cn.md @@ -0,0 +1,87 @@ +# Paddle 预测 API + +为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。 + +预测库包含: + +- 头文件 `paddle_inference_api.h` 定义了所有的接口 +- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a` +- 库文件 `libpaddle_inference_api.so` 或 `libpaddle_inference_api.a` + +下面是详细的一些 API 概念介绍 + +## PaddleTensor + +PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是 + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +- `name` 用于指定输入数据对应的 模型中variable 的名字 (暂时没有用,但会在后续支持任意 target 时启用) +- `shape` 表示一个 Tensor 的 shape +- `data` 数据以连续内存的方式存储在`PaddleBuf` 中,`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存,详细可以参考头文件中相关定义。 +- `dtype` 表示 Tensor 的数据类型 + +## engine + +高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine + +- 原生 engine,由 paddle 原生的 forward operator 组成,可以天然支持所有paddle 训练出的模型, +- Anakin engine,封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle 模型, +- TensorRT mixed engine,用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ,支持所有paddle 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) + +其实现为 + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops. +}; +``` + +## 预测部署过程 + +总体上分为以下步骤 + +1. 用合适的配置创建 `PaddlePredictor` +2. 创建输入用的 `PaddleTensor`,传入到 `PaddlePredictor` 中 +3. 获取输出的 `PaddleTensor` ,将结果取出 + +下面完整演示一个简单的模型,部分细节代码隐去 + +```c++ +#include "paddle_inference_api.h" + +// 创建一个 config,并修改相关设置 +paddle::NativeConfig config; +config.model_dir = "xxx"; +config.use_gpu = false; +// 创建一个原生的 PaddlePredictor +auto predictor = + paddle::CreatePaddlePredictor(config); +// 创建输入 tensor +int64_t data[4] = {1, 2, 3, 4}; +paddle::PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; +// 创建输出 tensor,输出 tensor 的内存可以复用 +std::vector outputs; +// 执行预测 +CHECK(predictor->Run(slots, &outputs)); +// 获取 outputs ... 
+``` + +编译时,联编 `libpaddle_fluid.a/.so` 和 `libpaddle_inference_api.a/.so` 便可。 + +## 详细代码参考 + +- [inference demos](./demo) +- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc) From c228977727cfbcd53dbccb4f93153c6102cd7815 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Wed, 27 Jun 2018 15:56:12 +0800 Subject: [PATCH 55/68] add anakin release (#11747) --- cmake/external/anakin.cmake | 16 +++++++------ cmake/inference_lib.cmake | 30 +++++++++++++++++-------- paddle/contrib/inference/CMakeLists.txt | 5 ++++- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index f1cd9c99eb..d205e39582 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir) endforeach() endfunction() -# download library -message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") -execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") -execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") -execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") -execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") -execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") +if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") + # download library + message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") + execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") + execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") + execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") +endif() if (WITH_ANAKIN) message(STATUS "Anakin for inference is enabled") diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cd44fe2542..850098297e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -149,21 +149,33 @@ copy(memory_lib DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ) -set(module "inference") -copy(inference_lib DEPS paddle_fluid_shared paddle_fluid - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - DSTS ${dst_dir}/${module} ${dst_dir}/${module} -) +set(inference_deps paddle_fluid_shared paddle_fluid) if(WITH_CONTRIB) - set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") - copy(contrib_inference_lib DEPS paddle_inference_api + message(STATUS "installing contrib") + set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") + if (WITH_ANAKIN) + copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api + SRCS + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api + ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release + DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin) + list(APPEND inference_deps contrib_anakin_inference_lib) + endif() + + copy(contrib_inference_lib DEPS paddle_inference_api SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* - DSTS ${contrib_dst_dir} ${contrib_dst_dir} - ) + DSTS ${contrib_dst_dir} ${contrib_dst_dir}) + list(APPEND inference_deps contrib_inference_lib) endif() +set(module 
"inference") +copy(inference_lib DEPS ${inference_deps} + SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + DSTS ${dst_dir}/${module} ${dst_dir}/${module} +) + set(module "platform") copy(platform_lib DEPS profiler_py_proto SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 0f56d648b1..45bbb4b237 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -54,9 +54,12 @@ if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to # compile the libinference_anakin_api.a and compile with anakin.so. - nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_link_libraries(inference_anakin_api anakin anakin_saber_common) + target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api) From df7a266ae238afbbde27a38b279d5145f188f12b Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 27 Jun 2018 15:56:45 +0800 Subject: [PATCH 56/68] fix adam op for selected rows --- paddle/fluid/operators/adam_op.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 6ee73c3000..5d670fe3b9 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel { "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + } PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Moment1"), "Param and Moment1 input of AdamOp should have same dimension"); From 7d9c9a013be761f7d9827823fda106670fe1e899 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 27 Jun 2018 16:33:28 +0800 Subject: [PATCH 57/68] update by comment --- python/paddle/fluid/tests/unittests/test_dist_mnist.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 450ec414d1..ad2d57f7c5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -15,6 +15,7 @@ import numpy as np import argparse import time +import math import 
paddle import paddle.fluid as fluid @@ -145,7 +146,7 @@ class TestDistMnist(unittest.TestCase): retry_times -= 1 def stop_pserver(self, pid): - os.kill(pid, signal.SIGKILL) + os.kill(pid, signal.SIGTERM) def test_with_place(self): p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( @@ -194,7 +195,7 @@ class TestDistMnist(unittest.TestCase): acc_val = np.array(acc_set).mean() avg_loss_val = np.array(avg_loss_set).mean() if float(acc_val - ) > 0.2: # Smaller value to increase CI speed + ) > 0.8: # Smaller value to increase CI speed return else: print( From 778b71fc93cf1cc541cabfddbd1b229898229506 Mon Sep 17 00:00:00 2001 From: baiyf Date: Wed, 27 Jun 2018 16:51:42 +0800 Subject: [PATCH 58/68] Optimize bipartite_match_op in large scale input (#11730) * optimize bipartite_match_op in large scale input --- .../operators/detection/bipartite_match_op.cc | 98 +++++++++++++------ .../unittests/test_bipartite_match_op.py | 17 ++++ 2 files changed, 84 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index d437ad5c19..c23b65fe4d 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { } }; +template +bool DistPairDescend(std::tuple pair1, + std::tuple pair2) { + return std::get<2>(pair1) > std::get<2>(pair2); +} + template class BipartiteMatchKernel : public framework::OpKernel { public: @@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel { // The match_dist must be initialized to 0 at first. void BipartiteMatch(const Tensor& dist, int* match_indices, T* match_dist) const { - constexpr T kEPS = static_cast(1e-6); PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); int64_t row = dist.dims()[0]; int64_t col = dist.dims()[1]; auto* dist_data = dist.data(); - std::vector row_pool; - for (int i = 0; i < row; ++i) { - row_pool.push_back(i); - } - while (row_pool.size() > 0) { - int max_idx = -1; - int max_row_idx = -1; - T max_dist = -1; - for (int64_t j = 0; j < col; ++j) { - if (match_indices[j] != -1) { - continue; + // Test result: When row==130 the speed of these two methods almost the same + if (row >= 130) { + std::vector> match_pair; + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j])); } - for (size_t k = 0; k < row_pool.size(); ++k) { - int m = row_pool[k]; - // distance is 0 between m-th row and j-th column - if (dist_data[m * col + j] < kEPS) { + } + std::sort(match_pair.begin(), match_pair.end(), DistPairDescend); + std::vector row_indices(row, -1); + + int64_t idx = 0; + for (int64_t k = 0; k < row * col; ++k) { + int64_t i = std::get<0>(match_pair[k]); + int64_t j = std::get<1>(match_pair[k]); + T dist = std::get<2>(match_pair[k]); + + if (idx >= row) { + break; + } + if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) { + match_indices[j] = i; + row_indices[i] = j; + match_dist[j] = dist; + idx += 1; + } + } + } else { + constexpr T kEPS = static_cast(1e-6); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { continue; } - if (dist_data[m * col + j] > max_dist) { - max_idx = j; - 
max_row_idx = m; - max_dist = dist_data[m * col + j]; + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } } } - } - if (max_idx == -1) { - // Cannot find good match. - break; - } else { - PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); - match_indices[max_idx] = max_row_idx; - match_dist[max_idx] = max_dist; - // Erase the row index. - row_pool.erase( - std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } } } } diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index 1a245fd756..d5bd726c4a 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -114,6 +114,23 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.check_output() +class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[300]] + dist = np.random.random((300, 17)).astype('float32') + match_indices, match_dist = batch_bipartite_match(dist, lod[0]) + + self.inputs = {'DistMat': dist} + self.outputs = { + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_dist, + } + + def test_check_output(self): + self.check_output() + + class TestBipartiteMatchOpWithPerPredictionType(OpTest): def setUp(self): self.op_type = 'bipartite_match' From 9a15c92317e6ac938de0279c7506a28e3c100116 Mon Sep 17 00:00:00 2001 From: pzelazko-intel Date: Wed, 27 Jun 2018 12:06:52 +0200 Subject: [PATCH 59/68] bnorm+relu fuse for mkldnn (inference) (#11434) * bnorm+relu fuse for mkldnn * separate fuse_relu function * bug fix * proper while range in inference_transpiler * description fix * review fix * review fix * unit test for fwd batch norm+relu MKLDNN fuse --- benchmark/fluid/args.py | 4 + benchmark/fluid/fluid_benchmark.py | 5 + .../fluid/operators/batch_norm_mkldnn_op.cc | 2 + paddle/fluid/operators/batch_norm_op.cc | 3 + python/paddle/fluid/layers/nn.py | 7 +- .../unittests/test_batch_norm_mkldnn_op.py | 12 +++ .../tests/unittests/test_batch_norm_op.py | 11 ++- .../fluid/transpiler/inference_transpiler.py | 99 ++++++++++++++----- 8 files changed, 115 insertions(+), 28 deletions(-) mode change 100644 => 100755 benchmark/fluid/fluid_benchmark.py diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 68a3d42d7a..99c9d79b06 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -122,5 +122,9 @@ def parse_args(): type=str, default="", help='Directory that contains all the training recordio files.') + parser.add_argument( + '--use_inference_transpiler', + action='store_true', + help='If set, uses inference transpiler to optimize the program.') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py old mode 100644 new mode 100755 index ece1102dce..dcd4d9ea95 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -131,6 +131,11 @@ def 
train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, exe = fluid.Executor(place) exe.run(startup_prog) + # Use inference_transpiler to speedup + if args.use_inference_transpiler: + t = fluid.InferenceTranspiler() + t.transpile(infer_prog, place) + if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index cc158e57f7..6ecb43c49c 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -66,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool fuse_with_relu = ctx.Attr("fuse_with_relu"); const auto *x = ctx.Input("X"); const auto *mean = ctx.Input("Mean"); @@ -111,6 +112,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor auto src_memory = diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 52b0bf85c0..693bf973c2 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -155,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_with_relu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Batch Normalization. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 02ea2af325..64f48e259a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1993,7 +1993,8 @@ def batch_norm(input, name=None, moving_mean_name=None, moving_variance_name=None, - do_model_average_for_mean_and_var=False): + do_model_average_for_mean_and_var=False, + fuse_with_relu=False): """ **Batch Normalization Layer** @@ -2036,6 +2037,7 @@ def batch_norm(input, moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + fuse_with_relu (bool): if True, this OP performs relu after batch norm. Returns: Variable: A tensor variable which is the result after applying batch normalization on the input. 
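(Editor's illustration of the `fuse_with_relu` flag documented above: a minimal, hedged sketch. The fusion only takes effect in the MKLDNN inference kernel, and the surrounding network is a placeholder.)

```Python
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
conv = fluid.layers.conv2d(input=image, num_filters=20, filter_size=5)
# With the MKLDNN kernel at inference time, fuse_with_relu lets batch_norm
# apply the ReLU inside the same primitive instead of a separate relu op.
out = fluid.layers.batch_norm(
    input=conv, is_test=True, use_mkldnn=True, fuse_with_relu=True)
```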
@@ -2121,7 +2123,8 @@ def batch_norm(input, "momentum": momentum, "epsilon": epsilon, "is_test": is_test, - "use_mkldnn": use_mkldnn + "use_mkldnn": use_mkldnn, + "fuse_with_relu": fuse_with_relu }) return helper.append_activation(batch_norm_out) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py index f6097d4b84..18fa546159 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py @@ -52,5 +52,17 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference): self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) +class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference): + def init_kernel_type(self): + self.use_mkldnn = True + self.fuse_with_relu = True + + def test_check_output(self): + place = core.CPUPlace() + data_format = "NCHW" + + self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 01e5749bdb..a62ee9596d 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -159,6 +159,7 @@ class TestBatchNormOpInference(unittest.TestCase): def setUp(self): self.dtype = np.float32 self.use_mkldnn = False + self.fuse_with_relu = False self.init_kernel_type() def __assert_close(self, tensor, np_array, msg, atol=1e-4): @@ -180,6 +181,8 @@ class TestBatchNormOpInference(unittest.TestCase): scale_shape = [c] x_val = np.random.random_sample(x_shape).astype(dtype) + # generate some negative values to test case with relu fused + x_val = x_val - 0.5 scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32) @@ -188,6 +191,8 @@ class TestBatchNormOpInference(unittest.TestCase): y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, epsilon, data_layout).astype(dtype) + if self.fuse_with_relu: + y_out = np.maximum(y_out, 0) scope = core.Scope() @@ -233,6 +238,7 @@ class TestBatchNormOpInference(unittest.TestCase): is_test=True, data_layout=data_layout, use_mkldnn=self.use_mkldnn, + fuse_with_relu=self.fuse_with_relu, epsilon=epsilon) batch_norm_op.run(scope, place) @@ -265,6 +271,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): def setUp(self): self.dtype = np.float16 self.use_mkldnn = False + self.fuse_with_relu = False self.init_kernel_type() def test_check_output(self): @@ -284,6 +291,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.use_mkldnn = False + self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] self.init_kernel_type() @@ -367,7 +375,8 @@ class TestBatchNormOpTraining(unittest.TestCase): "epsilon": epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": self.use_mkldnn + "use_mkldnn": self.use_mkldnn, + "fuse_with_relu": self.fuse_with_relu }) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 0629f2916b..d32c69d148 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py 
@@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import numpy as np from .. import core from ..framework import Program @@ -20,12 +21,15 @@ from ..executor import global_scope class InferenceTranspiler: ''' - Convert the fluid program to optimized inference program. - - There are several optimizations, only fuse batch normalization is supported now. + Convert the fluid program to optimized inference program. + + There are several optimizations: + + - fuse convolution and batch normalization + - fuse batch normalization and relu (MKLDNN only) Examples: - + .. code-block:: python # As InferenceTranspiler will modify the original program, @@ -54,19 +58,64 @@ class InferenceTranspiler: if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") self.fuse_batch_norm(program, place, scope) + self.fuse_relu_mkldnn(program) + + def fuse_relu_mkldnn(self, program): + ''' + Transpile the program by fused relu activation for MKLDNN program. + + Relu activation following batch norm OP can be fused by adding + :math:`fuse_with_relu` attribute to batch norm OP. + + The result of fuse is: + + - before: + + - batch_norm->relu->any_other_op + + - after: + + - batch_norm->any_other_op + + :param program: program to transpile + :type program: Program + ''' + use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + if not use_mkldnn: + return + + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops) - 1: + current_op = self.block.ops[i] + if current_op.type in ['batch_norm']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify bnorm OP to include relu + current_op.set_attr("fuse_with_relu", True) + # remove relu OP + self.block.remove_op(i + 1) + i = i + 1 + + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() def fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. - - The batch normalization followed the convolution or fully connected layer - can be integrated with them. Doing so will give us a forward acceleration, + + The batch normalization followed the convolution or fully connected layer + can be integrated with them. Doing so will give us a forward acceleration, especially in environments like mobile or embedded. - + For input :math:`X`: - - Conv process: :math:`X = input * W + bias` - - Batch norm process: :math:`X' = (X - mean) / std` + - Conv process: :math:`X = input * W + bias` + - Batch norm process: :math:`X' = (X - mean) / std` - Scale Process: :math:`Y = a * X' + b` After fuse into one operation: @@ -76,17 +125,17 @@ class InferenceTranspiler: Y &= (input * W + bias - mean) / std * a + b \\\\ &= input * a * W / std + ((bias - mean) / std * a + b) - The operator transformation is: + The operator transformation is: - before: - conv->batch_norm->any_other_op (bias == 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0) - - - after: + + - after: - conv->elementwise_add->any_other_op - + The transpile stages are: 1. insert elementwise_add op when bias == 0. 
@@ -99,20 +148,20 @@ class InferenceTranspiler: program (Program): program to transpile place (Place): inference place scope (Scope): inference Scope - + ''' self.scope = scope self.place = place self.block = program.block(0) - self.input_map = {} # store the input names should be adjusted + self.input_map = {} # store the input names should be adjusted i = 0 - while i < len(self.block.ops): + while i < len(self.block.ops) - 2: current_op = self.block.ops[i] # TODO(luotao1): consider only conv2d now. fc would be delt later. if current_op.type in ['conv2d']: - # TODO(luotao1): consider single chain network now. - # For branch network, we counldn't use block.ops[i + 1] as + # TODO(luotao1): consider single chain network now. + # For branch network, we counldn't use block.ops[i + 1] as # the judgment condition. next_op = self.block.ops[i + 1] # conv2d without bias @@ -137,17 +186,17 @@ class InferenceTranspiler: self._adjust_input() self._remove_unused_var() - # TODO(luotao): use clone() method to flush the program.desc in force, - # since some large program.desc will not be flushed immediately. + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() # ====================== private transpiler functions ===================== def _insert_bias_op(self, index, current_op, bn_op): ''' - Construct elementwise_add operator for adding bias + Construct elementwise_add operator for adding bias and insert it into program. - + :param index: insert location of bias_op :type index: Int :param current_op: current operator (conv or fc) @@ -175,14 +224,14 @@ class InferenceTranspiler: def _fuse_param(self, current_op, bn_op, bias_op, with_bias): ''' fuse the batch_norm_op' parameters to current_op (conv or fc) - + :param current_op: current operator (conv or fc) :type current_op: Operator :param bn_op: batch norm operator :type bn_op: Operator :param bias_op: elementwise_add operator for adding bias :type bias_op: Operator - :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. + :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. :type with_bias: Int ''' From 20fae681366de7c799dfbc92a7be53b027d532c2 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 27 Jun 2018 19:39:15 +0800 Subject: [PATCH 60/68] adam op handle grad.rows().size == 0 condition --- paddle/fluid/operators/adam_op.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index f82ff47b52..a7a28b02b6 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel { } else if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } // merge duplicated rows if any. 
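// (Editor's note, an addition for clarity rather than part of the patch:
// with a SelectedRows gradient the same row index may appear several times,
// so the rows are merged into unique rows before the Adam update is applied;
// the empty-rows early return added above simply skips the update when this
// worker produced no gradient for the parameter at all.)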
scatter::MergeAdd merge_func; auto grad_merge = From 64e44e929c06722a87dfe6989b46f66130cfe27a Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 27 Jun 2018 19:45:33 +0800 Subject: [PATCH 61/68] add option to compile inference demo --- paddle/contrib/inference/demo/CMakeLists.txt | 5 +++++ paddle/scripts/paddle_build.sh | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt index 566c7d1a07..ecece6fe34 100644 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ b/paddle/contrib/inference/demo/CMakeLists.txt @@ -15,6 +15,11 @@ inference_api_test(simple_on_word2vec ARGS test_word2vec) +option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF) +if(NOT WITH_INFERENCE_DEMO) + return() +endif() + set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 037688bde9..b16c834931 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -106,6 +106,7 @@ function cmake_gen() { -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -133,7 +134,8 @@ EOF -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ - -DWITH_ANAKIN=${WITH_ANAKIN:-ON} + -DWITH_ANAKIN=${WITH_ANAKIN:-ON} \ + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} } function abort(){ From bc28cf613f9e41908d6da9a889cbb3242e0589d2 Mon Sep 17 00:00:00 2001 From: Haichao Zhang Date: Wed, 27 Jun 2018 16:56:41 -0700 Subject: [PATCH 62/68] Extend fill_zeros_like_op for zero-filling an LoDTensorArray (#11496) * Add fill_zeros_array op. This op is used for zero-filling an LoDTensorArray. 
* merge fill_zeros_array_op with fill_zeros_like_op * add unit_test for fill_zeros_like for array --- paddle/fluid/framework/operator.cc | 4 + paddle/fluid/operators/fill_zeros_like_op.cc | 10 ++- paddle/fluid/operators/fill_zeros_like_op.h | 30 +++++-- python/paddle/fluid/layers/nn.py | 38 ++++++++ .../test_fill_zeros_like_op_for_array.py | 88 +++++++++++++++++++ 5 files changed, 161 insertions(+), 9 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 122ee1dab3..c1329b06d7 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -713,6 +713,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); + } else if (var->IsType()) { + const LoDTensorArray& arr = var->Get(); + PADDLE_ENFORCE(arr.size() > 0); + t = &(arr[0]); } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index d67bec36b3..a9d47c0172 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -26,8 +26,12 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { "Input(X) of FillZerosLikeOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillZerosLikeOp should not be null."); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); + + if (ctx->IsRuntime() && + ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR_ARRAY) { + return; // skip runtime infershape when is tensor array; + } } }; @@ -39,7 +43,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( FillZerosLike Operator. -Fill up a variable with zeros. +Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray. The output will have the same size as the input. )DOC"); diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h index 4bbe0df6b6..daa6521b32 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.h +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -23,12 +24,29 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - math::SetConstant setter; - setter(context.template device_context(), out, - static_cast(0)); + auto var = context.InputVar("X"); + if (var->IsType()) { + auto& input = *context.Input("X"); + auto& output = *context.Output("Out"); + output.Resize(input.dims()); + output.set_lod(input.lod()); + output.mutable_data(context.GetPlace()); + math::SetConstant setter; + setter(context.template device_context(), &(output), + static_cast(0)); + } else if (var->IsType()) { + auto& input = *context.Input("X"); + auto& output = *context.Output("Out"); + output.resize(input.size()); + for (auto i = 0; i < input.size(); i++) { + output[i].Resize(input[i].dims()); + output[i].set_lod(input[i].lod()); + output[i].mutable_data(context.GetPlace()); + math::SetConstant setter; + setter(context.template device_context(), &(output[i]), + static_cast(0)); + } + } } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 64f48e259a..bc379da4e3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -95,6 +95,7 @@ __all__ = [ 'relu', 'log', 'crop', + 'fill_zeros_like', ] @@ -5184,3 +5185,40 @@ def crop(x, shape=None, offsets=None, name=None): outputs={'Out': out}, attrs=None if len(attrs) == 0 else attrs) return out + + +def fill_zeros_like(x): + """ + This layer takes an input and outputs a variable that has the same structure as + the input and with all the element values as zero. The variable can be a Tensor + or TensorArray. + + .. code-block:: text + + + Given + X = [[0, 1, 2, 0], + [0, 3, 4, 0], + [0, 0, 0, 0]], + output is: + Out = [[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]]. + + Args: + x (Variable): The input variable, which could be a tensor or tensor array + + Returns: + Variable: The zero-filled variable, which has the same type and shape as + the input variable. + + Examples: + + .. code-block:: python + y = fluid.layers.fill_zeros_like(x) + """ + helper = LayerHelper('fill_zeros_like', **locals()) + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]}) + return out diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py new file mode 100644 index 0000000000..23871508d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py @@ -0,0 +1,88 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
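# (Editor's note, not part of the patch: the test below builds a LoDTensor,
# splits it into a LoDTensorArray with lod_tensor_to_array, runs
# fill_zeros_like on the array, and then checks that every element keeps its
# original shape and LoD while all values become zero.)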
+ +import unittest +import paddle.fluid.core as core +import numpy +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.executor import Executor + +import paddle.fluid as fluid +import paddle.fluid.core as core + + +class TestFillZerosLikeOpForTensorArray(unittest.TestCase): + def place(self): + return core.CPUPlace() + + def test_zero_filling_lod_tensor_array(self): + tensor = core.LoDTensor() + tensor.set( + numpy.arange(20).reshape(20, 1).astype('int32'), self.place()) + tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]]) + + expect = [ + numpy.array( + [0, 0, 0, 0, 0], dtype='int32'), numpy.array( + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int32'), + numpy.array( + [0, 0, 0], dtype='int32') + ] + + lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]] + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=3) + + def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0): + place = self.place() + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[10]) + x.persistable = True + table = layers.lod_rank_table(x, level=level) + max_len = layers.max_sequence_len(table) + max_len.persistable = True + array = layers.lod_tensor_to_array(x, table) + array = layers.fill_zeros_like(array) + array.persistable = True + + result = layers.array_to_lod_tensor(array, table) + result.persistable = True + exe = Executor(place) + scope = core.Scope() + exe.run(program, feed={'x': tensor}, scope=scope) + var = scope.find_var(array.name) + array = var.get_lod_tensor_array() + if expect_array is not None and expect_lod is not None: + self.check_array_same(array, expect_array, expect_lod) + + self.assertEqual( + numpy.array(scope.find_var(max_len.name).get_tensor())[0], + expect_max_len) + + def check_array_same(self, array, expect_tensor, expect_lod): + self.assertEqual(len(expect_tensor), len(array)) + for i, exp in enumerate(zip(expect_tensor, expect_lod)): + exp_tensor, exp_lod = exp + exp_tensor = numpy.expand_dims(exp_tensor, axis=1) + self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i]))) + self.assertEqual(exp_lod, array[i].lod()) + + +if __name__ == '__main__': + unittest.main() From 5082642bdb038ef87f81549a3589724a65c29799 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Thu, 28 Jun 2018 09:13:55 +0800 Subject: [PATCH 63/68] feature/analysis to support sub-graph for TRT engine (#11538) --- paddle/contrib/inference/CMakeLists.txt | 10 +- .../contrib/inference/paddle_inference_api.h | 11 +- .../inference/paddle_inference_api_impl.cc | 6 +- .../inference/paddle_inference_api_impl.h | 2 +- ..._inference_api_tensorrt_subgraph_engine.cc | 126 ++++++++++++++++++ ..._inference_api_tensorrt_subgraph_engine.cc | 64 +++++++++ .../fluid/inference/analysis/CMakeLists.txt | 12 +- paddle/fluid/inference/analysis/analyzer.cc | 82 ++++++++++++ paddle/fluid/inference/analysis/analyzer.h | 66 +++++++++ .../inference/analysis/analyzer_tester.cc | 29 ++++ paddle/fluid/inference/analysis/argument.h | 3 + .../inference/analysis/data_flow_graph.cc | 21 ++- .../inference/analysis/data_flow_graph.h | 23 +++- .../analysis/data_flow_graph_to_fluid_pass.cc | 124 ++++++++++++++--- .../analysis/data_flow_graph_to_fluid_pass.h | 6 +- .../analysis/dfg_graphviz_draw_pass.cc | 15 ++- .../analysis/dfg_graphviz_draw_pass.h | 13 +- .../analysis/dfg_graphviz_draw_pass_tester.cc | 4 +- .../analysis/fluid_to_data_flow_graph_pass.cc | 23 +++- .../analysis/fluid_to_data_flow_graph_pass.h | 3 +- 
paddle/fluid/inference/analysis/helper.cc | 60 +++++++++ paddle/fluid/inference/analysis/helper.h | 22 ++- paddle/fluid/inference/analysis/node.cc | 11 ++ paddle/fluid/inference/analysis/node.h | 90 +++++++------ .../inference/analysis/node_attr_flags.h | 32 +++++ paddle/fluid/inference/analysis/pass.h | 3 + .../fluid/inference/analysis/pass_manager.cc | 12 ++ .../fluid/inference/analysis/pass_manager.h | 12 +- .../inference/analysis/pass_manager_tester.cc | 1 + .../inference/analysis/subgraph_splitter.cc | 32 +++-- .../tensorrt_subgraph_node_mark_pass.cc | 78 +++++++++++ .../tensorrt_subgraph_node_mark_pass.h | 53 ++++++++ ...tensorrt_subgraph_node_mark_pass_tester.cc | 50 +++++++ .../analysis/tensorrt_subgraph_pass.cc | 2 +- .../analysis/tensorrt_subgraph_pass.h | 5 + .../analysis/tensorrt_subgraph_pass_tester.cc | 51 ++++--- paddle/fluid/operators/CMakeLists.txt | 3 +- paddle/fluid/operators/tensorrt_engine_op.h | 1 + .../operators/tensorrt_engine_op_test.cc | 43 +----- 39 files changed, 1015 insertions(+), 189 deletions(-) create mode 100644 paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc create mode 100644 paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc create mode 100644 paddle/fluid/inference/analysis/analyzer.cc create mode 100644 paddle/fluid/inference/analysis/analyzer.h create mode 100644 paddle/fluid/inference/analysis/analyzer_tester.cc create mode 100644 paddle/fluid/inference/analysis/helper.cc create mode 100644 paddle/fluid/inference/analysis/node_attr_flags.h create mode 100644 paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc create mode 100644 paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h create mode 100644 paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 45bbb4b237..153216abb4 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -18,7 +18,7 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api) +set(inference_deps paddle_inference_api paddle_fluid_api paddle_inference_tensorrt_subgraph_engine) function(inference_api_test TARGET_NAME) if (WITH_TESTING) @@ -50,6 +50,14 @@ cc_test(test_paddle_inference_api inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) +if(WITH_GPU AND TENSORRT_FOUND) +cc_library(paddle_inference_tensorrt_subgraph_engine + SRCS paddle_inference_api_tensorrt_subgraph_engine.cc + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api) + +inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec) +endif() + if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 238d8c772e..b8ba2d14a5 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -73,12 +73,12 @@ struct PaddleTensor { }; enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. + kNative = 0, // Use the native Fluid facility. 
+ kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. // TODO(Superjomn) support following engines latter. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. - // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. }; /* @@ -130,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config { int max_batch_size{-1}; }; +struct TensorRTConfig : public NativeConfig { + // Determine whether a subgraph will be executed by TRT. + int min_subgraph_size{1}; +}; + // A factory to help create different predictors. // // FOR EXTENSION DEVELOPER: diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index d9129a704b..b1e5b87598 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init( LOG(ERROR) << "fail to load inference model."; return false; } + ctx_ = executor_->Prepare(*inference_program_, 0); executor_->CreateVariables( *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); @@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return false; } for (size_t i = 0; i < feed_target_names_.size(); ++i) { + VLOG(4) << "setting " << i << "-th target"; feed_targets[feed_target_names_[i]] = &feeds[i]; } // get fetch variable @@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, } // Run the inference program // if share variables, we need not create variables + VLOG(4) << "Run prepared context"; executor_->RunPreparedContext( ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), &feed_targets, &fetch_targets, false /* don't create variable eatch time */); + VLOG(4) << "Finish prepared context"; if (!GetFetch(fetchs, output_data)) { - LOG(ERROR) << "fail to get fetchs"; + LOG(ERROR) << "fail to get fetches"; return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h index 86d1db7bcc..ba266b608d 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/contrib/inference/paddle_inference_api_impl.h @@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; - private: + protected: bool SetFeed(const std::vector &input_datas, std::vector *feeds); bool GetFetch(const std::vector &fetchs, diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc new file mode 100644 index 0000000000..a11396cee9 --- /dev/null +++ b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
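Given the new engine kind and TensorRTConfig above, a minimal client-side sketch of creating a predictor follows; the template-argument order is assumed to mirror the existing CreatePaddlePredictor factory, and the model path is a placeholder:

    #include "paddle/contrib/inference/paddle_inference_api.h"

    int main() {
      paddle::TensorRTConfig config;
      config.model_dir = "/path/to/word2vec.inference.model";  // placeholder
      config.use_gpu = true;
      config.fraction_of_gpu_memory = 0.15;
      config.device = 0;
      config.min_subgraph_size = 1;  // smallest sub-graph handed to TensorRT

      // The specialization for kAutoMixedTensorRT is added by the source file
      // introduced below.
      auto predictor = paddle::CreatePaddlePredictor<
          paddle::TensorRTConfig,
          paddle::PaddleEngineKind::kAutoMixedTensorRT>(config);
      return predictor ? 0 : 1;
    }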
+ +#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/contrib/inference/paddle_inference_api_impl.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { + +using inference::analysis::Argument; +using inference::Singleton; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; + +class TensorRTSubgraphPredictor : public NativePaddlePredictor { + public: + explicit TensorRTSubgraphPredictor(const TensorRTConfig& config) + : NativePaddlePredictor(config), config_(config) {} + + bool Init(const std::shared_ptr& parent_scope) { + VLOG(3) << "Predictor::init()"; + + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model."; + return false; + } + + // Analyze inference_program + Argument argument; + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(5) << "to prepare executor"; + *inference_program_->Proto() = *argument.transformed_program_desc; + ctx_ = executor_->Prepare(*inference_program_, 0); + + VLOG(5) << "to create variables"; + executor_->CreateVariables( + *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); + + // Get the feed_target_names and fetch_target_names + feed_target_names_ = inference_program_->GetFeedTargetNames(); + fetch_target_names_ = inference_program_->GetFetchTargetNames(); + return true; + } + + private: + TensorRTConfig config_; +}; + +template <> +std::unique_ptr +CreatePaddlePredictor( + const TensorRTConfig& config) { + VLOG(3) << "create TensorRTSubgraphPredictor"; + if (config.use_gpu) { + // 1. 
GPU memeroy + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, + 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor( + new TensorRTSubgraphPredictor(config)); + if (!dynamic_cast(predictor.get()) + ->Init(nullptr)) { + return nullptr; + } + return std::move(predictor); +} + +} // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc new file mode 100644 index 0000000000..b100630dbe --- /dev/null +++ b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { + +DEFINE_string(dirname, "", "Directory of the inference model."); + +void Main(bool use_gpu) { + //# 1. Create PaddlePredictor with a config. + TensorRTConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto predictor = + CreatePaddlePredictor(config); + + for (int batch_id = 0; batch_id < 3; batch_id++) { + //# 2. Prepare input. + int64_t data[4] = {1, 2, 3, 4}; + + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + + //# 3. Run + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + //# 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. 
+ for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data())[i]; + } + } +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } + +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 2bb2c8135d..33b0e3b127 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,10 +1,12 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc data_flow_graph_to_fluid_pass.cc - tensorrt_subgraph_pass.cc dfg_graphviz_draw_pass.cc - DEPS framework_proto) + tensorrt_subgraph_pass.cc + tensorrt_subgraph_node_mark_pass.cc + analyzer.cc + helper.cc + DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) @@ -28,5 +30,7 @@ inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_ inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) -#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) +inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) +inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc new file mode 100644 index 0000000000..5d85530969 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
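The Analyzer added below is driven through an Argument that carries the program descs between passes; a condensed sketch of the intended call sequence (mirroring the sub-graph predictor above, error handling omitted):

    #include "paddle/fluid/framework/program_desc.h"
    #include "paddle/fluid/inference/analysis/analyzer.h"

    // Analyze a loaded inference program in place: the analyzer reads
    // origin_program_desc and fills in transformed_program_desc.
    void AnalyzeForInference(paddle::framework::ProgramDesc *program) {
      namespace analysis = paddle::inference::analysis;
      analysis::Argument argument;
      argument.origin_program_desc.reset(
          new paddle::framework::proto::ProgramDesc(*program->Proto()));
      analysis::Analyzer analyzer;
      analyzer.Run(&argument);
      // The rewritten main block (possibly containing tensorrt_engine ops)
      // replaces the original one.
      *program->Proto() = *argument.transformed_program_desc;
    }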
+ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, + "Enable subgraph to TensorRT engine for acceleration"); + +DEFINE_string(inference_analysis_graphviz_log_root, "./", + "Graphviz debuger for data flow graphs."); + +class DfgPassManagerImpl final : public DfgPassManager { + public: + DfgPassManagerImpl() { + // TODO(Superjomn) set the key with pass reprs. + AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { + auto trt_teller = [](const Node* node) { + if (!node->IsFunction()) return false; + return static_cast(node)->func_type() == "mul"; + }; + AddPass("tensorrt-subgraph-marker", + new TensorRTSubgraphNodeMarkPass(trt_teller)); + AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); + } + AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + } + + std::string repr() const override { return "dfg-pass-manager"; } + std::string description() const override { return "DFG pass manager."; } + + private: + void AddPass(const std::string& name, Pass* pass) { + LOG(INFO) << "Adding pass " << name; + Register(name, pass); + AddGraphvizDebugerPass(pass); + } + + // Add the graphviz debuger pass if the parent pass has one. + void AddGraphvizDebugerPass(Pass* pass) { + auto* debuger_pass = pass->CreateGraphvizDebugerPass(); + if (debuger_pass) { + LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]"; + Register(debuger_pass->repr(), debuger_pass); + } + } +}; + +Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } + +void Analyzer::Run(Argument* argument) { + for (auto& x : data_) { + PADDLE_ENFORCE(x->Initialize(argument)); + x->RunAll(); + PADDLE_ENFORCE(x->Finalize()); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h new file mode 100644 index 0000000000..f290a3777d --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains Analyzer, an class that exposed as a library that analyze + * and optimize + * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to + * control whether + * an process is applied on the program. 
+ * + * The processes are called Passes in analysis, the Passes are placed in a + * pipeline, the first + * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to + * a data flow + * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow + * graph to a + * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes + * which take a + * node or data flow graph as input. + * + * The Analyzer can be used in two methods, the first is a executable file which + * can be used to + * pre-process the inference model and can be controlled by passing difference + * command flags; + * the other way is to compose inside the inference API as a runtime pre-process + * phase in the + * inference service. + */ + +#include +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this +// flag if not available. +DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); +DECLARE_string(inference_analysis_graphviz_log_root); + +class Analyzer : public OrderedRegistry { + public: + // Register all the pass-managers. + Analyzer(); + + void Run(Argument* argument); + + DISABLE_COPY_AND_ASSIGN(Analyzer); +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc new file mode 100644 index 0000000000..d7c1a72932 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, main) { + Analyzer analyser; + analyser.Run(&argument); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index f7f4e03968..6d316f20bf 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -41,6 +41,9 @@ struct Argument { // The original program desc. std::unique_ptr origin_program_desc; + + // The processed program desc. 
+ std::unique_ptr transformed_program_desc; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index c30a7c26ce..d09bf3ed16 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -20,7 +20,7 @@ namespace paddle { namespace inference { namespace analysis { -// It is a better idea that the inputs and outputs of this graph is set manully +// It is a better idea that the inputs and outputs of this graph is set manually // before, but there must be a Pass that helps to prune the unnecessary ops that // do not contribute to the given targets, so in this pass, analysis and get the // inputs and outputs is OK. @@ -50,6 +50,25 @@ void DataFlowGraph::Build() { outputs.push_back(out); } } + + Clean(); +} + +void DataFlowGraph::Clean() { + for (auto &node : nodes.nodes()) { + std::unordered_set inlinks_set(node->inlinks.begin(), + node->inlinks.end()); + std::unordered_set outlinks_set(node->outlinks.begin(), + node->outlinks.end()); + if (inlinks_set.size() < node->inlinks.size()) { + LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; + node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); + } + if (outlinks_set.size() < node->outlinks.size()) { + LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; + node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); + } + } } std::string DataFlowGraph::DotString() const { diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 913e344d37..30c60661f3 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -47,6 +47,10 @@ struct DataFlowGraph { // Output a DOT graph file for debug. std::string DotString() const; + + private: + // Remove duplicate edges and so on. + void Clean(); }; /* @@ -133,17 +137,24 @@ struct GraphTraits { // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. -std::pair< - std::vector, - std::vector< - Node *>> static ExtractInputAndOutputOfSubGraph(std::vector - &graph) { +static std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { std::unordered_set nodes(graph.begin(), graph.end()); std::unordered_set inputs; std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; for (auto &node : graph) { for (auto *in : node->inlinks) { - if (!nodes.count(in) && in->type() == Node::Type::kValue) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { inputs.insert(in); } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index f7d4cca213..e74efd17b8 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -13,21 +13,34 @@ // limitations under the License. 
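The pass updated below keeps every VarDesc and BlockDesc of the original program and rebuilds only the main block's op list; taken on its own, that preparation step amounts to the following sketch:

    #include "paddle/fluid/framework/framework.pb.h"
    #include "paddle/fluid/framework/proto_desc.h"

    // Clone the whole program, then drop the main block's ops so the data
    // flow graph can re-emit them (possibly fused into tensorrt_engine ops).
    paddle::framework::proto::ProgramDesc PrepareTransformedProgram(
        const paddle::framework::proto::ProgramDesc &origin) {
      paddle::framework::proto::ProgramDesc transformed(origin);
      transformed.mutable_blocks(paddle::framework::kRootBlockIndex)->clear_ops();
      return transformed;
    }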
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" namespace paddle { namespace inference { namespace analysis { +using framework::proto::ProgramDesc; + +std::vector ExtractParameters( + const std::vector>& nodes); + bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) - desc_ = argument->origin_program_desc.get(); - // Here some logic from program_desc.cc and will not add new interfaces into - // framework::ProgramDesc class, use some UT to assure the correctness. - auto* block = desc_->mutable_blocks()->Add(); - block->set_idx(framework::kRootBlockIndex); - block->set_parent_idx(framework::kNoneBlockIndex); + PADDLE_ENFORCE(!argument->transformed_program_desc); + // The transformed_program_desc should inherit all the VarDesc and BlockDesc + // from the original program desc. The operators of the main block(the first + // block) should rewritten by data flow graph. + argument->transformed_program_desc.reset( + new ProgramDesc(*argument->origin_program_desc)); + argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex) + ->clear_ops(); + desc_ = argument->transformed_program_desc.get(); + argument_ = argument; return true; } @@ -37,14 +50,17 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { auto traits = GraphTraits(graph); for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { if (it->deleted()) continue; + switch (it->type()) { - case Node::Type::kFunction: - LOG(INFO) << "add function " << it->name(); + case Node::Type::kFunction: { + LOG(INFO) << "add function " << it->repr(); AddFluidOp(&(*it)); - break; - case Node::Type::kFunctionBlock: + } break; + case Node::Type::kFunctionBlock: { + LOG(INFO) << "add engine op " << it->repr() << " , " + << static_cast(&(*it))->subgraph.size(); AddEngineOp(&(*it)); - break; + } break; default: continue; } @@ -52,12 +68,10 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { } void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { - LOG(INFO) << "processing func " << node->name(); auto* ori_op = static_cast(node->pb_desc()); // currently only the main block is analyzed. auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto* op = main_block->add_ops(); - LOG(INFO) << "to copy the op"; *op = *ori_op; // copy the attributes, by default, these will not be changed // by analysis phrase. // The inputs and outputs of the existing ops are not changed by tensorrt @@ -65,11 +79,89 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { // NOTE It might be changed by other passes in the long run. 
} +void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, + const framework::proto::BlockDesc& block) { + static int counter{0}; + PADDLE_ENFORCE(node->IsFunctionBlock()); + framework::OpDesc desc; + auto* func = static_cast(node); + + // collect inputs + std::vector io; + for (auto* x : func->inlinks) { + io.push_back(x->name()); + } + desc.SetInput("Xs", io); + + // collect outputs + io.clear(); + for (auto* x : func->outlinks) { + io.push_back(x->name()); + } + desc.SetOutput("Ys", io); + + desc.SetType("tensorrt_engine"); + // Set attrs + SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); + SetAttr(desc.Proto(), "engine_unique_key", + "trt-" + std::to_string(counter++)); + SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "max_workspace", + 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); + node->SetPbMsg(desc.Proto()->SerializeAsString()); +} + +std::vector ExtractParameters( + const std::vector>& nodes) { + std::vector parameters; + for (const auto& node : nodes) { + if (!node->IsValue()) continue; + PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); + framework::proto::VarDesc var; + var.ParseFromString(node->pb_msg()); + if (var.persistable()) { + parameters.push_back(var.name()); + } + } + return parameters; +} + void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { - // auto* ori_op = static_cast(node->extra_info()); - // auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - // auto* op = main_block->add_ops(); // TODO(Superjomn) Here need to expose some arguments for default setting. + PADDLE_ENFORCE(node->IsFunctionBlock()); + auto* block_node = static_cast(node); + framework::proto::BlockDesc proto; + framework::BlockDesc block_desc(nullptr, &proto); + // copy ops. 
+ for (auto* node : block_node->subgraph) { + auto* op = block_desc.AppendOp(); + PADDLE_ENFORCE(!node->pb_msg().empty()); + op->Proto()->ParseFromString(node->pb_msg()); + } + CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); + auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto* op = main_block->add_ops(); + PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); + op->ParseFromString(node->pb_msg()); +} + +namespace { +class DFG_DebuggerPass : public DFG_GraphvizDrawPass { + public: + using Config = DFG_GraphvizDrawPass::Config; + DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {} + + std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } + + bool Finalize() override { return true; } +}; +} + +Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { + return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( + FLAGS_inference_analysis_graphviz_log_root, + "data_flow_graph_to_fluid_graphviz_debugger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index cbb05f622c..1726e056ed 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -40,10 +40,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { return "Transform a DFG to a Fluid ProgramDesc"; } - Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const override { - return nullptr; - } + Pass *CreateGraphvizDebugerPass() const override; protected: // Add a Fluid Op into the ProgramDesc. @@ -53,6 +50,7 @@ class DataFlowGraphToFluidPass final : public DataFlowGraphPass { private: framework::proto::ProgramDesc *desc_; + Argument *argument_; }; } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index afffb3feb0..a6f8548475 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -18,12 +18,19 @@ namespace paddle { namespace inference { namespace analysis { +int DFG_GraphvizDrawPass::counter_{0}; + void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { auto content = Draw(graph); - std::ofstream file(GenDotPath()); + auto dot_path = GenDotPath(); + std::ofstream file(dot_path); file.write(content.c_str(), content.size()); file.close(); - LOG(INFO) << "draw dot to " << GenDotPath(); + + auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; + std::string message; + LOG(INFO) << "draw to " << png_path; + ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); } std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { @@ -41,9 +48,7 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { if (!config_.display_deleted_node && node.deleted()) continue; for (auto &in : node.inlinks) { if (!config_.display_deleted_node && in->deleted()) continue; - for (auto &in : node.inlinks) { - dot.AddEdge(in->repr(), node.repr(), {}); - } + dot.AddEdge(in->repr(), node.repr(), {}); } } return dot.Build(); diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index 93ebff59ae..b064782586 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ 
b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -50,20 +50,25 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { bool Initialize(Argument *argument) override { return true; } void Run(DataFlowGraph *graph) override; - bool Finalize() override { return Pass::Finalize(); } + bool Finalize() override { return true; } std::string repr() const override { return "DFG graphviz drawer"; } std::string description() const override { return "Debug a DFG by draw with graphviz"; } - private: + protected: + // A counter to add a number prefix to the debugger image output so that they + // will sort in the triggered order. + static int counter_; + // Path of the dot file to output. std::string GenDotPath() const { - return config_.dir + "/" + "graph_" + config_.id + ".dot"; + return config_.dir + "/" + std::to_string(counter_++) + "-graph_" + + config_.id + ".dot"; } - std::string Draw(DataFlowGraph *graph); + virtual std::string Draw(DataFlowGraph *graph); Config config_; }; diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index f4b5c5fd22..162455b9c4 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -31,7 +31,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { pass.Run(&dfg); // test content - std::ifstream file("./graph_test.dot"); + std::ifstream file("./0-graph_test.dot"); ASSERT_TRUE(file.is_open()); std::string line; @@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { no++; } // DFG is sensitive to ProgramDesc, be careful to change the existing models. - ASSERT_EQ(no, 112); + ASSERT_EQ(no, 82); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 5f62eef528..5d7eb43b7c 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include "analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" namespace paddle { @@ -33,7 +35,7 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { return true; } -bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); } +bool FluidToDataFlowGraphPass::Finalize() { return true; } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); @@ -46,6 +48,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { auto *v = graph->nodes.Create(Node::Type::kValue); v->SetName(var.name()); v->SetPbDesc(const_cast(static_cast(&var))); + v->SetPbMsg(var.SerializeAsString()); var2id[var.name()] = v->id(); } for (int i = 0; i < main_block.ops_size(); i++) { @@ -56,6 +59,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { // Link to the original protobuf message's memory, make it easier to // generate from a data flow graph to fluid ProgramDesc. o->SetPbDesc(const_cast(static_cast(&op))); + o->SetPbMsg(op.SerializeAsString()); + // set inputs and outputs // TODO(Superjomn) make sure the InputNames is the real variable name. 
for (int j = 0; j < op.inputs_size(); j++) { @@ -79,9 +84,19 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { graph->Build(); } -Pass *FluidToDataFlowGraphPass::CreatePrinterPass( - std::ostream &os, const std::string &banner) const { - return nullptr; +namespace { +class DFG_DebuggerPass : public DFG_GraphvizDrawPass { + public: + using Config = DFG_GraphvizDrawPass::Config; + DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} + std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } + bool Finalize() override { return true; } +}; +} + +Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { + return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( + FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index 176faf0220..da8463b63b 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -46,8 +46,7 @@ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { return "transform a fluid ProgramDesc to a data flow graph."; } - Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const override; + Pass *CreateGraphvizDebugerPass() const override; private: framework::proto::ProgramDesc const *desc_; diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc new file mode 100644 index 0000000000..ca40c01fc5 --- /dev/null +++ b/paddle/fluid/inference/analysis/helper.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
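The SetAttr<T> specializations defined below are what CreateTrtEngineOp above relies on to attach the serialized sub-graph and its settings to the generated op; a minimal usage sketch with illustrative values:

    #include <string>
    #include <vector>
    #include "paddle/fluid/framework/framework.pb.h"
    #include "paddle/fluid/inference/analysis/helper.h"

    // Build a bare tensorrt_engine OpDesc the way the
    // data-flow-graph-to-fluid pass does.
    void BuildEngineOp(paddle::framework::proto::OpDesc *op,
                       const std::string &serialized_block,
                       const std::vector<std::string> &params) {
      using paddle::inference::analysis::SetAttr;
      op->set_type("tensorrt_engine");
      SetAttr(op, "subgraph", serialized_block);
      SetAttr(op, "engine_unique_key", std::string("trt-0"));  // illustrative
      SetAttr(op, "max_batch", 100);       // placeholder, to be configurable
      SetAttr(op, "max_workspace", 1024);  // placeholder, to be configurable
      SetAttr(op, "parameters", params);
    }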
+ +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace inference { +namespace analysis { + +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const std::string &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int64_t &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONG); + attr->set_l(data); +} +template <> +void SetAttr>(framework::proto::OpDesc *op, + const std::string &name, + const std::vector &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + for (const auto &s : data) { + attr->add_strings(s.c_str()); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f0039e1131..fff1621d3f 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -14,10 +14,12 @@ limitations under the License. */ #pragma once +#include #include #include #include +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" @@ -26,6 +28,10 @@ namespace paddle { namespace inference { namespace analysis { +template +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const T &data); + template int AccuDims(Vec &&vec, int size) { int res = 1; @@ -93,7 +99,7 @@ template class OrderedRegistry { public: T *Register(const std::string &name, T *x) { - PADDLE_ENFORCE(!dic_.count(name)); + PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); dic_[name] = data_.size(); data_.emplace_back(std::unique_ptr(x)); return data_.back().get(); @@ -117,6 +123,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { return *var->GetMutable(); } +static void ExecShellCommand(const std::string &cmd, std::string *message) { + char buffer[128]; + std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) { + LOG(ERROR) << "error running command: " << cmd; + return; + } + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != nullptr) { + *message += buffer; + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index 3339b5044d..d9d265d225 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -20,6 +20,17 @@ namespace paddle { namespace inference { namespace analysis { +template <> +std::string &NodeAttr::As() { + if (data_.empty()) { + type_hash_ = typeid(std::string).hash_code(); + } + PADDLE_ENFORCE_EQ(type_hash_, typeid(std::string).hash_code()); + return data_; +} + +std::string &NodeAttr::String() { return As(); } + std::vector Value::dot_attrs() const { return std::vector({Dot::Attr("style", "filled,rounded"), 
Dot::Attr("shape", "box"), diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 8c2e6d88b9..8ecd1ae730 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -35,6 +35,44 @@ namespace analysis { class NodeMap; +// A helper class to maintain the status from Pass. +struct NodeAttr { + // NOTE T should be a primary type or a struct combined by several primary + // types. + // NOTE the STL containers should not use here. + // Some usages + // Attr attr; + // attr.Bool() = true; + + bool &Bool() { return As(); } + float &Float() { return As(); } + int32_t &Int32() { return As(); } + int64_t &Int64() { return As(); } + void *&Pointer() { return As(); } + std::string &String(); + + private: + template + T &As() { + // init storage in the first usage. + if (data_.empty()) { + VLOG(4) << "resize data to " << sizeof(T); + type_hash_ = typeid(T).hash_code(); + data_.resize(sizeof(T)); + } + PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), + "type not matched, origin is %s, want %s", + DataTypeNamer::Global().repr(type_hash_), + DataTypeNamer::Global().repr()); + PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); + return *reinterpret_cast(&data_[0]); + } + + private: + std::string data_; + size_t type_hash_{std::numeric_limits::max()}; +}; + /* * Node Representation. * @@ -50,8 +88,6 @@ class Node { Node() = default; - struct Attr; - // Cast to a subclass type, Function for example. template Subclass &As() { @@ -71,7 +107,7 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - Attr &attr(const std::string &name) const { return attrs_[name]; } + NodeAttr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } @@ -80,6 +116,9 @@ class Node { void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } void *pb_desc() const { return attr("pb_desc").Pointer(); } + void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; } + const std::string &pb_msg() const { return attr("pb_msg").String(); } + void SetDeleted() { deleted_ = true; } bool deleted() const { return deleted_; } @@ -94,43 +133,6 @@ class Node { // Output links. std::vector outlinks; - // A helper class to maintain the status from Pass. - struct Attr { - // NOTE T should be a primary type or a struct combined by several primary - // types. - // NOTE the STL containers should not use here. - // Some usages - // Attr attr; - // attr.Bool() = true; - - bool &Bool() { return As(); } - float &Float() { return As(); } - int32_t &Int32() { return As(); } - int64_t &Int64() { return As(); } - void *&Pointer() { return As(); } - - private: - template - T &As() { - // init storage in the first usage. - if (data_.empty()) { - VLOG(4) << "resize data to " << sizeof(T); - type_hash_ = typeid(T).hash_code(); - data_.resize(sizeof(T)); - } - PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), - "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_hash_), - DataTypeNamer::Global().repr()); - PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); - return *reinterpret_cast(&data_[0]); - } - - private: - std::string data_; - size_t type_hash_{std::numeric_limits::max()}; - }; - // Type checks. 
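NodeAttr, now hoisted out of Node, is the scratch space that passes use to hand per-node information to one another; together with the pb_msg accessors above and the ATTR_supported_by_tensorrt flag declared in node_attr_flags.h below, typical producer/consumer usage looks like this sketch:

    #include <string>
    #include "paddle/fluid/inference/analysis/node.h"
    #include "paddle/fluid/inference/analysis/node_attr_flags.h"

    // One pass writes attributes on a node, a later pass reads them back.
    void MarkAndInspect(paddle::inference::analysis::Node *node) {
      namespace analysis = paddle::inference::analysis;
      // Producer side: record the TensorRT capability flag and stash the
      // serialized op proto (payload here is a placeholder).
      node->attr(analysis::ATTR_supported_by_tensorrt).Bool() = true;
      node->SetPbMsg("serialized-op-desc-bytes");

      // Consumer side: read the same attributes without touching the
      // original ProgramDesc memory.
      bool on_trt = node->attr(analysis::ATTR_supported_by_tensorrt).Bool();
      const std::string &msg = node->pb_msg();
      (void)on_trt;
      (void)msg;
    }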
bool IsFunction() const { return type_ == Node::Type::kFunction; } bool IsValue() const { return type_ == Node::Type::kValue; } @@ -150,7 +152,7 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - mutable std::unordered_map attrs_; + mutable std::unordered_map attrs_; }; class Function; @@ -213,6 +215,10 @@ class Function : public Node { struct FunctionBlock : public Node { std::string repr() const override { return "block-" + std::to_string(id()); } std::vector subgraph; + + protected: + FunctionBlock() { SetType(Node::Type::kFunctionBlock); } + friend class NodeMap; }; class NodeMap { @@ -227,7 +233,7 @@ class NodeMap { void Delete(size_t id); - const std::vector> &nodes() { return nodes_; } + const std::vector> &nodes() const { return nodes_; } size_t size() const { return nodes_.size(); } diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/node_attr_flags.h new file mode 100644 index 0000000000..a3f70e5419 --- /dev/null +++ b/paddle/fluid/inference/analysis/node_attr_flags.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file contains all the flags that declared in Node::Attr. + * + * The Node::Attr is designed to share information between different passes, one + * can get other's attributes in a Node by the flags in this file. + */ +#pragma once +namespace paddle { +namespace inference { +namespace analysis { + +#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__; + +DECLARE_NODE_ATTR(supported_by_tensorrt) // bool + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 65632b7491..25c566ebfa 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -60,6 +60,9 @@ class Pass { return nullptr; } + // Create a debugger Pass that draw the DFG by graphviz toolkit. + virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. 
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index b17c0e0d72..b428bb22b1 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -19,6 +19,18 @@ namespace paddle { namespace inference { namespace analysis { +bool PassManager::Initialize(Argument* argument) { + argument_ = argument; + for (auto& pass : data_) { + LOG(INFO) << "Initializing pass " << pass->repr(); + if (!pass->Initialize(argument)) { + LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; + return false; + } + } + return true; +} + void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); for (auto& pass : data_) { diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h index 7841c4b9d0..81a17e0287 100644 --- a/paddle/fluid/inference/analysis/pass_manager.h +++ b/paddle/fluid/inference/analysis/pass_manager.h @@ -50,17 +50,7 @@ class PassManager : public OrderedRegistry { // globally shared, so pass them as the arguemnts for all the pass managers. virtual bool Initialize(const Argument& argument) { return false; } - virtual bool Initialize(Argument* argument) { - argument_ = argument; - for (auto& pass : data_) { - LOG(INFO) << "Initializing pass " << pass->repr(); - if (!pass->Initialize(argument)) { - LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; - return false; - } - } - return true; - } + virtual bool Initialize(Argument* argument); // Call all the passes' Finalize methods. virtual bool Finalize() { diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index 7af6a19951..6caba8f042 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -64,6 +64,7 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 43ccac96c8..389f9e1a91 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { + std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. - auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock); + auto *block_node = static_cast( + graph_->nodes.Create(Node::Type::kFunctionBlock)); auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); @@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. 
node->SetDeleted(); + block_node->subgraph.push_back(node); } - std::unordered_map - delelte_node_map; // deleted node to BlockNode - for (auto *n : block_node->inlinks) { - n->inlinks.clear(); - } - for (auto *n : block_node->outlinks) { - n->outlinks.clear(); - } - for (auto *n : block_node->inlinks) { - n->outlinks.push_back(block_node); + // Change all the sub-graph's inputs and outputs corresponding inlink and + // outlink to this sub-graph node. + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (subgraph_uniq.count(n)) { + n = block_node; + } + } + std::unordered_set uniq(nodes.begin(), nodes.end()); + nodes.assign(uniq.begin(), uniq.end()); + }; + for (auto *i : block_node->inlinks) { + inlink_or_outlink_cleaner(i->outlinks); } - for (auto *n : block_node->outlinks) { - n->inlinks.push_back(n); + for (auto *&o : block_node->outlinks) { + inlink_or_outlink_cleaner(o->inlinks); } } } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc new file mode 100644 index 0000000000..5ad092a9ed --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
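The node-mark pass below and SubGraphFuse above are driven by the same teller callback; a compact sketch of wiring them together on a built DataFlowGraph (Initialize/Finalize calls omitted, teller matching the placeholder used in DfgPassManagerImpl):

    #include "paddle/fluid/inference/analysis/data_flow_graph.h"
    #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
    #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"

    // Mark TensorRT-capable nodes, then fuse them into FunctionBlock nodes.
    void MarkAndFuse(paddle::inference::analysis::DataFlowGraph *dfg) {
      namespace analysis = paddle::inference::analysis;
      auto teller = [](const analysis::Node *node) {
        if (!node->IsFunction()) return false;
        return static_cast<const analysis::Function *>(node)->func_type() ==
               "mul";  // a real deployment would consult the TensorRT op set
      };
      analysis::TensorRTSubgraphNodeMarkPass mark_pass(teller);
      mark_pass.Run(dfg);  // sets ATTR_supported_by_tensorrt on every node
      analysis::TensorRTSubGraphPass fuse_pass(teller);
      fuse_pass.Run(dfg);  // replaces marked sub-graphs with block nodes
    }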
+ +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/node_attr_flags.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { + for (auto &node : graph->nodes.nodes()) { + node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get()); + } +} + +class DfgDebuggerPass : public DFG_GraphvizDrawPass { + public: + DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) + : DFG_GraphvizDrawPass(config) {} + + std::string repr() const override { + return "tensorrt-subgraph-node-mark-debugger"; + } + + bool Finalize() override { return true; } + + protected: + std::string Draw(DataFlowGraph *graph) override { + Dot dot; + // Add nodes + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (config_.display_deleted_node || !node.deleted()) { + auto dot_attr = node.dot_attrs(); + if (node.attr(ATTR_supported_by_tensorrt).Bool()) { + dot_attr.assign( + {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}}); + } + dot.AddNode(node.repr(), dot_attr); + } + } + // Add edges + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (!config_.display_deleted_node && node.deleted()) continue; + for (auto &in : node.inlinks) { + if (!config_.display_deleted_node && in->deleted()) continue; + dot.AddEdge(in->repr(), node.repr(), {}); + } + } + return dot.Build(); + } +}; + +Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { + DFG_GraphvizDrawPass::Config config( + FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); + return new DfgDebuggerPass(config); +} +bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h new file mode 100644 index 0000000000..6cfac55d3b --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops + * that supported by TensorRT engine. + */ +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Mark the operators that TensorRT engine supports. 
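 *
 * A teller functor decides, per node, whether the op can be handed to
 * TensorRT; Run() stores that verdict in the node's
 * ATTR_supported_by_tensorrt attribute so later passes can fuse the marked
 * region. A sketch of typical wiring, mirroring the tester in this patch:
 *
 *   TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
 *     return node->IsFunction() &&
 *            static_cast<const Function*>(node)->func_type() == "mul";
 *   };
 *   TensorRTSubgraphNodeMarkPass pass(teller);
 *   pass.Run(graph);  // graph is a DataFlowGraph*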
+ */ +class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { + public: + using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; + + TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {} + + bool Initialize(Argument* argument) override { return true; } + + // This class get a sub-graph as input and determine whether to transform this + // sub-graph into TensorRT. + void Run(DataFlowGraph* graph) override; + + std::string repr() const { return "tensorrt-sub-subgraph-mark"; } + std::string description() const { return "tensorrt sub-graph mark pass"; } + + Pass* CreateGraphvizDebugerPass() const override; + bool Finalize() override; + + private: + teller_t teller_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc new file mode 100644 index 0000000000..a6c15e848b --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" + +#include +#include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { + // init + FluidToDataFlowGraphPass pass; + ASSERT_TRUE(pass.Initialize(&argument)); + argument.main_dfg.reset(new DataFlowGraph); + pass.Run(argument.main_dfg.get()); + + TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { + return node->IsFunction() && + static_cast(node)->func_type() == "mul"; + }; + TensorRTSubgraphNodeMarkPass pass1(teller); + ASSERT_TRUE(pass1.Initialize(&argument)); + pass1.Run(argument.main_dfg.get()); + + int counter{0}; + for (auto& node : argument.main_dfg->nodes.nodes()) { + counter += node->attr(ATTR_supported_by_tensorrt).Bool(); + } + + LOG(INFO) << counter << " nodes marked"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index c7f40d43c9..9993de2280 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( : node_inside_subgraph_teller_(teller) {} void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_); + SubGraphFuse(graph, node_inside_subgraph_teller_)(); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 79e9e2bcc9..11e0880695 100644 --- 
a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -38,6 +38,11 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { // sub-graph into TensorRT. void Run(DataFlowGraph* graph) override; + bool Finalize() override { return true; } + + std::string repr() const { return "tensorrt-sub-graph"; } + std::string description() const { return "tensorrt sub graph pass"; } + private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; }; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index d12dcf0d0f..1d749d3fa3 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -23,49 +23,48 @@ namespace paddle { namespace inference { namespace analysis { -DEFINE_string(model_dir, "", "inference test model dir"); +DEFINE_string(dot_dir, "./", ""); -TEST(TensorRTSubGraph, single_pass) { - auto desc = LoadProgramDesc(); - auto dfg = ProgramDescToDFG(desc); - - SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { +TEST_F(DFG_Tester, tensorrt_single_pass) { + std::unordered_set teller_set( + {"elementwise_add", "mul", "sigmoid"}); + SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { if (node->type() != Node::Type::kFunction) return false; const auto* func = static_cast(node); - if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || - func->func_type() == "conv2d" || func->func_type() == "mul" || - func->func_type() == "sigmoid" || func->func_type() == "softmax") { - LOG(INFO) << "sub-graph marked " << node->repr(); - return true; - } + if (teller_set.count(func->func_type())) return true; return false; }; - DFG_GraphvizDrawPass::Config config{"./", "test"}; - DFG_GraphvizDrawPass dfg_pass(config); - dfg_pass.Initialize(); - - DFG_GraphvizDrawPass dfg_pass1(config); - dfg_pass1.Initialize(); - - dfg_pass.Run(&dfg); + LOG(INFO) << "init"; + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; + DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; + DFG_GraphvizDrawPass dfg_pass(config); + DFG_GraphvizDrawPass dfg_pass1(config1); + FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - trt_pass.Initialize(); - trt_pass.Run(&dfg); + LOG(INFO) << "Initialize"; + dfg_pass.Initialize(&argument); + dfg_pass1.Initialize(&argument); + pass0.Initialize(&argument); + trt_pass.Initialize(&argument); - dfg_pass1.Run(&dfg); + LOG(INFO) << "Run"; + argument.main_dfg.reset(new DataFlowGraph); + pass0.Run(argument.main_dfg.get()); + dfg_pass.Run(argument.main_dfg.get()); + trt_pass.Run(argument.main_dfg.get()); + dfg_pass1.Run(argument.main_dfg.get()); // Check the TRT op's block desc - for (auto node : dfg.nodes.nodes()) { + for (auto& node : argument.main_dfg->nodes.nodes()) { if (node->IsFunctionBlock()) { + LOG(INFO) << "get function block"; } } } -TEST(TensorRTSubGraph, pass_manager) {} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4c338c67d3..9dc39ad0dd 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -226,7 +226,8 @@ op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine) 
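The rewritten tensorrt_single_pass test above builds its teller from a whitelist: a node qualifies only if it is a function node whose op type appears in the supported set. A standalone sketch of that predicate follows; the Node and Function types here are simplified stand-ins rather than the real analysis classes.

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>

// Simplified stand-ins for analysis::Node / analysis::Function.
struct Node {
  virtual ~Node() = default;
  virtual bool IsFunction() const { return false; }
};
struct Function : Node {
  explicit Function(std::string type) : func_type(std::move(type)) {}
  bool IsFunction() const override { return true; }
  std::string func_type;
};

using Teller = std::function<bool(const Node*)>;

int main() {
  const std::unordered_set<std::string> trt_ops{"elementwise_add", "mul",
                                                "sigmoid"};
  // Whitelist-based teller: only function nodes with a supported op type
  // are allowed into the TensorRT sub-graph.
  Teller teller = [&](const Node* node) {
    if (!node->IsFunction()) return false;
    return trt_ops.count(static_cast<const Function*>(node)->func_type) > 0;
  };

  Function mul("mul"), softmax("softmax");
  std::cout << std::boolalpha << teller(&mul) << " " << teller(&softmax)
            << "\n";  // prints: true false
  return 0;
}
```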
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter) + DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 295d6ba039..1602a913ae 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -53,6 +53,7 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + VLOG(4) << "TensorRTEngineKernel executing"; auto engine_name = context.Attr("engine_uniq_key"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 358e2d151b..82a16361e4 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" @@ -51,48 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, *var = *desc.Proto(); } -template -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const T& data); - -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const std::string& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(data); -} -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const int& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::INT); - attr->set_i(data); -} -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const int64_t& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::LONG); - attr->set_l(data); -} -template <> -void SetAttr>(framework::proto::OpDesc* op, - const std::string& name, - const std::vector& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::STRINGS); - for (const auto& s : data) { - attr->add_strings(s.c_str()); - } -} - } // namespace +using inference::analysis::SetAttr; + TEST(TensorRTEngineOp, manual) { framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); From 921182484103b1b5e6256cb59e77cac8ed1c0272 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 28 Jun 2018 10:17:33 +0800 Subject: [PATCH 64/68] fix tensorrt compiler bug --- paddle/contrib/inference/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 153216abb4..ef768d989a 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -18,7 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api paddle_inference_tensorrt_subgraph_engine) +set(inference_deps 
paddle_inference_api paddle_fluid_api) +if(WITH_GPU AND TENSORRT_FOUND) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) +endif() function(inference_api_test TARGET_NAME) if (WITH_TESTING) From 2ecc56226d4d4b9151fdcfce9ffe5af6aa58eb5b Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 28 Jun 2018 12:40:51 +0800 Subject: [PATCH 65/68] small AverageOptimizer enhance. (#11761) * small AverageOptimizer enhance. * clean * clean --- .../fluid/operators/average_accumulates_op.cc | 22 +++++++++---------- .../fluid/operators/average_accumulates_op.h | 5 +++-- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 25864e95d7..f389eab605 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -19,28 +19,28 @@ namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t* num_updates_, - int64_t* num_accumulates_, int64_t* old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates, + int64_t* num_accumulates, int64_t* old_num_accumulates) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); - *old_num_accumulates_ = in_old_num_accumulates->data()[0]; - *num_accumulates_ = in_num_accumulates->data()[0]; - *num_updates_ = in_num_updates->data()[0]; + *old_num_accumulates = in_old_num_accumulates->data()[0]; + *num_accumulates = in_num_accumulates->data()[0]; + *num_updates = in_num_updates->data()[0]; } template <> void SetAccumulators( - const framework::ExecutionContext& ctx, int64_t num_updates_, - int64_t num_accumulates_, int64_t old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t num_updates, + int64_t num_accumulates, int64_t old_num_accumulates) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); - out_old_num_accumulates->data()[0] = old_num_accumulates_; - out_num_accumulates->data()[0] = num_accumulates_; - out_num_updates->data()[0] = num_updates_; + out_old_num_accumulates->data()[0] = old_num_accumulates; + out_num_accumulates->data()[0] = num_accumulates; + out_num_updates->data()[0] = num_updates; } class AverageAccumulatesOp : public framework::OperatorWithKernel { @@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( AverageAccumulates Operator. -Accumulate the sum of parameter whtin sliding window. The size of sliding window is +Accumulate the sum of parameter within sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'. Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. 'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. 
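The renames above drop the trailing underscores because these are ordinary function parameters; a trailing underscore is the convention C++ style guides such as Google's reserve for class data members. Functionally, GetAccumulators and SetAccumulators just round-trip three int64 counters through one-element tensors. Below is a toy sketch of that pattern; ScalarTensor is an illustrative stand-in, not the framework tensor type used by the real kernel.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Toy one-element tensor holding a single int64 scalar; a stand-in for the
// tensor inputs/outputs the real kernel reads and writes.
struct ScalarTensor {
  std::vector<int64_t> buf{0};
  int64_t* data() { return buf.data(); }
  const int64_t* data() const { return buf.data(); }
};

void GetAccumulators(const ScalarTensor& in_num_updates,
                     const ScalarTensor& in_num_accumulates,
                     const ScalarTensor& in_old_num_accumulates,
                     int64_t* num_updates, int64_t* num_accumulates,
                     int64_t* old_num_accumulates) {
  *num_updates = in_num_updates.data()[0];
  *num_accumulates = in_num_accumulates.data()[0];
  *old_num_accumulates = in_old_num_accumulates.data()[0];
}

void SetAccumulators(ScalarTensor* out_num_updates,
                     ScalarTensor* out_num_accumulates,
                     ScalarTensor* out_old_num_accumulates,
                     int64_t num_updates, int64_t num_accumulates,
                     int64_t old_num_accumulates) {
  out_num_updates->data()[0] = num_updates;
  out_num_accumulates->data()[0] = num_accumulates;
  out_old_num_accumulates->data()[0] = old_num_accumulates;
}

int main() {
  ScalarTensor u, a, o;
  SetAccumulators(&u, &a, &o, 10, 4, 6);
  int64_t num_updates, num_accumulates, old_num_accumulates;
  GetAccumulators(u, a, o, &num_updates, &num_accumulates,
                  &old_num_accumulates);
  std::cout << num_updates << " " << num_accumulates << " "
            << old_num_accumulates << "\n";  // prints: 10 4 6
  return 0;
}
```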
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 07ac5ced11..3958d3f685 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel { float average_window = ctx.Attr("average_window"); int64_t max_average_window = ctx.Attr("max_average_window"); int64_t min_average_window = ctx.Attr("min_average_window"); - min_average_window = - std::min(min_average_window, max_average_window); + PADDLE_ENFORCE_LE(min_average_window, max_average_window, + "min_average_window shouldn't be larger than " + "max_average_window"); // Get inputs auto* param = ctx.Input("param"); From f1224945ba4585e71beb33a30fcd7c88d912f796 Mon Sep 17 00:00:00 2001 From: superjomn Date: Thu, 28 Jun 2018 13:27:30 +0800 Subject: [PATCH 66/68] fix analysis compile bug --- paddle/fluid/inference/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index ec16a1c600..7071eea19c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -28,9 +28,10 @@ endif() if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) - add_subdirectory(analysis) endif() +add_subdirectory(analysis) + if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() From ba99bc238409b3be2e1d42f0d8baaf42b544d774 Mon Sep 17 00:00:00 2001 From: superjomn Date: Thu, 28 Jun 2018 14:02:13 +0800 Subject: [PATCH 67/68] update --- .../fluid/inference/analysis/CMakeLists.txt | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 33b0e3b127..cdd67fdc92 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -13,16 +13,18 @@ cc_test(test_dot SRCS dot_tester.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) function (inference_analysis_test TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_test(${TARGET} - SRCS "${analysis_test_SRCS}" - DEPS analysis - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) - set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) + cc_test(${TARGET} + SRCS "${analysis_test_SRCS}" + DEPS analysis + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) + set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) + endif(WITH_TESTING) endfunction(inference_analysis_test) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) From 995ead08496ccfe2560fece740be903f50cc7bec Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 28 Jun 2018 15:50:04 +0800 Subject: [PATCH 68/68] add timeline_cn --- doc/fluid/howto/optimization/timeline_cn.md | 26 +++++++++++++++++++ .../{timeline.md => timeline_en.md} | 0 2 files changed, 26 insertions(+) 
create mode 100644 doc/fluid/howto/optimization/timeline_cn.md rename doc/fluid/howto/optimization/{timeline.md => timeline_en.md} (100%) diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md new file mode 100644 index 0000000000..5d061e1c00 --- /dev/null +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -0,0 +1,26 @@ +# 如何使用timeline工具做性能分析 + +1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 + + **提示:** + 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。 + + ```python + with profiler.profiler('All', 'total', '/tmp/profile') as prof: + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) + ... + ``` + +1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。 + +1. 打开chrome浏览器,访问<chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。 + + ![chrome tracing](./tracing.jpeg) + +1. 结果如下图所示,可以放大来查看timeline的细节信息。 + + ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline_en.md similarity index 100% rename from doc/fluid/howto/optimization/timeline.md rename to doc/fluid/howto/optimization/timeline_en.md