From 79cec5311179e6e50b0126fea0e6dfa8a7cf354a Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrwgz@126.com>
Date: Tue, 20 Nov 2018 12:37:04 +0000
Subject: [PATCH 01/90] add ignore index for sigmoid cross entropy with logits
 op, test=develop

---
 .../sigmoid_cross_entropy_with_logits_op.cc   |  5 +
 .../sigmoid_cross_entropy_with_logits_op.h    | 93 ++++++++++++++-----
 python/paddle/fluid/layers/nn.py              |  5 +-
 .../fluid/tests/unittests/test_layers.py      |  3 +-
 ...st_sigmoid_cross_entropy_with_logits_op.py | 35 +++++++
 5 files changed, 116 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 193de05422..d6a2fa6a17 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -100,6 +100,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddOutput("Out",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
               " of elementwise logistic losses.");
+    AddAttr<int>(
+        "ignore_index",
+        "(int, default -1), Specifies a target value that is ignored and"
+        "does not contribute to the input gradient.")
+        .SetDefault(-1);
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index faef72866e..2bfba6f170 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -15,33 +15,82 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct SigmoidCrossEntropyWithLogitsForward {
+  // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward)
+  HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
+      : ignore_index(ignore_index) {}
+
+  HOSTDEVICE T operator()(const T &x, const T &label) const {
+    if (static_cast<int>(label) == ignore_index) {
+      return static_cast<T>(0.);
+    }
+    T term1 = (x > 0) ? x : 0;
+    T term2 = x * label;
+    T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
+    return term1 - term2 + term3;
+  }
+
+  int ignore_index;
+};
+
+template <typename T>
+struct SigmoidCrossEntropyWithLogitsBackward {
+  // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward)
+  HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
+      : ignore_index(ignore_index) {}
+
+  HOSTDEVICE T operator()(const T &x, const T &label) const {
+    if (static_cast<int>(label) == ignore_index) {
+      return static_cast<T>(0.);
+    }
+    T sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+    return sigmoid_x - label;
+  }
+
+  int ignore_index;
+};
+
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    Tensor *Out = context.Output<Tensor>("Out");
     Out->mutable_data<T>(context.GetPlace());
+    int ignore_index = context.Attr<int>("ignore_index");
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto x = EigenVector<T>::Flatten(*X);
+    auto labels = EigenVector<T>::Flatten(*Labels);
+    auto out = EigenVector<T>::Flatten(*Out);
     auto &place = *context.device_context<DeviceContext>().eigen_device();
 
+    out.device(place) = x.binaryExpr(
+        labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
     // term1 = max(x, 0)
-    auto term1 = x.cwiseMax(static_cast<T>(0));
+    // auto term1 = x.cwiseMax(static_cast<T>(0));
     // term2 = x * labels
-    auto term2 = x * labels;
+    // auto term2 = x * labels;
     // term3 = log(1 + exp(-abs(x)))
-    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
+    // auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
 
-    out.device(place) = term1 - term2 + term3;
+    // out.device(place) = term1 - term2 + term3;
   }
 };
 
@@ -50,23 +99,23 @@ template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-    const framework::Tensor *dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    framework::Tensor *dX =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
     dX->mutable_data<T>(context.GetPlace());
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto ignore_index = context.Attr<int>("ignore_index");
+    auto x = EigenVector<T>::Flatten(*X);
+    auto labels = EigenVector<T>::Flatten(*Labels);
+    auto dout = EigenVector<T>::Flatten(*dOut);
+    auto dx = EigenVector<T>::Flatten(*dX);
     auto &place =
         *context.template device_context<DeviceContext>().eigen_device();
 
-    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-    dx.device(place) = dout * (sigmoid_x - labels);
+    auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
+                                         static_cast<int>(ignore_index)));
+    dx.device(place) = dout * diff;
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 99acd7e308..e032835de3 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -7892,13 +7892,14 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
 
 
 @templatedoc()
-def sigmoid_cross_entropy_with_logits(x, label, name=None):
+def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-1, name=None):
     """
     ${comment}
 
     Args:
         x(${x_type}): ${x_comment}
         label(${label_type}): ${label_comment}
+        ignore_index(&{ignore_index}): ${ignore_index_comment}
         name(basestring|None): Name of the output.
 
     Returns:
@@ -7917,7 +7918,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
         type="sigmoid_cross_entropy_with_logits",
         inputs={"X": x,
                 "Label": label},
-        attrs={},
+        attrs={"ignore_index": ignore_index},
         outputs={"Out": out})
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index a8fa5436c4..8e098e4961 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             dat = layers.data(name='data', shape=[10], dtype='float32')
             lbl = layers.data(name='label', shape=[10], dtype='float32')
+            ignore_index = -1
             self.assertIsNotNone(
                 layers.sigmoid_cross_entropy_with_logits(
-                    x=dat, label=lbl))
+                    x=dat, label=lbl, ignore_index=-1))
         print(str(program))
 
     def test_hsigmoid(self):
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index 97ff203499..64f6f088e1 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
     """
 
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+        self.attrs = {'ignore_index': ignore_index, }
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - Label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
+    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
+    """
+
     def setUp(self):
         self.op_type = "sigmoid_cross_entropy_with_logits"
         batch_size = 64
@@ -85,3 +119,4 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
 
 if __name__ == '__main__':
     unittest.main()
+    np.random.seed(0)
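
For reference, the forward functor added in this patch computes the numerically
stable elementwise loss max(x, 0) - x * label + log(1 + exp(-|x|)) and zeroes
every position whose label equals ignore_index; the backward functor likewise
returns sigmoid(x) - label with ignored positions zeroed. A minimal NumPy sketch
of the same computation (function names are illustrative, float inputs of equal
shape assumed):

    import numpy as np

    def sigmoid_ce_with_logits(x, label, ignore_index=-1):
        # stable form of -label*log(sigmoid(x)) - (1-label)*log(1-sigmoid(x))
        loss = np.maximum(x, 0) - x * label + np.log1p(np.exp(-np.abs(x)))
        loss[label == ignore_index] = 0.0  # ignored targets contribute nothing
        return loss

    def sigmoid_ce_with_logits_grad(x, label, dout, ignore_index=-1):
        # d(loss)/dx = sigmoid(x) - label, zeroed at ignored positions
        grad = 1.0 / (1.0 + np.exp(-x)) - label
        grad[label == ignore_index] = 0.0
        return dout * grad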

From 13e254faedd2c464fa14057d90c66995b2b4f159 Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrwgz@126.com>
Date: Tue, 20 Nov 2018 13:08:23 +0000
Subject: [PATCH 02/90] refine code, test=develop

---
 paddle/fluid/API.spec                                  |  2 +-
 .../operators/sigmoid_cross_entropy_with_logits_op.cc  |  4 ++--
 .../operators/sigmoid_cross_entropy_with_logits_op.h   | 10 ----------
 python/paddle/fluid/layers/nn.py                       |  2 +-
 python/paddle/fluid/tests/unittests/test_layers.py     |  2 +-
 .../test_sigmoid_cross_entropy_with_logits_op.py       |  1 -
 6 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index da8941c351..f84ec4cb3e 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -174,7 +174,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
 paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index d6a2fa6a17..368988d60d 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -102,9 +102,9 @@ class SigmoidCrossEntropyWithLogitsOpMaker
               " of elementwise logistic losses.");
     AddAttr<int>(
         "ignore_index",
-        "(int, default -1), Specifies a target value that is ignored and"
+        "(int, default -100), Specifies a target value that is ignored and"
         "does not contribute to the input gradient.")
-        .SetDefault(-1);
+        .SetDefault(-100);
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index 2bfba6f170..b8731c2327 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -31,7 +31,6 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
 struct SigmoidCrossEntropyWithLogitsForward {
-  // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward)
   HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
       : ignore_index(ignore_index) {}
 
@@ -50,7 +49,6 @@ struct SigmoidCrossEntropyWithLogitsForward {
 
 template <typename T>
 struct SigmoidCrossEntropyWithLogitsBackward {
-  // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward)
   HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
       : ignore_index(ignore_index) {}
 
@@ -83,14 +81,6 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
 
     out.device(place) = x.binaryExpr(
         labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
-    // term1 = max(x, 0)
-    // auto term1 = x.cwiseMax(static_cast<T>(0));
-    // term2 = x * labels
-    // auto term2 = x * labels;
-    // term3 = log(1 + exp(-abs(x)))
-    // auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
-
-    // out.device(place) = term1 - term2 + term3;
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index e032835de3..38da9173cc 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -7892,7 +7892,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
 
 
 @templatedoc()
-def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-1, name=None):
+def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-100, name=None):
     """
     ${comment}
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 8e098e4961..326938e115 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -173,7 +173,7 @@ class TestBook(unittest.TestCase):
             ignore_index = -1
             self.assertIsNotNone(
                 layers.sigmoid_cross_entropy_with_logits(
-                    x=dat, label=lbl, ignore_index=-1))
+                    x=dat, label=lbl, ignore_index=ignore_index))
         print(str(program))
 
     def test_hsigmoid(self):
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index 64f6f088e1..41797a241c 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -119,4 +119,3 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
 
 if __name__ == '__main__':
     unittest.main()
-    np.random.seed(0)
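
With the default now aligned to -100 across the op attribute, the API spec, and
the Python layer, a minimal usage sketch of the new argument (variable names are
illustrative):

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[10], dtype='float32')
    label = fluid.layers.data(name='label', shape=[10], dtype='float32')
    # entries whose label equals ignore_index (default -100) contribute
    # neither to the loss nor to the gradient of x
    loss = fluid.layers.sigmoid_cross_entropy_with_logits(
        x=x, label=label, ignore_index=-100)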

From e0b48f7e29fced72f439896fed46b76adc945035 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 22 Nov 2018 16:44:15 +0800
Subject: [PATCH 03/90] init lookup remote table

---
 .../distributed_ops/lookup_remote_table.h     | 192 ++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 paddle/fluid/operators/distributed_ops/lookup_remote_table.h

diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table.h
new file mode 100644
index 0000000000..5b066c8196
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+#include <vector>
+#include <set>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+inline size_t GetSectionIndex(int64_t id, const std::vector<int64_t>& abs_sections) {
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (row < abs_sections[i]) {
+      return i - 1;
+    }
+  }
+  return abs_sections.size() - 1;
+}
+
+inline std::vector<int64_t> ToAbsoluteSection(
+        const std::vector<int64_t>& height_sections) {
+  std::vector<int64_t> abs_sections;
+  abs_sections.resize(height_sections.size());
+  abs_sections[0] = 0;
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
+}
+
+inline std::vector<std::vector<int64_t>> SplitIds(
+        const std::string& id_name,
+        const std::vector<int64_t>& height_section,
+        framework::Scope* scope) {
+  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  std::set<int64_t> all_ids;
+  for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    all_ids.insert(id_data[i]);
+  }
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::vector<std::vector<int64_t>> splited_ids;
+  splited_ids.resize(height_section.size() + 1);
+  for (auto& id : all_ids) {
+    auto section_index = GetSectionIndex(id);
+    splited_ids[section_index].push_back(id - abs_sections[section_index]);
+  }
+}
+
+inline void SplitIdsIntoMultipleVarsBySection(
+        const std::string& id_name,
+        const std::vector<std::string>& in_var_names,
+        const std::vector<int64_t>& height_section,
+        const std::vector<std::vector<int64_t>>& splited_ids,
+        framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+
+  auto place = platform::CPUPlace();
+
+  for (size_t i = 0; i < in_var_names.size(); ++i) {
+    auto* id_tensor = scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
+    auto& ids = splited_ids[i];
+    if (!ids.empty()) {
+      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(framework::make_ddim({ids.size(), 1}), place);
+      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
+    }
+  }
+}
+
+inline void MergeMultipleVarsIntoOnBySection(
+        const std::string& id_name,
+        const std::string& out_name,
+        const std::vector<std::string>& out_var_names,
+        const std::vector<int64_t>& height_section,
+        const std::vector<std::vector<int64_t>>& splited_ids,
+        framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+
+  auto cpu_place = platform::CPUPlace();
+
+  auto abs_sections = ToAbsoluteSection(height_section);
+  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
+  for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    id_to_offset[id_data[i]].push_back(i);
+  }
+
+  auto& out_tensor = scope->Var(out_name)->Get<framework::LoDTensor>();
+  auto* out_tensor_data = out_tensor.mutable_data<float>();
+
+  for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) {
+    auto& ids_in_this_section = splited_ids[section_idx];
+    auto& prefetch_out_var = scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+    const auto* out_var_data = prefetch_out_var.mutable_data<float>();
+    auto& dims = prefetch_out_var.dims();
+
+    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
+    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
+
+    auto row_numel = dims[1];
+
+    for (size_t i = 0; i < dims[0]; ++i) {
+      auto id = ids_in_this_section[i];
+      auto origin_id = id + abs_sections[section_idx];
+      auto& offsets = id_to_offset[origin_id];
+      for (auto& offset : offsets) {
+        // should support GPU tensor
+        memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
+                     cpu_place, out_var_data + i * grad_row_numel,
+                     sizeof(T) * grad_row_numel);
+      }
+    }
+  }
+}
+
+inline void prefetch(
+        const std::string& table_name,
+        const std::string& id_name,
+        const std::string& out_name,
+        const std::vector<std::string>& epmap,
+        const std::vector<int64_t>& height_section,
+        const framework::Scope& scope,
+        const platform::Place& place) const {
+
+  auto local_scope = scope.NewScope();
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  distributed::RPCClient* rpc_client =
+          distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+                  Attr<int>("trainer_id"));
+
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
+
+  auto splited_ids = SplitIds(id_name, height_section, local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, splited_ids, local_scope);
+
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+  }
+
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < ins.size(); i++) {
+    if (NeedSend(local_scope, ins[i])) {
+      VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+      << outs[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, local_scope,
+                                                  in_var_names[i], out_var_names[i]));
+    } else {
+      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
+    }
+  }
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+  }
+
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, height_section, plited_ids, scope)
+
+  scope.DeleteScope(local_scope);
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
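
For reference, the helpers introduced above shard a table by rows:
ToAbsoluteSection turns per-server heights into absolute row offsets via a
prefix sum, GetSectionIndex maps a global id to the section holding it, and
SplitIds rewrites each id as an offset local to its section. A small Python
sketch of the same arithmetic (values illustrative):

    def to_absolute_section(height_sections):
        # prefix sum: heights [3, 5, 2] -> start offsets [0, 3, 8]
        abs_sections = [0]
        for height in height_sections[:-1]:
            abs_sections.append(abs_sections[-1] + height)
        return abs_sections

    def get_section_index(idx, abs_sections):
        # last section whose start offset is <= idx
        for i in range(1, len(abs_sections)):
            if idx < abs_sections[i]:
                return i - 1
        return len(abs_sections) - 1

    abs_sections = to_absolute_section([3, 5, 2])   # [0, 3, 8]
    assert get_section_index(6, abs_sections) == 1  # row 6 lives in section 1
    assert 6 - abs_sections[1] == 3                 # as local row 3 there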

From 60a4f69b3c1af76e27c9c91e929eb6cac8c07730 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 22 Nov 2018 17:11:15 +0800
Subject: [PATCH 04/90] add lookup remote table op

---
 .../distributed_ops/lookup_remote_table_op.cc | 104 +++++++++++++
 ...emote_table.h => lookup_remote_table_op.h} | 141 +++++++++++++-----
 2 files changed, 204 insertions(+), 41 deletions(-)
 create mode 100644 paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
 rename paddle/fluid/operators/distributed_ops/{lookup_remote_table.h => lookup_remote_table_op.h} (54%)

diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
new file mode 100644
index 0000000000..06e96a7f98
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h"
+#include "paddle/fluid/framework/var_type_inference.h"
+
+namespace paddle {
+namespace operators {
+
+class LookupRemoteTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(W) of LookupRemoteTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of LookupRemoteTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LookupRemoteTableOp should not be null.");
+
+    auto table_dims = ctx->GetInputDim("W");
+    auto ids_dims = ctx->GetInputDim("Ids");
+    int ids_rank = ids_dims.size();
+
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                      "The last dimension of the 'Ids' tensor must be 1.");
+
+    auto output_dims =
+        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+    output_dims.push_back(table_dims[1]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
+
+    if (ctx->GetOutputsVarType("Out")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      ctx->ShareLoD("Ids", /*->*/ "Out");
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+class LookupRemoteTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("W",
+             "(Tensor) The input represents embedding tensors, "
+             "which is a learnable parameter.");
+    AddInput("Ids",
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "The last dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(kNoPadding);
+    // NOTE(minqiyang): grad_inplace is a temporary attribute,
+    // please do NOT set this attribute in python layer.
+    AddAttr<bool>("grad_inplace",
+                  "(boolean, default false) "
+                  "If the grad op reuse the input's variable.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+Lookup Remote Table Operator.
+
+This operator is used to perform lookups on the parameter W,
+then concatenates the results into a dense tensor.
+
+The input Ids can optionally carry LoD (Level of Details) information.
+The output only shares the LoD information with the input Ids.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lookup_remote_table, ops::LookupRemoteTableOp,
+                  ops::EmptyGradOpMaker, ops::LookupRemoteTableOpMaker);
+
+REGISTER_OP_CPU_KERNEL(lookup_remote_table, ops::LookupRemoteTableKernel<float>,
+                       ops::LookupRemoteTableKernel<double>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
similarity index 54%
rename from paddle/fluid/operators/distributed_ops/lookup_remote_table.h
rename to paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
index 5b066c8196..1a383f6d3e 100644
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
@@ -14,21 +14,22 @@ limitations under the License. */
 
 #include <future>  // NOLINT
 #include <ostream>
-#include <vector>
 #include <set>
 #include <unordered_map>
+#include <vector>
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-inline size_t GetSectionIndex(int64_t id, const std::vector<int64_t>& abs_sections) {
+inline size_t GetSectionIndex(int64_t id,
+                              const std::vector<int64_t>& abs_sections) {
   for (size_t i = 1; i < abs_sections.size(); ++i) {
     if (row < abs_sections[i]) {
       return i - 1;
@@ -38,7 +39,7 @@ inline size_t GetSectionIndex(int64_t id, const std::vector<int64_t>& abs_sectio
 }
 
 inline std::vector<int64_t> ToAbsoluteSection(
-        const std::vector<int64_t>& height_sections) {
+    const std::vector<int64_t>& height_sections) {
   std::vector<int64_t> abs_sections;
   abs_sections.resize(height_sections.size());
   abs_sections[0] = 0;
@@ -49,9 +50,8 @@ inline std::vector<int64_t> ToAbsoluteSection(
 }
 
 inline std::vector<std::vector<int64_t>> SplitIds(
-        const std::string& id_name,
-        const std::vector<int64_t>& height_section,
-        framework::Scope* scope) {
+    const std::string& id_name, const std::vector<int64_t>& height_section,
+    framework::Scope* scope) {
   auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
   auto* id_data = id_tensor.data<int64_t>();
   std::set<int64_t> all_ids;
@@ -68,32 +68,32 @@ inline std::vector<std::vector<int64_t>> SplitIds(
 }
 
 inline void SplitIdsIntoMultipleVarsBySection(
-        const std::string& id_name,
-        const std::vector<std::string>& in_var_names,
-        const std::vector<int64_t>& height_section,
-        const std::vector<std::vector<int64_t>>& splited_ids,
-        framework::Scope* scope) {
+    const std::string& id_name, const std::vector<std::string>& in_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    framework::Scope* scope) {
   PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
 
   auto place = platform::CPUPlace();
 
   for (size_t i = 0; i < in_var_names.size(); ++i) {
-    auto* id_tensor = scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
+    auto* id_tensor =
+        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
     auto& ids = splited_ids[i];
     if (!ids.empty()) {
-      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(framework::make_ddim({ids.size(), 1}), place);
+      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
+          framework::make_ddim({ids.size(), 1}), place);
       memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
     }
   }
 }
 
 inline void MergeMultipleVarsIntoOnBySection(
-        const std::string& id_name,
-        const std::string& out_name,
-        const std::vector<std::string>& out_var_names,
-        const std::vector<int64_t>& height_section,
-        const std::vector<std::vector<int64_t>>& splited_ids,
-        framework::Scope* scope) {
+    const std::string& id_name, const std::string& out_name,
+    const std::vector<std::string>& out_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    framework::Scope* scope) {
   PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
 
   auto cpu_place = platform::CPUPlace();
@@ -109,9 +109,11 @@ inline void MergeMultipleVarsIntoOnBySection(
   auto& out_tensor = scope->Var(out_name)->Get<framework::LoDTensor>();
   auto* out_tensor_data = out_tensor.mutable_data<float>();
 
-  for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) {
+  for (size_t section_idx = 0; section_idx < out_var_names.size();
+       ++section_idx) {
     auto& ids_in_this_section = splited_ids[section_idx];
-    auto& prefetch_out_var = scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+    auto& prefetch_out_var =
+        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
     const auto* out_var_data = prefetch_out_var.mutable_data<float>();
     auto& dims = prefetch_out_var.dims();
 
@@ -126,31 +128,27 @@ inline void MergeMultipleVarsIntoOnBySection(
       auto& offsets = id_to_offset[origin_id];
       for (auto& offset : offsets) {
         // should support GPU tensor
-        memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
-                     cpu_place, out_var_data + i * grad_row_numel,
+        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
+                     out_var_data + i * grad_row_numel,
                      sizeof(T) * grad_row_numel);
       }
     }
   }
 }
 
-inline void prefetch(
-        const std::string& table_name,
-        const std::string& id_name,
-        const std::string& out_name,
-        const std::vector<std::string>& epmap,
-        const std::vector<int64_t>& height_section,
-        const framework::Scope& scope,
-        const platform::Place& place) const {
-
+inline void prefetch(const std::string& table_name, const std::string& id_name,
+                     const std::string& out_name,
+                     const std::vector<std::string>& epmap,
+                     const std::vector<int64_t>& height_section,
+                     const framework::Scope& scope,
+                     const platform::Place& place) const {
   auto local_scope = scope.NewScope();
 
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& ctx = *pool.Get(place);
 
   distributed::RPCClient* rpc_client =
-          distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-                  Attr<int>("trainer_id"));
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(Attr<int>("trainer_id"));
 
   std::vector<std::string> in_var_names;
   std::vector<std::string> out_var_names;
@@ -160,7 +158,8 @@ inline void prefetch(
   }
 
   auto splited_ids = SplitIds(id_name, height_section, local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, splited_ids, local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section,
+                                    splited_ids, local_scope);
 
   // create output var in local scope
   for (auto& name : out_var_names) {
@@ -171,9 +170,9 @@ inline void prefetch(
   for (size_t i = 0; i < ins.size(); i++) {
     if (NeedSend(local_scope, ins[i])) {
       VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
-      << outs[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, local_scope,
-                                                  in_var_names[i], out_var_names[i]));
+               << outs[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
     } else {
       VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
     }
@@ -182,11 +181,71 @@ inline void prefetch(
     PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
   }
 
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, height_section, plited_ids, scope)
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                   height_section, plited_ids, scope)
 
-  scope.DeleteScope(local_scope);
+      scope.DeleteScope(local_scope);
 }
 
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+constexpr int64_t kNoPadding = -1;
+
+template <typename T>
+class LookupRemoteTableKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
+    auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
+    auto* table_var = context.InputVar("W");
+
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    int64_t* ids = const_cast<int64_t*>(ids_t->data<int64_t>());
+    int64_t ids_numel = ids_t->numel();
+
+    if (table_var->IsType<LoDTensor>()) {
+      auto* table_t = context.Input<LoDTensor>("W");
+      int64_t row_number = table_t->dims()[0];
+      int64_t row_width = table_t->dims()[1];
+
+      auto* table = table_t->data<T>();
+      auto* output = output_t->mutable_data<T>(context.GetPlace());
+
+      for (int64_t i = 0; i < ids_numel; ++i) {
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], row_number);
+          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
+          memcpy(output + i * row_width, table + ids[i] * row_width,
+                 row_width * sizeof(T));
+        }
+      }
+    } else if (table_var->IsType<SelectedRows>()) {
+      const auto& table_t = table_var->Get<SelectedRows>();
+      int64_t row_width = table_t.value().dims()[1];
+      const auto* table = table_t.value().data<T>();
+      auto* output = output_t->mutable_data<T>(context.GetPlace());
+
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      for (int64_t i = 0; i < ids_numel; ++i) {
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          auto id_index = table_t.Index(ids[i]);
+          PADDLE_ENFORCE_GE(id_index, 0, "the input key should exist.");
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
+        }
+      }
+    }
+  }
+};
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
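
The kernel's dense-table branch is an ordinary embedding gather with a padding
escape hatch: rows whose id equals padding_idx are written as zeros (the memset
branch), everything else is copied out of W (the memcpy / VCOPY branch). A NumPy
sketch of that behavior, assuming int64 ids and a 2-D float table (kNoPadding
is -1 as in the header):

    import numpy as np

    def lookup(table, ids, padding_idx=-1):
        # table: (row_number, row_width); ids: (ids_numel,) int64
        out = np.empty((len(ids), table.shape[1]), dtype=table.dtype)
        for i, idx in enumerate(ids):
            if padding_idx != -1 and idx == padding_idx:
                out[i] = 0  # padding rows become zeros
            else:
                assert 0 <= idx < table.shape[0]
                out[i] = table[idx]  # plain row copy
        return out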

From 361cb0e078d1942e06ffcb3586e68be11c465d29 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 23 Nov 2018 10:53:35 +0800
Subject: [PATCH 05/90] lookup remote table can compile

---
 .../distributed_ops/lookup_remote_table_op.cc |  12 +-
 .../distributed_ops/lookup_remote_table_op.h  | 220 ++++++++++--------
 2 files changed, 133 insertions(+), 99 deletions(-)

diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
index 06e96a7f98..5d3a50a44c 100644
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
@@ -68,6 +68,15 @@ class LookupRemoteTableOpMaker : public framework::OpProtoAndCheckerMaker {
              "contains the ids to be looked up in W. "
              "The last dimension size must be 1.");
     AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<std::vector<int64_t>>("height_sections",
+                                  "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int64_t>({}));
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::vector<std::string>>(
+        "epmap",
+        "(string vector, default 127.0.0.1:6164)"
+        "Server endpoints in the order of input variables for mapping")
+        .SetDefault({"127.0.0.1:6164"});
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
@@ -98,7 +107,8 @@ or not. And the output only shares the LoD information with input Ids.
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(lookup_remote_table, ops::LookupRemoteTableOp,
-                  ops::EmptyGradOpMaker, ops::LookupRemoteTableOpMaker);
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::LookupRemoteTableOpMaker);
 
 REGISTER_OP_CPU_KERNEL(lookup_remote_table, ops::LookupRemoteTableKernel<float>,
                        ops::LookupRemoteTableKernel<double>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
index 1a383f6d3e..ddf57016db 100644
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
@@ -12,26 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
 #include <future>  // NOLINT
 #include <ostream>
 #include <set>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
-namespace distributed {
 
 inline size_t GetSectionIndex(int64_t id,
                               const std::vector<int64_t>& abs_sections) {
   for (size_t i = 1; i < abs_sections.size(); ++i) {
-    if (row < abs_sections[i]) {
+    if (id < abs_sections[i]) {
       return i - 1;
     }
   }
@@ -62,9 +68,10 @@ inline std::vector<std::vector<int64_t>> SplitIds(
   std::vector<std::vector<int64_t>> splited_ids;
   splited_ids.resize(height_section.size() + 1);
   for (auto& id : all_ids) {
-    auto section_index = GetSectionIndex(id);
+    auto section_index = GetSectionIndex(id, abs_sections);
     splited_ids[section_index].push_back(id - abs_sections[section_index]);
   }
+  return splited_ids;
 }
 
 inline void SplitIdsIntoMultipleVarsBySection(
@@ -82,7 +89,7 @@ inline void SplitIdsIntoMultipleVarsBySection(
     auto& ids = splited_ids[i];
     if (!ids.empty()) {
       auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
-          framework::make_ddim({ids.size(), 1}), place);
+          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
       memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
     }
   }
@@ -93,8 +100,8 @@ inline void MergeMultipleVarsIntoOnBySection(
     const std::vector<std::string>& out_var_names,
     const std::vector<int64_t>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
-    framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+    const framework::ExecutionContext& context, framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
 
   auto cpu_place = platform::CPUPlace();
 
@@ -106,15 +113,15 @@ inline void MergeMultipleVarsIntoOnBySection(
     id_to_offset[id_data[i]].push_back(i);
   }
 
-  auto& out_tensor = scope->Var(out_name)->Get<framework::LoDTensor>();
-  auto* out_tensor_data = out_tensor.mutable_data<float>();
+  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
+  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
 
   for (size_t section_idx = 0; section_idx < out_var_names.size();
        ++section_idx) {
     auto& ids_in_this_section = splited_ids[section_idx];
     auto& prefetch_out_var =
         scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
-    const auto* out_var_data = prefetch_out_var.mutable_data<float>();
+    const auto* out_var_data = prefetch_out_var.data<float>();
     auto& dims = prefetch_out_var.dims();
 
     PADDLE_ENFORCE_EQ(dims.size(), 2, "");
@@ -129,63 +136,64 @@ inline void MergeMultipleVarsIntoOnBySection(
       for (auto& offset : offsets) {
         // should support GPU tensor
         memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
-                     out_var_data + i * grad_row_numel,
-                     sizeof(T) * grad_row_numel);
+                     out_var_data + i * row_numel, sizeof(float) * row_numel);
       }
     }
   }
 }
 
-inline void prefetch(const std::string& table_name, const std::string& id_name,
-                     const std::string& out_name,
-                     const std::vector<std::string>& epmap,
-                     const std::vector<int64_t>& height_section,
-                     const framework::Scope& scope,
-                     const platform::Place& place) const {
-  auto local_scope = scope.NewScope();
-
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  distributed::RPCClient* rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(Attr<int>("trainer_id"));
-
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (size_t i = 0; i < epmap.size(); ++i) {
-    in_var_names.push_back(id_name + "@" + epmap[i]);
-    out_var_names.push_back(out_name + "@" + epmap[i]);
-  }
-
-  auto splited_ids = SplitIds(id_name, height_section, local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section,
-                                    splited_ids, local_scope);
-
-  // create output var in local scope
-  for (auto& name : out_var_names) {
-    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
-  }
-
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < ins.size(); i++) {
-    if (NeedSend(local_scope, ins[i])) {
-      VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get "
-               << outs[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(
-          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
-    } else {
-      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
-    }
-  }
-  for (size_t i = 0; i < rets.size(); i++) {
-    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-  }
-
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-                                   height_section, plited_ids, scope)
-
-      scope.DeleteScope(local_scope);
-}
+// inline void prefetch(const std::string& table_name, const std::string&
+// id_name,
+//                     const std::string& out_name,
+//                     const std::vector<std::string>& epmap,
+//                     const std::vector<int64_t>& height_section,
+//                     const framework::Scope& scope,
+//                     const platform::Place& place) {
+//  auto& local_scope = scope.NewScope();
+//
+//  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+//  auto& ctx = *pool.Get(place);
+//
+//  distributed::RPCClient* rpc_client =
+//      distributed::RPCClient::GetInstance<RPCCLIENT_T>(Attr<int>("trainer_id"));
+//
+//  std::vector<std::string> in_var_names;
+//  std::vector<std::string> out_var_names;
+//  for (size_t i = 0; i < epmap.size(); ++i) {
+//    in_var_names.push_back(id_name + "@" + epmap[i]);
+//    out_var_names.push_back(out_name + "@" + epmap[i]);
+//  }
+//
+//  auto splited_ids = SplitIds(id_name, height_section, &local_scope);
+//  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section,
+//                                    splited_ids, &local_scope);
+//
+//  // create output var in local scope
+//  for (auto& name : out_var_names) {
+//    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+//  }
+//
+//  std::vector<distributed::VarHandlePtr> rets;
+//  for (size_t i = 0; i < in_var_names.size(); i++) {
+//    if (NeedSend(local_scope, in_var_names[i])) {
+//      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] << " to
+//      get "
+//               << out_var_names[i] << " back";
+//      rets.push_back(rpc_client->AsyncPrefetchVar(
+//          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+//    } else {
+//      VLOG(30) << "don't send non-initialized variable: " << out_var_names[i];
+//    }
+//  }
+//  for (size_t i = 0; i < rets.size(); i++) {
+//    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+//  }
+//
+//  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+//                                   height_section, splited_ids, &local_scope);
+//
+//  scope.DeleteScope(&local_scope);
+//}
 
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
@@ -198,54 +206,70 @@ template <typename T>
 class LookupRemoteTableKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
+    std::string id_name = context.Inputs("Ids").front();
+    auto* ids_t = context.Input<LoDTensor>("Ids");  // int tensor
+
+    std::string out_name = context.Outputs("Out").front();
     auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
+
+    std::string table_name = context.Inputs("W").front();
     auto* table_var = context.InputVar("W");
 
     int64_t padding_idx = context.Attr<int64_t>("padding_idx");
     int64_t* ids = const_cast<int64_t*>(ids_t->data<int64_t>());
     int64_t ids_numel = ids_t->numel();
 
-    if (table_var->IsType<LoDTensor>()) {
-      auto* table_t = context.Input<LoDTensor>("W");
-      int64_t row_number = table_t->dims()[0];
-      int64_t row_width = table_t->dims()[1];
-
-      auto* table = table_t->data<T>();
-      auto* output = output_t->mutable_data<T>(context.GetPlace());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_LT(ids[i], row_number);
-          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
-          memcpy(output + i * row_width, table + ids[i] * row_width,
-                 row_width * sizeof(T));
-        }
-      }
-    } else if (table_var->IsType<SelectedRows>()) {
-      const auto& table_t = table_var->Get<SelectedRows>();
-      int64_t row_width = table_t.value().dims()[1];
-      const auto* table = table_t.value().data<T>();
-      auto* output = output_t->mutable_data<T>(context.GetPlace());
-
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_GE(ids[i], 0);
-          auto id_index = table_t.Index(ids[i]);
-          PADDLE_ENFORCE_GE(id_index, 0, "the input key should exist.");
-          blas.VCOPY(row_width, table + id_index * row_width,
-                     output + i * row_width);
-        }
+    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto height_sections =
+        context.Attr<std::vector<int64_t>>("height_sections");
+
+    auto& local_scope = context.scope().NewScope();
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(context.GetPlace());
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+            context.Attr<int>("trainer_id"));
+
+    std::vector<std::string> in_var_names;
+    std::vector<std::string> out_var_names;
+    for (size_t i = 0; i < epmap.size(); ++i) {
+      in_var_names.push_back(id_name + "@" + epmap[i]);
+      out_var_names.push_back(out_name + "@" + epmap[i]);
+    }
+
+    auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
+    SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
+                                      splited_ids, &local_scope);
+
+    // create output var in local scope
+    for (auto& name : out_var_names) {
+      local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+    }
+
+    std::vector<distributed::VarHandlePtr> rets;
+    for (size_t i = 0; i < in_var_names.size(); i++) {
+      if (NeedSend(local_scope, in_var_names[i])) {
+        VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
+                 << " to get " << out_var_names[i] << " back";
+        rets.push_back(rpc_client->AsyncPrefetchVar(
+            epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+      } else {
+        VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
       }
     }
+    for (size_t i = 0; i < rets.size(); i++) {
+      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+    }
+
+    MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                     height_sections, splited_ids, context,
+                                     &local_scope);
+
+    context.scope().DeleteScope(&local_scope);
   }
 };
 
-}  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
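
Taken together, the kernel now performs a three-step round trip: split the
deduplicated ids by section and ship each shard's ids to its pserver, prefetch
the corresponding rows back, then scatter each fetched row into Out at every
offset where its id originally appeared. A pure-Python simulation of that data
flow, reusing the helpers from the sketch after patch 03 and standing in a
local per-shard lookup for AsyncPrefetchVar (all names illustrative):

    import numpy as np

    def prefetch_rows(ids, shards, height_sections):
        # shards[k] holds the rows of W assigned to pserver k
        ids = np.asarray(ids, dtype=np.int64)
        abs_sections = to_absolute_section(height_sections)
        splited = [[] for _ in shards]                # SplitIds: dedupe + bucket
        for idx in sorted(set(ids.tolist())):
            k = get_section_index(idx, abs_sections)
            splited[k].append(idx - abs_sections[k])  # section-local row
        out = np.zeros((len(ids), shards[0].shape[1]), dtype=shards[0].dtype)
        for k, local_rows in enumerate(splited):
            fetched = shards[k][local_rows]           # stands in for the RPC
            # MergeMultipleVarsIntoOnBySection: write rows back by offset
            for i, local in enumerate(local_rows):
                out[ids == local + abs_sections[k]] = fetched[i]
        return out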

From 1f87f263a2906cb1130fdb3cf3c415197cf0d549 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 23 Nov 2018 10:56:45 +0800
Subject: [PATCH 06/90] clean code

---
 .../distributed_ops/lookup_remote_table_op.h  | 67 ++-----------------
 1 file changed, 7 insertions(+), 60 deletions(-)

diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
index ddf57016db..5c53ca6951 100644
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
@@ -34,6 +34,13 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+constexpr int64_t kNoPadding = -1;
+
 inline size_t GetSectionIndex(int64_t id,
                               const std::vector<int64_t>& abs_sections) {
   for (size_t i = 1; i < abs_sections.size(); ++i) {
@@ -142,66 +149,6 @@ inline void MergeMultipleVarsIntoOnBySection(
   }
 }
 
-// inline void prefetch(const std::string& table_name, const std::string&
-// id_name,
-//                     const std::string& out_name,
-//                     const std::vector<std::string>& epmap,
-//                     const std::vector<int64_t>& height_section,
-//                     const framework::Scope& scope,
-//                     const platform::Place& place) {
-//  auto& local_scope = scope.NewScope();
-//
-//  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-//  auto& ctx = *pool.Get(place);
-//
-//  distributed::RPCClient* rpc_client =
-//      distributed::RPCClient::GetInstance<RPCCLIENT_T>(Attr<int>("trainer_id"));
-//
-//  std::vector<std::string> in_var_names;
-//  std::vector<std::string> out_var_names;
-//  for (size_t i = 0; i < epmap.size(); ++i) {
-//    in_var_names.push_back(id_name + "@" + epmap[i]);
-//    out_var_names.push_back(out_name + "@" + epmap[i]);
-//  }
-//
-//  auto splited_ids = SplitIds(id_name, height_section, &local_scope);
-//  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section,
-//                                    splited_ids, &local_scope);
-//
-//  // create output var in local scope
-//  for (auto& name : out_var_names) {
-//    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
-//  }
-//
-//  std::vector<distributed::VarHandlePtr> rets;
-//  for (size_t i = 0; i < in_var_names.size(); i++) {
-//    if (NeedSend(local_scope, in_var_names[i])) {
-//      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] << " to
-//      get "
-//               << out_var_names[i] << " back";
-//      rets.push_back(rpc_client->AsyncPrefetchVar(
-//          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
-//    } else {
-//      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
-//    }
-//  }
-//  for (size_t i = 0; i < rets.size(); i++) {
-//    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-//  }
-//
-//  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-//                                   height_section, splited_ids, &local_scope);
-//
-//  scope.DeleteScope(&local_scope);
-//}
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
 template <typename T>
 class LookupRemoteTableKernel : public framework::OpKernel<T> {
  public:

From 9851a534780471b5eefed15fed8846e25a319149 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 23 Nov 2018 15:18:24 +0800
Subject: [PATCH 07/90] add prefetch part in pserver

---
 .../operators/distributed/grpc_server.cc      |  1 +
 .../operators/distributed/request_handler.h   |  3 +-
 .../distributed/request_handler_impl.cc       | 24 +++++++----
 .../distributed/request_handler_impl.h        | 40 +++++++++++++++----
 .../operators/distributed/send_recv.proto.in  |  1 +
 .../operators/distributed/variable_response.h |  1 +
 6 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index ffd2b1707b..d5295dc63d 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -181,6 +181,7 @@ class RequestPrefetch final : public RequestBase {
     // prefetch process...
     std::string in_var_name = request_->Varname();
     std::string out_var_name = request_->OutVarname();
+    std::string table_name = request_->TableName();
     int trainer_id = request_->GetTrainerId();
     VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name
              << " out_var_name: " << out_var_name;
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 3bcc59a47b..f29b2bf7d6 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -191,7 +191,8 @@ class RequestHandler {
   virtual bool Handle(const std::string& varname, framework::Scope* scope,
                       framework::Variable* var, framework::Variable** outvar,
                       const int trainer_id,
-                      const std::string& out_var_name = "") = 0;
+                      const std::string& out_var_name = "",
+                      const std::string& table_name = "") = 0;
 
  protected:
   const bool sync_mode_;
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index dae56cc843..0f1264ee96 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -37,7 +37,8 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                 framework::Variable* invar,
                                 framework::Variable** outvar,
                                 const int trainer_id,
-                                const std::string& out_var_name) {
+                                const std::string& out_var_name,
+                                const std::string& table_name) {
   VLOG(40) << "RequestSendHandler:" << varname;
 
   // Sync
@@ -77,7 +78,8 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Variable* invar,
                                framework::Variable** outvar,
                                const int trainer_id,
-                               const std::string& out_var_name) {
+                               const std::string& out_var_name,
+                               const std::string& table_name) {
   VLOG(40) << "RequestGetHandler:" << varname;
   if (sync_mode_) {
     if (varname == FETCH_BARRIER_MESSAGE) {
@@ -114,14 +116,21 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Variable* invar,
                                     framework::Variable** outvar,
                                     const int trainer_id,
-                                    const std::string& out_var_name) {
+                                    const std::string& out_var_name,
+                                    const std::string& table_name) {
   VLOG(40) << "RequestPrefetchHandler " << varname;
 
   auto var_desc = program_->Block(0).FindVar(out_var_name);
   InitializeVariable(*outvar, var_desc->GetType());
-  executor_->RunPreparedContext(
-      (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
-
+  if (table_name.empty()) {
+    executor_->RunPreparedContext(
+        (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
+  } else {
+    auto lookup_table_op =
+        BuildLookupTableOp(table_name, varname, out_var_name);
+    paddle::platform::CPUPlace cpu_place;
+    lookup_table_op->Run(*scope, cpu_place);
+  }
   return true;
 }
 
@@ -130,7 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
                                       framework::Variable* invar,
                                       framework::Variable** outvar,
                                       const int trainer_id,
-                                      const std::string& out_var_name) {
+                                      const std::string& out_var_name,
+                                      const std::string& table_name) {
   PADDLE_ENFORCE(
       checkpoint_notify_id != -1,
       "when checkpoint_notify_id = -1, there should be no RPC invoke.");
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index c1afda9dd2..5e0b25c5c2 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -24,6 +24,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler {
   virtual ~RequestSendHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
 
  private:
   bool enable_dc_asgd_;
@@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler {
   virtual ~RequestGetHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
 
  private:
   bool enable_dc_asgd_;
 };
 
+static inline void BuildVar(const std::string& param_name,
+                            std::initializer_list<const char*> arguments,
+                            paddle::framework::proto::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (auto& arg_name : arguments) {
+    *var->mutable_arguments()->Add() = arg_name;
+  }
+}
+
 class RequestPrefetchHandler final : public RequestHandler {
  public:
   explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
   virtual ~RequestPrefetchHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
+
+ private:
+  std::unique_ptr<paddle::framework::OperatorBase> BuildLookupTableOp(
+      const std::string& table_name, const std::string& id_name,
+      const std::string& out_name) {
+    paddle::framework::proto::OpDesc op_desc;
+    op_desc.set_type("lookup_table");
+    BuildVar("W", {table_name.data()}, op_desc.add_inputs());
+    BuildVar("Ids", {id_name.data()}, op_desc.add_inputs());
+    BuildVar("Out", {out_name.data()}, op_desc.add_outputs());
+
+    auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+    return op;
+  }
 };
 
 class RequestCheckpointHandler final : public RequestHandler {
@@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler {
   virtual ~RequestCheckpointHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
 
  private:
   int checkpoint_notify_id;
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index 55820c980e..7b7d069f17 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -80,6 +80,7 @@ message VariableMessage {
   // when profile switches from 1 to 2.
   int64 profile = 11;
   int64 trainer_id = 12;
+  string table_name = 13;
 }
 
 message VoidMessage {}
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index 4c7fcbbdfb..a4324f67bb 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -85,6 +85,7 @@ class VariableResponse {
   inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
   inline std::string Varname() const { return meta_.varname(); }
   inline std::string OutVarname() const { return meta_.out_varname(); }
+  inline std::string TableName() const { return meta_.table_name(); }
 
   // should call parse first.
   framework::Variable* GetVar() {

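Patch 07 threads a table_name field from the RPC message down to RequestPrefetchHandler::Handle: when the field is empty the handler runs the prepared prefetch sub-program as before, and when it names a table it builds and runs a one-off lookup_table op instead. Because the two new parameters are defaulted, every pre-existing Handle call site keeps compiling. Below is a minimal standalone sketch of that dispatch, not the patch's real classes; the signature is simplified and the class names are invented.

// handler_sketch.cc -- illustration only.
#include <iostream>
#include <string>

class RequestHandlerSketch {
 public:
  virtual ~RequestHandlerSketch() = default;
  // The two trailing parameters are defaulted, as in the patch, so callers
  // that predate them still compile unchanged.
  virtual bool Handle(const std::string& varname, int trainer_id,
                      const std::string& out_var_name = "",
                      const std::string& table_name = "") = 0;
};

class PrefetchHandlerSketch : public RequestHandlerSketch {
 public:
  bool Handle(const std::string& varname, int trainer_id,
              const std::string& out_var_name = "",
              const std::string& table_name = "") override {
    if (table_name.empty()) {
      // Old path: run the prepared prefetch sub-program for this variable.
      std::cout << "prepared-context prefetch for " << varname << "\n";
    } else {
      // New path: run an ad-hoc lookup_table op over the named parameter.
      std::cout << "lookup_table over " << table_name << " -> " << out_var_name
                << "\n";
    }
    return true;
  }
};

int main() {
  PrefetchHandlerSketch handler;
  handler.Handle("ids@127.0.0.1:6164", 0);  // call without the new arguments
  handler.Handle("ids@127.0.0.1:6164", 0, "emb@127.0.0.1:6164", "W");
  return 0;
}
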
From 9d276fe8a890dbfc62ceb21366d2eab99870d9c7 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 25 Nov 2018 10:11:06 +0800
Subject: [PATCH 08/90] add parameter prefetch

---
 .../distributed/parameter_prefetch.h          | 209 ++++++++++++++++++
 .../distributed_ops/lookup_remote_table_op.h  |  52 +++++
 2 files changed, 261 insertions(+)
 create mode 100644 paddle/fluid/operators/distributed/parameter_prefetch.h

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
new file mode 100644
index 0000000000..03336367df
--- /dev/null
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -0,0 +1,209 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+constexpr int64_t kNoPadding = -1;
+
+inline size_t GetSectionIndex(int64_t id,
+                              const std::vector<int64_t>& abs_sections) {
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (id < abs_sections[i]) {
+      return i - 1;
+    }
+  }
+  return abs_sections.size() - 1;
+}
+
+inline std::vector<int64_t> ToAbsoluteSection(
+    const std::vector<int64_t>& height_sections) {
+  std::vector<int64_t> abs_sections;
+  abs_sections.resize(height_sections.size());
+  abs_sections[0] = 0;
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
+}
+
+inline std::vector<std::vector<int64_t>> SplitIds(
+    const std::string& id_name, const std::vector<int64_t>& height_section,
+    framework::Scope* scope) {
+  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  std::set<int64_t> all_ids;
+  for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    all_ids.insert(id_data[i]);
+  }
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::vector<std::vector<int64_t>> splited_ids;
+  splited_ids.resize(height_section.size() + 1);
+  for (auto& id : all_ids) {
+    auto section_index = GetSectionIndex(id, abs_sections);
+    splited_ids[section_index].push_back(id - abs_sections[section_index]);
+  }
+  return splited_ids;
+}
+
+inline void SplitIdsIntoMultipleVarsBySection(
+    const std::string& id_name, const std::vector<std::string>& in_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+
+  auto place = platform::CPUPlace();
+
+  for (size_t i = 0; i < in_var_names.size(); ++i) {
+    auto* id_tensor =
+        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
+    auto& ids = splited_ids[i];
+    if (!ids.empty()) {
+      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
+          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
+    }
+  }
+}
+
+inline void MergeMultipleVarsIntoOnBySection(
+    const std::string& id_name, const std::string& out_name,
+    const std::vector<std::string>& out_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    const framework::ExecutionContext& context, framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
+
+  auto cpu_place = platform::CPUPlace();
+
+  auto abs_sections = ToAbsoluteSection(height_section);
+  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
+  for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    id_to_offset[id_data[i]].push_back(i);
+  }
+
+  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
+  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
+
+  for (size_t section_idx = 0; section_idx < out_var_names.size();
+       ++section_idx) {
+    auto& ids_in_this_section = splited_ids[section_idx];
+    auto& prefetch_out_var =
+        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+    const auto* out_var_data = prefetch_out_var.data<float>();
+    auto& dims = prefetch_out_var.dims();
+
+    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
+    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
+
+    auto row_numel = dims[1];
+
+    for (size_t i = 0; i < dims[0]; ++i) {
+      auto id = ids_in_this_section[i];
+      auto origin_id = id + abs_sections[section_idx];
+      auto& offsets = id_to_offset[origin_id];
+      for (auto& offset : offsets) {
+        // should support GPU tensor
+        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
+                     out_var_data + i * row_numel, sizeof(float) * row_numel);
+      }
+    }
+  }
+}
+
+void prefetch(const std::string& id_name, const std::string& out_name,
+              const std::string& table_name,
+              const std::vector<std::string>& epmap,
+              const std::vector<int64_t>& height_sections,
+              const framework::ExecutionContext& context) {
+  auto& local_scope = context.scope().NewScope();
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(context.GetPlace());
+
+  distributed::RPCClient* rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+          context.Attr<int>("trainer_id"));
+
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
+
+  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
+                                    splited_ids, &local_scope);
+
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+  }
+
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < in_var_names.size(); i++) {
+    if (NeedSend(local_scope, in_var_names[i])) {
+      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
+               << " to get " << out_var_names[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+    } else {
+      VLOG(30) << "don't send uninitialized variable: " << out_var_names[i];
+    }
+  }
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+  }
+
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                   height_sections, splited_ids, context,
+                                   &local_scope);
+
+  context.scope().DeleteScope(&local_scope);
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
index 5c53ca6951..97c8fbfed3 100644
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
+++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
@@ -149,6 +149,58 @@ inline void MergeMultipleVarsIntoOnBySection(
   }
 }
 
+void prefetch(const std::string& id_name, const std::string& out_name,
+              const std::string& table_name,
+              const std::vector<std::string>& epmap,
+              const std::vector<int64_t>& height_sections,
+              const framework::ExecutionContext& context) {
+  auto& local_scope = context.scope().NewScope();
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(context.GetPlace());
+
+  distributed::RPCClient* rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+          context.Attr<int>("trainer_id"));
+
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
+
+  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
+                                    splited_ids, &local_scope);
+
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+  }
+
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < in_var_names.size(); i++) {
+    if (NeedSend(local_scope, in_var_names[i])) {
+      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
+               << " to get " << out_var_names[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+    } else {
+      VLOG(30) << "don't send uninitialized variable: " << out_var_names[i];
+    }
+  }
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+  }
+
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                   height_sections, splited_ids, context,
+                                   &local_scope);
+
+  context.scope().DeleteScope(&local_scope);
+}
+
 template <typename T>
 class LookupRemoteTableKernel : public framework::OpKernel<T> {
  public:

From 4ad5fd8f54512e0b9c28cbeb7890dc4c6e6248e5 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 25 Nov 2018 11:24:05 +0800
Subject: [PATCH 09/90] add parameter prefetch

---
 .../operators/distributed/CMakeLists.txt      |  35 +--
 .../distributed/parameter_prefetch.cc         | 204 ++++++++++++++++++
 .../distributed/parameter_prefetch.h          | 179 +--------------
 paddle/fluid/operators/lookup_table_op.cc     |  12 ++
 paddle/fluid/operators/lookup_table_op.h      |   2 +
 5 files changed, 238 insertions(+), 194 deletions(-)
 create mode 100644 paddle/fluid/operators/distributed/parameter_prefetch.cc

diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 21db93958a..0858ec6a22 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -9,36 +9,37 @@ else()
 endif()
 configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
 
+set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
 if(WITH_GRPC)
   grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
         request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
       PROTO send_recv.proto 
       DEPS lod_tensor selected_rows memory)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
   cc_test(rpc_server_test SRCS rpc_server_test.cc
     DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
   cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
-  return()
-endif()
-
-
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc)
+else()
+  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+      brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
-    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
+      brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+    PROTO send_recv.proto
+    DEPS lod_tensor selected_rows memory)
 
-brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc 
-    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
-  PROTO send_recv.proto
-  DEPS lod_tensor selected_rows memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc)
 
-set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
+  set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
 
-cc_test(brpc_server_test SRCS rpc_server_test.cc 
-    DEPS ${brpc_test_depends} SERIAL)
+  cc_test(brpc_server_test SRCS rpc_server_test.cc
+      DEPS ${brpc_test_depends} SERIAL)
 
-cc_test(brpc_serde_test SRCS brpc_serde_test.cc 
-    DEPS ${brpc_test_depends} SERIAL)
+  cc_test(brpc_serde_test SRCS brpc_serde_test.cc
+      DEPS ${brpc_test_depends} SERIAL)
+endif()
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
new file mode 100644
index 0000000000..b7ba938cf1
--- /dev/null
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -0,0 +1,204 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+constexpr int64_t kNoPadding = -1;
+
+inline size_t GetSectionIndex(int64_t id,
+                              const std::vector<int64_t>& abs_sections) {
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (id < abs_sections[i]) {
+      return i - 1;
+    }
+  }
+  return abs_sections.size() - 1;
+}
+
+inline std::vector<int64_t> ToAbsoluteSection(
+    const std::vector<int64_t>& height_sections) {
+  std::vector<int64_t> abs_sections;
+  abs_sections.resize(height_sections.size());
+  abs_sections[0] = 0;
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
+}
+
+inline std::vector<std::vector<int64_t>> SplitIds(
+    const std::string& id_name, const std::vector<int64_t>& height_section,
+    framework::Scope* scope) {
+  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  std::set<int64_t> all_ids;
+  for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    all_ids.insert(id_data[i]);
+  }
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::vector<std::vector<int64_t>> splited_ids;
+  splited_ids.resize(height_section.size() + 1);
+  for (auto& id : all_ids) {
+    auto section_index = GetSectionIndex(id, abs_sections);
+    splited_ids[section_index].push_back(id - abs_sections[section_index]);
+  }
+  return splited_ids;
+}
+
+inline void SplitIdsIntoMultipleVarsBySection(
+    const std::string& id_name, const std::vector<std::string>& in_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+
+  auto place = platform::CPUPlace();
+
+  for (size_t i = 0; i < in_var_names.size(); ++i) {
+    auto* id_tensor =
+        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
+    auto& ids = splited_ids[i];
+    if (!ids.empty()) {
+      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
+          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
+    }
+  }
+}
+
+inline void MergeMultipleVarsIntoOnBySection(
+    const std::string& id_name, const std::string& out_name,
+    const std::vector<std::string>& out_var_names,
+    const std::vector<int64_t>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    const framework::ExecutionContext& context, framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
+
+  auto cpu_place = platform::CPUPlace();
+
+  auto abs_sections = ToAbsoluteSection(height_section);
+  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto* id_data = id_tensor.data<int64_t>();
+  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
+  for (size_t i = 0; i < id_tensor.numel(); ++i) {
+    id_to_offset[id_data[i]].push_back(i);
+  }
+
+  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
+  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
+
+  for (size_t section_idx = 0; section_idx < out_var_names.size();
+       ++section_idx) {
+    auto& ids_in_this_section = splited_ids[section_idx];
+    auto& prefetch_out_var =
+        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+    const auto* out_var_data = prefetch_out_var.data<float>();
+    auto& dims = prefetch_out_var.dims();
+
+    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
+    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
+
+    auto row_numel = dims[1];
+
+    for (size_t i = 0; i < dims[0]; ++i) {
+      auto id = ids_in_this_section[i];
+      auto origin_id = id + abs_sections[section_idx];
+      auto& offsets = id_to_offset[origin_id];
+      for (auto& offset : offsets) {
+        // should support GPU tensor
+        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
+                     out_var_data + i * row_numel, sizeof(float) * row_numel);
+      }
+    }
+  }
+}
+
+void prefetch(const std::string& id_name, const std::string& out_name,
+              const std::string& table_name,
+              const std::vector<std::string>& epmap,
+              const std::vector<int64_t>& height_sections,
+              const framework::ExecutionContext& context) {
+  auto& local_scope = context.scope().NewScope();
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(context.GetPlace());
+
+  distributed::RPCClient* rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+          context.Attr<int>("trainer_id"));
+
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
+
+  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
+  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
+                                    splited_ids, &local_scope);
+
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+  }
+
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < in_var_names.size(); i++) {
+    if (NeedSend(local_scope, in_var_names[i])) {
+      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
+               << " to get " << out_var_names[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+    } else {
+      VLOG(30) << "don't send uninitialized variable: " << out_var_names[i];
+    }
+  }
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+  }
+
+  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
+                                   height_sections, splited_ids, context,
+                                   &local_scope);
+
+  context.scope().DeleteScope(&local_scope);
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
index 03336367df..9e680ec20b 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -14,195 +14,20 @@
 
 #pragma once
 
-#include <set>
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-inline size_t GetSectionIndex(int64_t id,
-                              const std::vector<int64_t>& abs_sections) {
-  for (size_t i = 1; i < abs_sections.size(); ++i) {
-    if (id < abs_sections[i]) {
-      return i - 1;
-    }
-  }
-  return abs_sections.size() - 1;
-}
-
-inline std::vector<int64_t> ToAbsoluteSection(
-    const std::vector<int64_t>& height_sections) {
-  std::vector<int64_t> abs_sections;
-  abs_sections.resize(height_sections.size());
-  abs_sections[0] = 0;
-  for (size_t i = 1; i < height_sections.size(); ++i) {
-    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
-  }
-  return abs_sections;
-}
-
-inline std::vector<std::vector<int64_t>> SplitIds(
-    const std::string& id_name, const std::vector<int64_t>& height_section,
-    framework::Scope* scope) {
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
-  std::set<int64_t> all_ids;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    all_ids.insert(id_data[i]);
-  }
-  auto abs_sections = ToAbsoluteSection(height_section);
-  std::vector<std::vector<int64_t>> splited_ids;
-  splited_ids.resize(height_section.size() + 1);
-  for (auto& id : all_ids) {
-    auto section_index = GetSectionIndex(id, abs_sections);
-    splited_ids[section_index].push_back(id - abs_sections[section_index]);
-  }
-  return splited_ids;
-}
-
-inline void SplitIdsIntoMultipleVarsBySection(
-    const std::string& id_name, const std::vector<std::string>& in_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
-
-  auto place = platform::CPUPlace();
-
-  for (size_t i = 0; i < in_var_names.size(); ++i) {
-    auto* id_tensor =
-        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
-    auto& ids = splited_ids[i];
-    if (!ids.empty()) {
-      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
-    }
-  }
-}
-
-inline void MergeMultipleVarsIntoOnBySection(
-    const std::string& id_name, const std::string& out_name,
-    const std::vector<std::string>& out_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    const framework::ExecutionContext& context, framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
-
-  auto cpu_place = platform::CPUPlace();
-
-  auto abs_sections = ToAbsoluteSection(height_section);
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
-  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    id_to_offset[id_data[i]].push_back(i);
-  }
-
-  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
-  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
-
-  for (size_t section_idx = 0; section_idx < out_var_names.size();
-       ++section_idx) {
-    auto& ids_in_this_section = splited_ids[section_idx];
-    auto& prefetch_out_var =
-        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
-    const auto* out_var_data = prefetch_out_var.data<float>();
-    auto& dims = prefetch_out_var.dims();
-
-    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
-    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
-
-    auto row_numel = dims[1];
-
-    for (size_t i = 0; i < dims[0]; ++i) {
-      auto id = ids_in_this_section[i];
-      auto origin_id = id + abs_sections[section_idx];
-      auto& offsets = id_to_offset[origin_id];
-      for (auto& offset : offsets) {
-        // should support GPU tensor
-        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
-                     out_var_data + i * row_numel, sizeof(float) * row_numel);
-      }
-    }
-  }
-}
-
 void prefetch(const std::string& id_name, const std::string& out_name,
               const std::string& table_name,
               const std::vector<std::string>& epmap,
               const std::vector<int64_t>& height_sections,
-              const framework::ExecutionContext& context) {
-  auto& local_scope = context.scope().NewScope();
-
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(context.GetPlace());
-
-  distributed::RPCClient* rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-          context.Attr<int>("trainer_id"));
-
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (size_t i = 0; i < epmap.size(); ++i) {
-    in_var_names.push_back(id_name + "@" + epmap[i]);
-    out_var_names.push_back(out_name + "@" + epmap[i]);
-  }
-
-  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
-                                    splited_ids, &local_scope);
-
-  // create output var in local scope
-  for (auto& name : out_var_names) {
-    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
-  }
-
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < in_var_names.size(); i++) {
-    if (NeedSend(local_scope, in_var_names[i])) {
-      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
-               << " to get " << out_var_names[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(
-          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
-    } else {
-      VLOG(30) << "don't send uninitialized variable: " << out_var_names[i];
-    }
-  }
-  for (size_t i = 0; i < rets.size(); i++) {
-    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-  }
-
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-                                   height_sections, splited_ids, context,
-                                   &local_scope);
-
-  context.scope().DeleteScope(&local_scope);
-}
+              const framework::ExecutionContext& context);
 
 }  // namespace distributed
 }  // namespace operators
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 1878dfe8a8..74baa8a350 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -87,6 +87,18 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "If the grad op reuse the input's variable.")
         .SetDefault(false);
+
+    // for parameter prefetch
+    AddAttr<int>("trainer_id", "trainer id from 0 to worker_num - 1.").SetDefault(0);
+    AddAttr<std::vector<int64_t>>("height_sections",
+                                  "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int64_t>({}));
+    AddAttr<std::vector<std::string>>(
+        "epmap",
+        "(string vector, default {127.0.0.1:6164}) "
+        "Server endpoints in the order of the input variables.")
+        .SetDefault({"127.0.0.1:6164"});
+
     AddComment(R"DOC(
 Lookup Table Operator.
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index e504c4f0cd..69cae78b70 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -23,6 +23,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/math/blas.h"
 
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+
 namespace paddle {
 namespace operators {
 

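The inverse of the split happens in MergeMultipleVarsIntoOnBySection: SplitIds deduplicates ids through a std::set, so each server returns one row per unique id, and every returned row may have to be copied back to several positions of the original id tensor. Below is a standalone sketch of that copy-back step, not part of the patch; plain vectors stand in for LoDTensors and all values are made up.

// merge_sketch.cc -- illustration only.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <unordered_map>
#include <vector>

int main() {
  const std::vector<int64_t> ids = {7, 3, 7};  // original ids, with a repeat
  const int64_t row_numel = 2;                 // embedding width

  // Each id remembers every position where it occurred, mirroring
  // id_to_offset in the patch.
  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
  for (size_t i = 0; i < ids.size(); ++i) id_to_offset[ids[i]].push_back(i);

  // What a server would return: one row per unique id, ascending: {3, 7}.
  const std::vector<int64_t> unique_ids = {3, 7};
  const std::vector<float> fetched = {0.3f, 0.3f,   // row for id 3
                                      0.7f, 0.7f};  // row for id 7

  std::vector<float> out(ids.size() * row_numel, 0.f);
  for (size_t i = 0; i < unique_ids.size(); ++i) {
    for (size_t offset : id_to_offset[unique_ids[i]]) {
      std::memcpy(&out[offset * row_numel], &fetched[i * row_numel],
                  sizeof(float) * row_numel);
    }
  }

  for (float v : out) std::cout << v << " ";  // 0.7 0.7 0.3 0.3 0.7 0.7
  std::cout << "\n";
  return 0;
}
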
From 47280ef8b4a9b0336878e14594cbed9be0928239 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 25 Nov 2018 17:12:17 +0800
Subject: [PATCH 10/90] lookup table op support prefetch

---
 paddle/fluid/operators/CMakeLists.txt    |  4 +-
 paddle/fluid/operators/lookup_table_op.h | 98 +++++++++++++++---------
 2 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index de4f23515d..a824fec1e4 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -37,7 +37,7 @@ if (WITH_GPU)
     SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
 endif()
 
-register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
+register_operators(EXCLUDES warpctc_op conv_fusion_op lookup_table_op DEPS ${OP_HEADER_DEPS})
 
 # warpctc_op needs cudnn 7 above
 if (WITH_GPU AND NOT WIN32)
@@ -55,6 +55,8 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
+op_library(lookup_table_op DEPS parameter_prefetch)
+
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 69cae78b70..335e4adafa 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -23,8 +23,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/math/blas.h"
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+
 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
 
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -43,44 +47,64 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
     auto *table_var = context.InputVar("W");
 
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-    int64_t ids_numel = ids_t->numel();
-
-    if (table_var->IsType<LoDTensor>()) {
-      auto *table_t = context.Input<LoDTensor>("W");
-      int64_t row_number = table_t->dims()[0];
-      int64_t row_width = table_t->dims()[1];
-
-      auto *table = table_t->data<T>();
-      auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_LT(ids[i], row_number);
-          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
-          memcpy(output + i * row_width, table + ids[i] * row_width,
-                 row_width * sizeof(T));
+    auto id_name = context.Inputs("Ids").front();
+    auto out_name = context.Outputs("Out").front();
+    auto table_name = context.Inputs("W").front();
+    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto height_sections =
+        context.Attr<std::vector<int64_t>>("height_sections");
+
+    if (!epmap.empty()) {
+// if epmap is not empty, the parameter will be fetched from the remote
+// parameter server
+#ifdef PADDLE_WITH_DISTRIBUTE
+      operators::distributed::prefetch(id_name, out_name, table_name, epmap,
+                                       height_sections, context);
+#else
+      PADDLE_THROW(
+          "paddle is not compiled with distributed support, cannot do "
+          "parameter prefetch!");
+#endif
+    } else {
+      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+      int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
+      int64_t ids_numel = ids_t->numel();
+
+      if (table_var->IsType<LoDTensor>()) {
+        auto *table_t = context.Input<LoDTensor>("W");
+        int64_t row_number = table_t->dims()[0];
+        int64_t row_width = table_t->dims()[1];
+
+        auto *table = table_t->data<T>();
+        auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+        for (int64_t i = 0; i < ids_numel; ++i) {
+          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+            memset(output + i * row_width, 0, row_width * sizeof(T));
+          } else {
+            PADDLE_ENFORCE_LT(ids[i], row_number);
+            PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
+            memcpy(output + i * row_width, table + ids[i] * row_width,
+                   row_width * sizeof(T));
+          }
         }
-      }
-    } else if (table_var->IsType<SelectedRows>()) {
-      const auto &table_t = table_var->Get<SelectedRows>();
-      int64_t row_width = table_t.value().dims()[1];
-      const auto *table = table_t.value().data<T>();
-      auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_GE(ids[i], 0);
-          auto id_index = table_t.Index(ids[i]);
-          PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-          blas.VCOPY(row_width, table + id_index * row_width,
-                     output + i * row_width);
+      } else if (table_var->IsType<SelectedRows>()) {
+        const auto &table_t = table_var->Get<SelectedRows>();
+        int64_t row_width = table_t.value().dims()[1];
+        const auto *table = table_t.value().data<T>();
+        auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+        auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+        for (int64_t i = 0; i < ids_numel; ++i) {
+          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+            memset(output + i * row_width, 0, row_width * sizeof(T));
+          } else {
+            PADDLE_ENFORCE_GE(ids[i], 0);
+            auto id_index = table_t.Index(ids[i]);
+            PADDLE_ENFORCE_GE(id_index, 0, "the input key should exist.");
+            blas.VCOPY(row_width, table + id_index * row_width,
+                       output + i * row_width);
+          }
         }
       }
     }

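After patch 10 the lookup_table kernel dispatches on epmap: a non-empty endpoint list routes the lookup through operators::distributed::prefetch, while an empty one keeps the local table walk, which zero-fills any output row whose id equals padding_idx. Below is a standalone sketch of the local dense-table branch, not part of the patch; the table and ids are made up.

// local_lookup_sketch.cc -- illustration only.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

constexpr int64_t kNoPadding = -1;

int main() {
  const int64_t row_width = 2;
  const std::vector<float> table = {1, 1, 2, 2, 3, 3};  // 3 rows, width 2
  const std::vector<int64_t> ids = {2, 0, 1};
  const int64_t padding_idx = 0;  // id 0 is treated as padding

  std::vector<float> output(ids.size() * row_width);
  for (size_t i = 0; i < ids.size(); ++i) {
    if (padding_idx != kNoPadding && ids[i] == padding_idx) {
      std::memset(&output[i * row_width], 0, row_width * sizeof(float));
    } else {
      std::memcpy(&output[i * row_width], &table[ids[i] * row_width],
                  row_width * sizeof(float));
    }
  }

  for (float v : output) std::cout << v << " ";  // prints: 3 3 0 0 2 2
  std::cout << "\n";
  return 0;
}
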
From cc6ef41dc3c0991f0a40588d453c7f6caade5a1d Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 25 Nov 2018 21:37:56 +0800
Subject: [PATCH 11/90] update dist transpiler

---
 .../fluid/transpiler/distribute_transpiler.py | 79 +++++++++++++------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 89bc248027..7a3cf1230b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -236,6 +236,22 @@ class DistributeTranspiler(object):
         else:
             raise ValueError("must set trainer_id > 0")
 
+    def _get_all_sparse_update_op(self, main_program):
+        sparse_update_ops = []
+        sparse_update_op_types = ["lookup_table"]
+        for op in main_program.global_block().ops:
+            if op.type in sparse_update_op_types and op.attr(
+                    'is_sparse') is True and not op.attr('is_distributed'):
+                sparse_update_ops.append(op)
+        return sparse_update_ops
+
+    def _update_sparse_update_op(self, param_varname, height_sections,
+                                 endpoint_map):
+        for op in self.sparse_update_ops:
+            if param_varname in op.input_arg_names:
+                op._set_attr('epmap', endpoint_map)
+                op._set_attr('height_sections', height_sections)
+
     def transpile(self,
                   trainer_id,
                   program=None,
@@ -299,6 +315,11 @@ class DistributeTranspiler(object):
             self.param_name_to_grad_name[param_var.name] = grad_var.name
             self.grad_name_to_param_name[grad_var.name] = param_var.name
 
+        # get all sparse update ops
+        self.sparse_update_ops = self._get_all_sparse_update_op(
+            self.origin_program)
+        self.sparse_param_to_height_sections = dict()
+
         # add distributed attrs to program
         self.origin_program._is_distributed = True
         self.origin_program._endpoints = self.pserver_endpoints
@@ -425,18 +446,24 @@ class DistributeTranspiler(object):
             if len(splited_trainer_grad) == 1:
                 recv_op_role_var_name = splited_trainer_grad[0].name
 
-            program.global_block().append_op(
-                type="recv",
-                inputs={"X": [recv_dep_in]},
-                outputs={"Out": splited_var},
-                attrs={
-                    "epmap": eps,
-                    "trainer_id": self.trainer_id,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
-                    OP_ROLE_VAR_ATTR_NAME:
-                    [param_varname, recv_op_role_var_name],
-                    "sync_mode": not self.sync_mode
-                })
+            if param_varname in self.sparse_param_to_height_sections:
+                height_sections = self.sparse_param_to_height_sections[
+                    param_varname]
+                self._update_sparse_update_op(param_varname, height_sections,
+                                              eps)
+            else:
+                program.global_block().append_op(
+                    type="recv",
+                    inputs={"X": [recv_dep_in]},
+                    outputs={"Out": splited_var},
+                    attrs={
+                        "epmap": eps,
+                        "trainer_id": self.trainer_id,
+                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                        OP_ROLE_VAR_ATTR_NAME:
+                        [param_varname, recv_op_role_var_name],
+                        "sync_mode": not self.sync_mode
+                    })
 
         if self.sync_mode:
             # form a WAW dependency
@@ -454,14 +481,17 @@ class DistributeTranspiler(object):
             if len(splited_var) <= 1:
                 continue
             orig_param = program.global_block().vars[param_varname]
-            program.global_block().append_op(
-                type="concat",
-                inputs={"X": splited_var},
-                outputs={"Out": [orig_param]},
-                attrs={
-                    "axis": 0,
-                    RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                })
+            print("sparse_param_to_height_sections: " + str(
+                self.sparse_param_to_height_sections))
+            if param_varname not in self.sparse_param_to_height_sections:
+                program.global_block().append_op(
+                    type="concat",
+                    inputs={"X": splited_var},
+                    outputs={"Out": [orig_param]},
+                    attrs={
+                        "axis": 0,
+                        RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
+                    })
 
         self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
 
@@ -1237,9 +1267,8 @@ to transpile() call.")
         # create table param and grad var in pserver program
         # create table optimize block in pserver program
         table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
+            op for op in self.optimize_ops if 'Param' in op.input_names and
+            op.input("Param")[0] == self.table_name
         ][0]
 
         origin_param_var = self.origin_program.global_block().vars[
@@ -1418,6 +1447,10 @@ to transpile() call.")
             height_sections = []
             for v in splited_vars:
                 height_sections.append(v.shape[0])
+            sparse_param_name = self.grad_name_to_param_name[orig_var.name]
+            if sparse_param_name != self.table_name:
+                self.sparse_param_to_height_sections[
+                    sparse_param_name] = height_sections
             program.global_block()._insert_op(
                 index=index + 1,
                 type="split_selected_rows",

From 2b6c0c09d6273c970652371ec49750fd8243f54a Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 25 Nov 2018 22:51:16 +0800
Subject: [PATCH 12/90] add unit test

---
 paddle/fluid/operators/lookup_table_op.cc     |   1 +
 python/paddle/fluid/layers/nn.py              |   4 +
 .../tests/unittests/test_dist_transpiler.py   | 110 +++++++++++++++++-
 .../fluid/transpiler/distribute_transpiler.py |  26 +++--
 4 files changed, 125 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 74baa8a350..99944b800c 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -89,6 +89,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(false);
 
     // for parameter prefetch
+    AddAttr<bool>("remote_prefetch", "").SetDefault(false);
     AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
     AddAttr<std::vector<int64_t>>("height_sections",
                                   "Height for each output SelectedRows.")
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ccd9175b64..a2a47ce384 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -285,6 +285,7 @@ def embedding(input,
               size,
               is_sparse=False,
               is_distributed=False,
+              remote_prefetch=False,
               padding_idx=None,
               param_attr=None,
               dtype='float32'):
@@ -326,6 +327,8 @@ def embedding(input,
     """
 
     helper = LayerHelper('embedding', **locals())
+    if remote_prefetch:
+        assert is_sparse is True and is_distributed is False
     w = helper.create_parameter(
         attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
     tmp = helper.create_variable_for_type_inference(dtype)
@@ -339,6 +342,7 @@ def embedding(input,
         attrs={
             'is_sparse': is_sparse,
             'is_distributed': is_distributed,
+            'remote_prefetch': remote_prefetch,
             'padding_idx': padding_idx
         })
     return tmp
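For reference, a minimal usage sketch of the new flag (hypothetical variable
names; sizes chosen arbitrarily). Note the assert added above: remote_prefetch
currently requires is_sparse=True and is_distributed=False.

    import paddle.fluid as fluid

    ids = fluid.layers.data(name='ids', shape=[1], dtype='int64', lod_level=1)
    emb = fluid.layers.embedding(
        input=ids,
        size=[1000, 64],          # [vocab_size, embedding_dim]
        is_sparse=True,           # required by the assert above
        is_distributed=False,     # remote_prefetch excludes is_distributed
        remote_prefetch=True)     # fetch rows from the pserver at runtime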
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index d132dd3c48..dbc4583763 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -447,19 +447,23 @@ class TestEmptyPserverOptimizeBlocks(TranspilerTest):
 
 
 class TestDistLookupTableBase(TranspilerTest):
-    def network_with_table(self, is_sparse, is_distributed):
+    def network_with_table(self,
+                           is_sparse,
+                           is_distributed,
+                           remote_prefetch=False):
         self.table_size = 1000
         self.emb_size = 64
         self.lookup_table_name = 'shared_w'
 
-        def emb_pool(ids, table_name, is_distributed):
+        def emb_pool(ids, table_name, is_distributed, remote_prefetch):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
                 param_attr=table_name,
                 is_sparse=is_sparse,
-                is_distributed=is_distributed)
+                is_distributed=is_distributed,
+                remote_prefetch=remote_prefetch)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
             return pool
 
@@ -469,9 +473,12 @@ class TestDistLookupTableBase(TranspilerTest):
             name='brand_ids', shape=[1], dtype='int64', lod_level=1)
         profile_ids = fluid.layers.data(
             name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
-        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
-        profile_emb = emb_pool(profile_ids, "profile_emb", False)
+        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed,
+                             False)
+        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed,
+                             False)
+        profile_emb = emb_pool(profile_ids, "profile_emb", False,
+                               remote_prefetch)
         fc0 = fluid.layers.concat(
             input=[title_emb, brand_emb, profile_emb], axis=1)
         predict = fluid.layers.fc(input=fc0,
@@ -575,6 +582,57 @@ class TestDistLookupTable(TestDistLookupTableBase):
                          startup_ops)
 
 
+class TestRemoteLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(
+            is_sparse=True, is_distributed=False, remote_prefetch=True)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 6)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 4 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["sum", "sgd"])
+        # 3 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
+                         ["lookup_sparse_table"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
+
+        trainer, trainer_startup = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+        startup_ops = [
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'uniform_random',
+            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
+            'fake_init'
+        ]
+        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
+                         startup_ops)
+
+
 class TestAsyncLocalLookupTable(TestDistLookupTableBase):
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=False)
@@ -782,5 +840,45 @@ class TestNCCL2Transpile(TranspilerTest):
             pass
 
 
+# test for remote prefetch
+class TestRemoteLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(
+            is_sparse=True, is_distributed=False, remote_prefetch=True)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 4)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
+        # 3 optimize for table 2 adam
+        # NOTE: if param is not selected rows, the grad will be scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
+        trainer, _ = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
+            'recv', 'recv', 'fetch_barrier', 'concat'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 7a3cf1230b..ddf7468cdd 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -236,21 +236,29 @@ class DistributeTranspiler(object):
         else:
             raise ValueError("must set trainer_id > 0")
 
-    def _get_all_sparse_update_op(self, main_program):
+    def _get_all_remote_sparse_update_op(self, main_program):
         sparse_update_ops = []
         sparse_update_op_types = ["lookup_table"]
         for op in main_program.global_block().ops:
             if op.type in sparse_update_op_types and op.attr(
-                    'is_sparse') is True and not op.attr('is_distributed'):
+                    'remote_prefetch') is True and not op.attr(
+                        'is_distributed'):
                 sparse_update_ops.append(op)
         return sparse_update_ops
 
-    def _update_sparse_update_op(self, param_varname, height_sections,
-                                 endpint_map):
+    def _update_remote_sparse_update_op(self, param_varname, height_sections,
+                                        endpint_map):
         for op in self.sparse_update_ops:
             if param_varname in op.input_arg_names:
                 op._set_attr('epmap', endpint_map)
                 op._set_attr('height_sections', height_sections)
+                op._set_attr('trainer_id', self.trainer_id)
+
+    def _is_input_of_remote_sparse_update_op(self, param_name):
+        for op in self.sparse_update_ops:
+            if param_name in op.input_arg_names:
+                return True
+        return False
 
     def transpile(self,
                   trainer_id,
@@ -316,7 +324,7 @@ class DistributeTranspiler(object):
             self.grad_name_to_param_name[grad_var.name] = param_var.name
 
         # get all sparse update ops
-        self.sparse_update_ops = self._get_all_sparse_update_op(
+        self.sparse_update_ops = self._get_all_remote_sparse_update_op(
             self.origin_program)
         self.sparse_param_to_height_sections = dict()
 
@@ -449,8 +457,8 @@ class DistributeTranspiler(object):
             if param_varname in self.sparse_param_to_height_sections:
                 height_sections = self.sparse_param_to_height_sections[
                     param_varname]
-                self._update_sparse_update_op(param_varname, height_sections,
-                                              eps)
+                self._update_remote_sparse_update_op(param_varname,
+                                                     height_sections, eps)
             else:
                 program.global_block().append_op(
                     type="recv",
@@ -481,8 +489,6 @@ class DistributeTranspiler(object):
             if len(splited_var) <= 1:
                 continue
             orig_param = program.global_block().vars[param_varname]
-            print("sparse_param_to_height_sections: " + str(
-                self.sparse_param_to_height_sections))
             if param_varname not in self.sparse_param_to_height_sections:
                 program.global_block().append_op(
                     type="concat",
@@ -1448,7 +1454,7 @@ to transpile() call.")
             for v in splited_vars:
                 height_sections.append(v.shape[0])
             sparse_param_name = self.grad_name_to_param_name[orig_var.name]
-            if sparse_param_name != self.table_name:
+            if self._is_input_of_remote_sparse_update_op(sparse_param_name):
                 self.sparse_param_to_height_sections[
                     sparse_param_name] = height_sections
             program.global_block()._insert_op(
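In other words, when split_selected_rows splits a sparse gradient, the
transpiler now records one height per split (in endpoint order) for any
parameter that feeds a remote sparse update op. A minimal sketch of that
bookkeeping, with hypothetical names:

    # height_sections_for is a hypothetical helper mirroring the loop above.
    def height_sections_for(splited_vars):
        # each split var keeps shape[0] rows of the original parameter
        return [v.shape[0] for v in splited_vars]

    class _Var(object):
        def __init__(self, shape):
            self.shape = shape

    # e.g. a 1000-row embedding split across two pservers:
    print(height_sections_for([_Var((500, 64)), _Var((500, 64))]))  # [500, 500]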

From 312b7786d919550b55143d070153c3bcd311a2f6 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 00:05:08 +0800
Subject: [PATCH 13/90] clean code

---
 .../distributed_ops/lookup_remote_table_op.cc | 114 --------
 .../distributed_ops/lookup_remote_table_op.h  | 274 ------------------
 paddle/fluid/operators/lookup_table_op.cc     |   2 +-
 paddle/fluid/operators/lookup_table_op.h      |   3 +-
 4 files changed, 3 insertions(+), 390 deletions(-)
 delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
 delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h

diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
deleted file mode 100644
index 5d3a50a44c..0000000000
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h"
-#include "paddle/fluid/framework/var_type_inference.h"
-
-namespace paddle {
-namespace operators {
-
-class LookupRemoteTableOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("W"),
-                   "Input(W) of LookupRemoteTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Ids"),
-                   "Input(Ids) of LookupRemoteTableOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of LookupRemoteTableOp should not be null.");
-
-    auto table_dims = ctx->GetInputDim("W");
-    auto ids_dims = ctx->GetInputDim("Ids");
-    int ids_rank = ids_dims.size();
-
-    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
-                      "The last dimension of the 'Ids' tensor must be 1.");
-
-    auto output_dims =
-        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
-    output_dims.push_back(table_dims[1]);
-    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
-
-    if (ctx->GetOutputsVarType("Out")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      ctx->ShareLoD("Ids", /*->*/ "Out");
-    }
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
-    return framework::OpKernelType(data_type, ctx.device_context());
-  }
-};
-
-class LookupRemoteTableOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("W",
-             "(Tensor) The input represents embedding tensors, "
-             "which is a learnable parameter.");
-    AddInput("Ids",
-             "An input with type int32 or int64 "
-             "contains the ids to be looked up in W. "
-             "The last dimension size must be 1.");
-    AddOutput("Out", "The lookup results, which have the same type as W.");
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
-    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<std::string>>(
-        "epmap",
-        "(string vector, default 127.0.0.1:6164)"
-        "Server endpoints in the order of input variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
-    AddAttr<int64_t>("padding_idx",
-                     "(int64, default -1) "
-                     "If the value is -1, it makes no effect to lookup. "
-                     "Otherwise the given value indicates padding the output "
-                     "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(kNoPadding);
-    // NOTE(minqiyang): grad_inplace is an temporal attribute,
-    // please do NOT set this attribute in python layer.
-    AddAttr<bool>("grad_inplace",
-                  "(boolean, default false) "
-                  "If the grad op reuse the input's variable.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Lookup Remote Table Operator.
-
-This operator is used to perform lookups on the parameter W,
-then concatenated into a dense tensor.
-
-The input Ids can carry the LoD (Level of Details) information,
-or not. And the output only shares the LoD information with input Ids.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(lookup_remote_table, ops::LookupRemoteTableOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  ops::LookupRemoteTableOpMaker);
-
-REGISTER_OP_CPU_KERNEL(lookup_remote_table, ops::LookupRemoteTableKernel<float>,
-                       ops::LookupRemoteTableKernel<double>);
diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
deleted file mode 100644
index 97c8fbfed3..0000000000
--- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <future>  // NOLINT
-#include <ostream>
-#include <set>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/detail/macros.h"
-#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-using DDim = framework::DDim;
-
-constexpr int64_t kNoPadding = -1;
-
-inline size_t GetSectionIndex(int64_t id,
-                              const std::vector<int64_t>& abs_sections) {
-  for (size_t i = 1; i < abs_sections.size(); ++i) {
-    if (id < abs_sections[i]) {
-      return i - 1;
-    }
-  }
-  return abs_sections.size() - 1;
-}
-
-inline std::vector<int64_t> ToAbsoluteSection(
-    const std::vector<int64_t>& height_sections) {
-  std::vector<int64_t> abs_sections;
-  abs_sections.resize(height_sections.size());
-  abs_sections[0] = 0;
-  for (size_t i = 1; i < height_sections.size(); ++i) {
-    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
-  }
-  return abs_sections;
-}
-
-inline std::vector<std::vector<int64_t>> SplitIds(
-    const std::string& id_name, const std::vector<int64_t>& height_section,
-    framework::Scope* scope) {
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
-  std::set<int64_t> all_ids;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    all_ids.insert(id_data[i]);
-  }
-  auto abs_sections = ToAbsoluteSection(height_section);
-  std::vector<std::vector<int64_t>> splited_ids;
-  splited_ids.resize(height_section.size() + 1);
-  for (auto& id : all_ids) {
-    auto section_index = GetSectionIndex(id, abs_sections);
-    splited_ids[section_index].push_back(id - abs_sections[section_index]);
-  }
-  return splited_ids;
-}
-
-inline void SplitIdsIntoMultipleVarsBySection(
-    const std::string& id_name, const std::vector<std::string>& in_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
-
-  auto place = platform::CPUPlace();
-
-  for (size_t i = 0; i < in_var_names.size(); ++i) {
-    auto* id_tensor =
-        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
-    auto& ids = splited_ids[i];
-    if (!ids.empty()) {
-      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
-    }
-  }
-}
-
-inline void MergeMultipleVarsIntoOnBySection(
-    const std::string& id_name, const std::string& out_name,
-    const std::vector<std::string>& out_var_names,
-    const std::vector<int64_t>& height_section,
-    const std::vector<std::vector<int64_t>>& splited_ids,
-    const framework::ExecutionContext& context, framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
-
-  auto cpu_place = platform::CPUPlace();
-
-  auto abs_sections = ToAbsoluteSection(height_section);
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
-  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    id_to_offset[id_data[i]].push_back(i);
-  }
-
-  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
-  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
-
-  for (size_t section_idx = 0; section_idx < out_var_names.size();
-       ++section_idx) {
-    auto& ids_in_this_section = splited_ids[section_idx];
-    auto& prefetch_out_var =
-        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
-    const auto* out_var_data = prefetch_out_var.data<float>();
-    auto& dims = prefetch_out_var.dims();
-
-    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
-    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
-
-    auto row_numel = dims[1];
-
-    for (size_t i = 0; i < dims[0]; ++i) {
-      auto id = ids_in_this_section[i];
-      auto origin_id = id + abs_sections[section_idx];
-      auto& offsets = id_to_offset[origin_id];
-      for (auto& offset : offsets) {
-        // should support GPU tensor
-        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
-                     out_var_data + i * row_numel, sizeof(float) * row_numel);
-      }
-    }
-  }
-}
-
-void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& table_name,
-              const std::vector<std::string>& epmap,
-              const std::vector<int64_t>& height_sections,
-              const framework::ExecutionContext& context) {
-  auto& local_scope = context.scope().NewScope();
-
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(context.GetPlace());
-
-  distributed::RPCClient* rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-          context.Attr<int>("trainer_id"));
-
-  std::vector<std::string> in_var_names;
-  std::vector<std::string> out_var_names;
-  for (size_t i = 0; i < epmap.size(); ++i) {
-    in_var_names.push_back(id_name + "@" + epmap[i]);
-    out_var_names.push_back(out_name + "@" + epmap[i]);
-  }
-
-  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
-                                    splited_ids, &local_scope);
-
-  // create output var in local scope
-  for (auto& name : out_var_names) {
-    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
-  }
-
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < in_var_names.size(); i++) {
-    if (NeedSend(local_scope, in_var_names[i])) {
-      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
-               << " to get " << out_var_names[i] << " back";
-      rets.push_back(rpc_client->AsyncPrefetchVar(
-          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
-    } else {
-      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
-    }
-  }
-  for (size_t i = 0; i < rets.size(); i++) {
-    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-  }
-
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-                                   height_sections, splited_ids, context,
-                                   &local_scope);
-
-  context.scope().DeleteScope(&local_scope);
-}
-
-template <typename T>
-class LookupRemoteTableKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    std::string id_name = context.Inputs("Ids").front();
-    auto* ids_t = context.Input<LoDTensor>("Ids");  // int tensor
-
-    std::string out_name = context.Outputs("Out").front();
-    auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
-
-    std::string table_name = context.Inputs("W").front();
-    auto* table_var = context.InputVar("W");
-
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    int64_t* ids = const_cast<int64_t*>(ids_t->data<int64_t>());
-    int64_t ids_numel = ids_t->numel();
-
-    auto epmap = context.Attr<std::vector<std::string>>("epmap");
-    auto height_sections =
-        context.Attr<std::vector<int64_t>>("height_sections");
-
-    auto& local_scope = context.scope().NewScope();
-
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(context.GetPlace());
-
-    distributed::RPCClient* rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-            context.Attr<int>("trainer_id"));
-
-    std::vector<std::string> in_var_names;
-    std::vector<std::string> out_var_names;
-    for (size_t i = 0; i < epmap.size(); ++i) {
-      in_var_names.push_back(id_name + "@" + epmap[i]);
-      out_var_names.push_back(out_name + "@" + epmap[i]);
-    }
-
-    auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
-    SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
-                                      splited_ids, &local_scope);
-
-    // create output var in local scope
-    for (auto& name : out_var_names) {
-      local_scope.Var(name)->GetMutable<framework::LoDTensor>();
-    }
-
-    std::vector<distributed::VarHandlePtr> rets;
-    for (size_t i = 0; i < in_var_names.size(); i++) {
-      if (NeedSend(local_scope, in_var_names[i])) {
-        VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
-                 << " to get " << out_var_names[i] << " back";
-        rets.push_back(rpc_client->AsyncPrefetchVar(
-            epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
-      } else {
-        VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
-      }
-    }
-    for (size_t i = 0; i < rets.size(); i++) {
-      PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
-    }
-
-    MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-                                     height_sections, splited_ids, context,
-                                     &local_scope);
-
-    context.scope().DeleteScope(&local_scope);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
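The deleted helpers survive in
paddle/fluid/operators/distributed/parameter_prefetch.cc, which the next
patches edit. A Python sketch of the same id-partitioning scheme, assuming one
output section per pserver as the later patches enforce:

    def to_absolute_sections(height_sections):
        # prefix sums: the first global row id owned by each section
        abs_sections, start = [], 0
        for height in height_sections:
            abs_sections.append(start)
            start += height
        return abs_sections

    def split_ids(ids, height_sections):
        abs_sections = to_absolute_sections(height_sections)
        splited = [[] for _ in height_sections]
        for one_id in sorted(set(ids)):  # deduplicate, as SplitIds does
            section = max(i for i, s in enumerate(abs_sections) if one_id >= s)
            splited[section].append(one_id - abs_sections[section])  # local offset
        return splited

    print(split_ids([3, 900, 900, 42], [500, 500]))  # [[3, 42], [400]]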
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 99944b800c..faf91775e4 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -98,7 +98,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
         "epmap",
         "(string vector, default 127.0.0.1:6164)"
         "Server endpoints in the order of input variables for mapping")
-        .SetDefault({"127.0.0.1:6164"});
+        .SetDefault({});
 
     AddComment(R"DOC(
 Lookup Table Operator.
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 335e4adafa..4adb829f20 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -51,10 +51,11 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto out_name = context.Outputs("Out").front();
     auto table_name = context.Inputs("W").front();
     auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
     auto height_sections =
         context.Attr<std::vector<int64_t>>("height_sections");
 
-    if (!epmap.empty()) {
+    if (remote_prefetch) {
 // if emap is not empty, then the paramter will be fetched from remote parameter
 // server
 #ifdef PADDLE_WITH_DISTRIBUTE

From 5856c2f3320181b7001335f0081308527bfa18fe Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 10:13:50 +0800
Subject: [PATCH 14/90] change Var to FindVar

---
 paddle/fluid/operators/distributed/parameter_prefetch.cc | 7 ++++---
 paddle/fluid/operators/lookup_table_op.h                 | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index b7ba938cf1..327c8cb4de 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -63,7 +63,7 @@ inline std::vector<int64_t> ToAbsoluteSection(
 inline std::vector<std::vector<int64_t>> SplitIds(
     const std::string& id_name, const std::vector<int64_t>& height_section,
     framework::Scope* scope) {
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
   auto* id_data = id_tensor.data<int64_t>();
   std::set<int64_t> all_ids;
   for (size_t i = 0; i < id_tensor.numel(); ++i) {
@@ -111,14 +111,15 @@ inline void MergeMultipleVarsIntoOnBySection(
   auto cpu_place = platform::CPUPlace();
 
   auto abs_sections = ToAbsoluteSection(height_section);
-  auto& id_tensor = scope->Var(id_name)->Get<framework::LoDTensor>();
+  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
   auto* id_data = id_tensor.data<int64_t>();
   std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
   for (size_t i = 0; i < id_tensor.numel(); ++i) {
     id_to_offset[id_data[i]].push_back(i);
   }
 
-  auto* out_tensor = scope->Var(out_name)->GetMutable<framework::LoDTensor>();
+  auto* out_tensor =
+      scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
   auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
 
   for (size_t section_idx = 0; section_idx < out_var_names.size();
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 4adb829f20..223de413b2 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -56,7 +56,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
         context.Attr<std::vector<int64_t>>("height_sections");
 
     if (remote_prefetch) {
-// if emap is not empty, then the paramter will be fetched from remote parameter
+// if emap is not empty, then the parameter will be fetched from remote
+// parameter
 // server
 #ifdef PADDLE_WITH_DISTRIBUTE
       operators::distributed::prefetch(id_name, out_name, table_name, epmap,
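The distinction matters because prefetch runs in a fresh local scope: as I
understand Paddle's Scope semantics, Var(name) creates a new, empty variable
in the current scope when the name is absent, while FindVar(name) also
searches parent scopes for the existing tensor. A rough Python analogue
(sketch only, hypothetical types):

    class Scope(object):
        def __init__(self, parent=None):
            self.vars = {}
            self.parent = parent

        def var(self, name):
            # Var: create-if-missing in *this* scope, shadowing any parent copy
            return self.vars.setdefault(name, {})

        def find_var(self, name):
            # FindVar: return the existing variable, walking up parent scopes
            if name in self.vars:
                return self.vars[name]
            return self.parent.find_var(name) if self.parent else None

    root = Scope()
    root.var('ids')['data'] = [1, 2, 3]
    local = Scope(parent=root)
    assert local.find_var('ids')['data'] == [1, 2, 3]  # sees the trainer's ids
    assert local.var('ids') == {}                      # would have shadowed it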

From d827881502592b91a486727f496c40249eee03a4 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 11:34:46 +0800
Subject: [PATCH 15/90] fix pserver and prefetch rpc

---
 paddle/fluid/operators/distributed/grpc_client.cc          | 6 ++++--
 paddle/fluid/operators/distributed/grpc_client.h           | 1 +
 paddle/fluid/operators/distributed/grpc_serde.cc           | 6 +++++-
 paddle/fluid/operators/distributed/grpc_serde.h            | 3 ++-
 paddle/fluid/operators/distributed/parameter_prefetch.cc   | 5 +++--
 paddle/fluid/operators/distributed/request_handler_impl.cc | 5 +++--
 paddle/fluid/operators/distributed/rpc_client.h            | 2 +-
 7 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index c28f86146d..39365dd068 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -169,6 +169,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
                                           const framework::Scope& scope,
                                           const std::string& in_var_name,
                                           const std::string& out_var_name,
+                                          const std::string& table_name,
                                           int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
@@ -184,11 +185,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      s, method, h, this] {
+                      s, method, h, table_name, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
+    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val,
+                          0, table_name);
 
     VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
 
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index d8e9cee85b..a31a465645 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -194,6 +194,7 @@ class GRPCClient : public RPCClient {
                                 const framework::Scope& scope,
                                 const std::string& in_var_name,
                                 const std::string& out_var_name,
+                                const std::string& table_name = "",
                                 int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncSendBatchBarrier(
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index f27b70a5a3..8b3009d39f 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) {
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg, const std::string& out_name,
-                           const int trainer_id) {
+                           const int trainer_id,
+                           const std::string& table_name) {
   platform::RecordRPCEvent record_event("serial", &ctx);
   VarMsg request;
   TensorPayload* payload = nullptr;
@@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   if (!out_name.empty()) {
     request.set_out_varname(out_name);
   }
+  if (!table_name.empty()) {
+    request.set_table_name(table_name);
+  }
   if (var->IsType<framework::LoDTensor>()) {
     request.set_type(::sendrecv::LOD_TENSOR);
     payload = new TensorPayload(GetTensorPayload(var, ctx, &request));
diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h
index 7ec489e961..fe566d9b4c 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@@ -39,7 +39,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg,
                            const std::string& out_varname = std::string(),
-                           const int trainer_id = 0);
+                           const int trainer_id = 0,
+                           const std::string& table_name = std::string());
 
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 327c8cb4de..4d677e30b8 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -84,7 +84,7 @@ inline void SplitIdsIntoMultipleVarsBySection(
     const std::vector<int64_t>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
     framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, "");
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");
 
   auto place = platform::CPUPlace();
 
@@ -184,7 +184,8 @@ void prefetch(const std::string& id_name, const std::string& out_name,
       VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
                << " to get " << out_var_names[i] << " back";
       rets.push_back(rpc_client->AsyncPrefetchVar(
-          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i]));
+          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i],
+          table_name));
     } else {
       VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
     }
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 0f1264ee96..e041337fd9 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -120,12 +120,13 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     const std::string& table_name) {
   VLOG(40) << "RequestPrefetchHandler " << varname;
 
-  auto var_desc = program_->Block(0).FindVar(out_var_name);
-  InitializeVariable(*outvar, var_desc->GetType());
   if (table_name.empty()) {
+    auto var_desc = program_->Block(0).FindVar(out_var_name);
+    InitializeVariable(*outvar, var_desc->GetType());
     executor_->RunPreparedContext(
         (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
   } else {
+    (*outvar)->GetMutable<framework::LoDTensor>();
     auto lookup_table_op =
         BuildLookupTableOp(table_name, varname, out_var_name);
     paddle::platform::CPUPlace cpu_place;
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 1983802e49..4cd3abb5a6 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -48,7 +48,7 @@ class RPCClient {
   virtual VarHandlePtr AsyncPrefetchVar(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& in_var_name,
-      const std::string& out_var_name,
+      const std::string& out_var_name, const std::string& table_name = "",
       int64_t time_out = FLAGS_rpc_deadline) = 0;
 
   virtual VarHandlePtr AsyncSendBatchBarrier(

From 686d15c8e02d3e90437050fdde96004d225f7c29 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 14:53:41 +0800
Subject: [PATCH 16/90] update grpc_variable_response

---
 paddle/fluid/operators/distributed/grpc_client.cc  |  5 +++--
 .../fluid/operators/distributed/grpc_serde_test.cc |  3 ++-
 .../distributed/grpc_variable_response.cc          | 14 ++++++++++++++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 39365dd068..bee6020d5d 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -175,6 +175,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
   const std::string out_var_name_val = out_var_name;
+  const std::string table_name_val = table_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
@@ -185,12 +186,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      s, method, h, table_name, this] {
+                      s, method, h, table_name_val, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
     SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val,
-                          0, table_name);
+                          0, table_name_val);
 
     VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
 
diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc
index 96ea05e74e..1936c2c623 100644
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   math::set_constant(ctx, tensor, 31.9);
 
   ::grpc::ByteBuffer msg;
-  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg,
+                                                "outvar", 0, "table_name");
   EXPECT_GT(msg.Length(), static_cast<size_t>(0));
 
   // deserialize
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc
index d6d219d436..76ad02b030 100644
--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
@@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) {
         meta_.set_trainer_id(trainer_id);
         break;
       }
+      case sendrecv::VariableMessage::kTableNameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_table_name(temp);
+        break;
+      }
       default: {
         // Unknown tag, return unknown error.
         return -1;
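For context, kTableNameFieldNumber arrives as a standard protobuf
length-delimited field: a varint byte length followed by that many raw bytes.
A self-contained sketch of that decoding (assuming the standard protobuf wire
format; not the gRPC code itself):

    def read_length_delimited(buf, pos):
        # decode the varint length, then slice out that many payload bytes
        length, shift = 0, 0
        while True:
            byte = ord(buf[pos:pos + 1])
            pos += 1
            length |= (byte & 0x7F) << shift
            if not byte & 0x80:
                break
            shift += 7
        return buf[pos:pos + length].decode('utf-8'), pos + length

    name, _ = read_length_delimited(b'\x0atable_name', 0)
    print(name)  # table_name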

From ed9fa4b3011bd5e092819776a55922aac8ce9fdf Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 16:17:51 +0800
Subject: [PATCH 17/90] can run

---
 paddle/fluid/operators/distributed/grpc_server.cc        | 2 +-
 paddle/fluid/operators/distributed/parameter_prefetch.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index d5295dc63d..ad74e7a031 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -192,7 +192,7 @@ class RequestPrefetch final : public RequestBase {
     framework::Variable* outvar = scope->Var(out_var_name);
 
     request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
-                             out_var_name);
+                             out_var_name, table_name);
 
     SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                           &reply_);
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 4d677e30b8..23beabc6e1 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -106,7 +106,7 @@ inline void MergeMultipleVarsIntoOnBySection(
     const std::vector<int64_t>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
     const framework::ExecutionContext& context, framework::Scope* scope) {
-  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, "");
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
 
   auto cpu_place = platform::CPUPlace();
 
@@ -185,7 +185,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
                << " to get " << out_var_names[i] << " back";
       rets.push_back(rpc_client->AsyncPrefetchVar(
           epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i],
-          table_name));
+          table_name + ".block" + std::to_string(i)));
     } else {
       VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
     }
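The ".block" suffix appears to match how the transpiler names the split
parameter shards on each pserver; a sketch of the per-endpoint names this
change requests (hypothetical parameter and endpoints):

    table_name = 'emb_w'  # hypothetical parameter name
    epmap = ['127.0.0.1:6174', '127.0.0.1:6175']
    for i, ep in enumerate(epmap):
        # each endpoint serves one shard, named <param>.block<i>
        print('%s -> %s.block%d' % (ep, table_name, i))
    # 127.0.0.1:6174 -> emb_w.block0
    # 127.0.0.1:6175 -> emb_w.block1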

From 5d5e0656b28b1ec9f27c73aff8bb4edcac719c17 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 19:02:23 +0800
Subject: [PATCH 18/90] clean code

---
 .../tests/unittests/test_dist_transpiler.py   | 51 -------------------
 1 file changed, 51 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index dbc4583763..3a905d4a02 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -582,57 +582,6 @@ class TestDistLookupTable(TestDistLookupTableBase):
                          startup_ops)
 
 
-class TestRemoteLookupTable(TestDistLookupTableBase):
-    def net_conf(self):
-        self.network_with_table(
-            is_sparse=True, is_distributed=False, remote_prefetch=True)
-
-    def transpiler_test_impl(self):
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
-
-        self.assertEqual(len(pserver1.blocks), 6)
-        # 0 listen_and_serv
-        # 1 optimize for fc_w or fc_b adam
-        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-        # 4 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table sgd
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["sum", "sgd"])
-        # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
-                         ["lookup_sparse_table"])
-        # 5 save table
-        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
-
-        trainer, trainer_startup = self.get_trainer()
-        self.assertEqual(len(trainer.blocks), 1)
-        ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
-            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
-            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
-            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
-            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'split_selected_rows', 'send',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
-            'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
-        ]
-        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
-        startup_ops = [
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'uniform_random',
-            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
-            'fake_init'
-        ]
-        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
-                         startup_ops)
-
-
 class TestAsyncLocalLookupTable(TestDistLookupTableBase):
     def net_conf(self):
         self.network_with_table(is_sparse=True, is_distributed=False)

From af2f5fc8249bd449a3d90df6e5cce1a63f9c244c Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Mon, 26 Nov 2018 20:28:26 +0800
Subject: [PATCH 19/90] fix some bugs

---
 .../details/multi_devices_graph_pass.cc       |  2 +-
 .../distributed/parameter_prefetch.cc         | 50 +++++++++++--------
 .../fluid/transpiler/distribute_transpiler.py |  3 +-
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 8c98b78130..359064cbf2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -862,7 +862,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
       if (node->Op()->Type() == "fetch_barrier") {
         outvar_dev_id =
             GetVarDeviceID(*result, output->Name(), *sharded_var_device);
-        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
+        PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
       }
       p = places_[outvar_dev_id];
       ir::Node *new_node = nullptr;
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 23beabc6e1..f409b13f01 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -100,7 +100,7 @@ inline void SplitIdsIntoMultipleVarsBySection(
   }
 }
 
-inline void MergeMultipleVarsIntoOnBySection(
+inline void MergeMultipleVarsIntoOneBySection(
     const std::string& id_name, const std::string& out_name,
     const std::vector<std::string>& out_var_names,
     const std::vector<int64_t>& height_section,
@@ -125,25 +125,30 @@ inline void MergeMultipleVarsIntoOnBySection(
   for (size_t section_idx = 0; section_idx < out_var_names.size();
        ++section_idx) {
     auto& ids_in_this_section = splited_ids[section_idx];
-    auto& prefetch_out_var =
-        scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
-    const auto* out_var_data = prefetch_out_var.data<float>();
-    auto& dims = prefetch_out_var.dims();
-
-    PADDLE_ENFORCE_EQ(dims.size(), 2, "");
-    PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
-
-    auto row_numel = dims[1];
-
-    for (size_t i = 0; i < dims[0]; ++i) {
-      auto id = ids_in_this_section[i];
-      auto origin_id = id + abs_sections[section_idx];
-      auto& offsets = id_to_offset[origin_id];
-      for (auto& offset : offsets) {
-        // should support GPU tensor
-        memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place,
-                     out_var_data + i * row_numel, sizeof(float) * row_numel);
+    if (!ids_in_this_section.empty()) {
+      auto& prefetch_out_var =
+          scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+      const auto* out_var_data = prefetch_out_var.data<float>();
+      auto& dims = prefetch_out_var.dims();
+
+      PADDLE_ENFORCE_EQ(dims.size(), 2, "");
+      PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
+
+      auto row_numel = dims[1];
+
+      for (size_t i = 0; i < dims[0]; ++i) {
+        auto id = ids_in_this_section[i];
+        auto origin_id = id + abs_sections[section_idx];
+        auto& offsets = id_to_offset[origin_id];
+        for (auto& offset : offsets) {
+          // should support GPU tensor
+          memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
+                       cpu_place, out_var_data + i * row_numel,
+                       sizeof(float) * row_numel);
+        }
       }
+    } else {
+      VLOG(30) << "ids in this section are empty";
     }
   }
 }
@@ -190,13 +195,14 @@ void prefetch(const std::string& id_name, const std::string& out_name,
       VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
     }
   }
+
   for (size_t i = 0; i < rets.size(); i++) {
     PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
   }
 
-  MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names,
-                                   height_sections, splited_ids, context,
-                                   &local_scope);
+  MergeMultipleVarsIntoOneBySection(id_name, out_name, out_var_names,
+                                    height_sections, splited_ids, context,
+                                    &local_scope);
 
   context.scope().DeleteScope(&local_scope);
 }
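A Python sketch of the guarded merge (hypothetical names; a linear scan stands
in for the id_to_offset map): rows fetched for each non-empty section are
scattered back to every position where the original id occurred.

    def merge_by_section(ids, splited_ids, fetched, abs_sections):
        # one output row (here: a fetched row object) per input id
        out = [None] * len(ids)
        for sec, local_ids in enumerate(splited_ids):
            if not local_ids:      # the emptiness guard this patch adds
                continue
            for row, local_id in enumerate(local_ids):
                origin_id = local_id + abs_sections[sec]
                for offset, one_id in enumerate(ids):
                    if one_id == origin_id:
                        out[offset] = fetched[sec][row]
        return out

    # ids 3 and 42 served by section 0, id 900 (local 400) by section 1:
    print(merge_by_section([3, 900, 900, 42],
                           [[3, 42], [400]],
                           [['row3', 'row42'], ['row900']],
                           [0, 500]))
    # ['row3', 'row900', 'row900', 'row42']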
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ddf7468cdd..59f89e331d 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -444,7 +444,7 @@ class DistributeTranspiler(object):
                 # connect deps to send op in async mode
                 recv_dep_in = self.grad_name_to_send_dummy_out[
                     self.param_name_to_grad_name[param_varname]]
-            all_recv_outputs.extend(splited_var)
+
             # get recv op_role_var, if not splited, the grad should have .trainer suffix
             # if splited, grad should be the original grad var name. ParallelExecutor
             # will use op_role_var to get expected device place to run this op.
@@ -460,6 +460,7 @@ class DistributeTranspiler(object):
                 self._update_remote_sparse_update_op(param_varname,
                                                      height_sections, eps)
             else:
+                all_recv_outputs.extend(splited_var)
                 program.global_block().append_op(
                     type="recv",
                     inputs={"X": [recv_dep_in]},

From fb24690a58190885a4e12b9a3a46e5f91149a3f3 Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Tue, 20 Nov 2018 11:19:51 +0100
Subject: [PATCH 20/90] - conv2d transpose MKL-DNN

test=develop

- Added new header for MKLDNN reuse functionality

- Extended conv2d_transpose GetExpectedKernelType for MKL-DNN support

- Buildable conv transpose mkldnn and conv mkldnn using conv template

- Conv2d transpose roughly implemented and buildable

- Added modifications to conv2d transpose MKLDNN unit tests

- Fix to UT of conv2d transpose mkldnn op

- Wrong type of MKLDNN primitive was chosen for conv2d transpose

- Hacks for conv2d transpose

- UT enabled

- Replaced copying loop with memcpy

- Draft of passing lambda into AcquireMemory

- Made reorder (IOHW->OIHW) to be called only once
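
The last two items can be illustrated outside of MKL-DNN: conv2d_transpose
stores its filter as IOHW while the deconvolution primitive expects OIHW, so
the reorder moves whole h*w planes with memcpy instead of copying elements in
a loop, and it only needs to run once per cached weights memory. A simplified
standalone sketch of that reorder (hypothetical helper, not the exact lambda
added in this patch):

#include <cstring>
#include <vector>

// Reorder a filter tensor from IOHW to OIHW; each contiguous h*w plane is
// moved with a single memcpy.
std::vector<float> ReorderIOHWtoOIHW(const float* src, int i, int o, int h,
                                     int w) {
  std::vector<float> dst(static_cast<size_t>(i) * o * h * w);
  for (int ii = 0; ii < i; ++ii) {
    for (int oo = 0; oo < o; ++oo) {
      const float* plane_src = src + (ii * o + oo) * h * w;
      float* plane_dst = dst.data() + (oo * i + ii) * h * w;
      std::memcpy(plane_dst, plane_src, sizeof(float) * h * w);
    }
  }
  return dst;
}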
---
 .../fluid/operators/batch_norm_mkldnn_op.cc   |   2 +-
 paddle/fluid/operators/conv_mkldnn_op.cc      | 270 +----------
 .../operators/conv_transpose_mkldnn_op.cc     | 299 ++++++++++++
 paddle/fluid/operators/conv_transpose_op.cc   |  32 +-
 paddle/fluid/operators/softmax_mkldnn_op.cc   |   2 +-
 paddle/fluid/platform/mkldnn_helper.h         | 164 -------
 paddle/fluid/platform/mkldnn_reuse.h          | 458 ++++++++++++++++++
 .../test_conv2d_transpose_mkldnn_op.py        |  73 +++
 .../unittests/test_conv2d_transpose_op.py     |   7 +-
 9 files changed, 872 insertions(+), 435 deletions(-)
 create mode 100644 paddle/fluid/operators/conv_transpose_mkldnn_op.cc
 create mode 100644 paddle/fluid/platform/mkldnn_reuse.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py

diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index de641cb08e..c3a4b0824f 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 9e2e2cf818..05e268bf6a 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -28,259 +28,6 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
-class ConvMKLDNNHandler : public platform::MKLDNNHandler {
- public:
-  ConvMKLDNNHandler(
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
-      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-      const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
-    conv_pd_ = conv_pd;
-  }
-
-  ConvMKLDNNHandler(
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
-      std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
-          conv_bwd_data_pd,
-      std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
-          conv_bwd_weights_pd,
-      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-      const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        conv_pd_(conv_pd),
-        conv_bwd_weights_pd_(conv_bwd_weights_pd),
-        conv_bwd_data_pd_(conv_bwd_data_pd) {
-    // If we are in Grad operatgor then update a key with BWD suffix to
-    // distinguish from FWD memory primitives
-    key_ += "-BWD";
-  }
-
-  size_t GetDstMemorySize() const {
-    return conv_pd_->dst_primitive_desc().get_size();
-  }
-
-  mkldnn::memory::format GetDstFormat() const {
-    return static_cast<mkldnn::memory::format>(
-        conv_pd_->dst_primitive_desc().desc().data.format);
-  }
-
-  size_t GetDiffWeightsMemorySize() const {
-    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
-  }
-
-  size_t GetDiffSourceMemorySize() const {
-    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(src_pd, user_pd, user_memory_p,
-                               "@weights-src_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
-                               "@weights-diff_dst_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
-      void* ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
-        "@diff_weights_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
-                               "@data-diff_dst_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
-    auto user_pd = user_weights_memory_p->get_primitive_desc();
-    return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
-                               "@data-weights_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
-      const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
-      void* dst_ptr,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    return this->AcquireMemory(user_residual_memory_p,
-                               this->AcquireDstMemoryFromPrimitive(dst_ptr),
-                               "@residual_data_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
-      void* ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
-    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
-                                            "@dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto src_pd = conv_pd_->src_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
-                               pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false) {
-    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
-    auto weights_pd = conv_pd_->weights_primitive_desc();
-    return this->AcquireMemory(weights_pd, user_weights_pd,
-                               user_weights_memory_p, "@weights_mem_p",
-                               pipeline, is_persistent);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
-    auto bias_pd = conv_pd_->bias_primitive_desc();
-    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
-                               "@bias_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> dst_memory_p) {
-    auto prim_key = key_ + "@conv_p";
-    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
-        dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find convolution primitive in device context");
-    if (conv_p == nullptr) {
-      conv_p = std::make_shared<mkldnn::convolution_forward>(
-          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
-          *(dst_memory_p.get()));
-
-      dev_ctx_.SetBlob(prim_key, conv_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_p;
-  }
-
-  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> bias_memory_p,
-      std::shared_ptr<mkldnn::memory> dst_memory_p) {
-    auto prim_key = key_ + "@conv_p";
-    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
-        dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find convolution primitive in device context");
-    if (conv_p == nullptr) {
-      conv_p = std::make_shared<mkldnn::convolution_forward>(
-          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
-          *(bias_memory_p.get()), *(dst_memory_p.get()));
-
-      dev_ctx_.SetBlob(prim_key, conv_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_p;
-  }
-
-  std::shared_ptr<mkldnn::convolution_backward_weights>
-  AcquireConvolutionBackwardWeights(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
-    auto prim_key = key_ + "@conv_bwd_weights_p";
-    auto conv_bwd_weights_p =
-        std::static_pointer_cast<mkldnn::convolution_backward_weights>(
-            dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE(
-        (conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
-        "Fail to find convolution bwd weights primitive in device context");
-    if (conv_bwd_weights_p == nullptr) {
-      // create backward conv primitive for weights
-      conv_bwd_weights_p =
-          std::make_shared<mkldnn::convolution_backward_weights>(
-              *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
-              *diff_weights_memory_p);
-      dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_bwd_weights_p;
-  }
-
-  std::shared_ptr<mkldnn::convolution_backward_data>
-  AcquireConvolutionBackwardData(
-      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
-    auto prim_key = key_ + "@conv_bwd_data_p";
-    auto conv_bwd_data_p =
-        std::static_pointer_cast<mkldnn::convolution_backward_data>(
-            dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE(
-        (conv_bwd_data_p != nullptr) || (is_reusing_ == false),
-        "Fail to find convolution bwd data primitive in device context");
-    if (conv_bwd_data_p == nullptr) {
-      conv_bwd_data_p = std::make_shared<mkldnn::convolution_backward_data>(
-          *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
-          *diff_src_memory_p);
-      dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_bwd_data_p;
-  }
-
-  // Generate keys for storing/retriving primitives for this operator
-  // TODO(jczaja): Make hashing function more optimial
-  static std::string GetHash(memory::dims& input_dims,     // NOLINT
-                             memory::dims& weights_dims,   // NOLINT
-                             std::vector<int>& strides,    // NOLINT
-                             std::vector<int>& paddings,   // NOLINT
-                             std::vector<int>& dilations,  // NOLINT
-                             int groups, const std::string& suffix) {
-    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
-           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
-           suffix;
-  }
-
- private:
-  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
-  std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
-      conv_bwd_weights_pd_;
-  std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
-      conv_bwd_data_pd_;
-};
-
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -351,7 +98,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
     // Get unique name for storing MKLDNN primitives
-    const std::string key = ConvMKLDNNHandler::GetHash(
+    const std::string key = platform::ConvMKLDNNHandler::GetHash(
         src_tz, weights_tz, strides, paddings, dilations, groups,
         ctx.op().Output("Output"));
     const std::string key_conv_pd = key + "@conv_pd";
@@ -400,7 +147,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     // Save conv_pd/src_memory/weights_memory for backward pass
     if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
-    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
+    platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p =
@@ -616,9 +363,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     // Get an unique name from "argument" name of "Output" variable
     // as well as attributes of primitive to be created
     // This name will be used as key when saving info into device context
-    const std::string key =
-        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
-                                   dilations, groups, ctx.op().Input("Output"));
+    const std::string key = platform::ConvMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Input("Output"));
 
     const std::string key_conv_pd = key + "@conv_pd";
     std::vector<primitive> pipeline;
@@ -673,8 +420,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
             conv_bwd_data_desc, mkldnn_engine, *conv_pd);
 
-    ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd,
-                              dev_ctx, mkldnn_engine, key);
+    platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd,
+                                        conv_bwd_weights_pd, dev_ctx,
+                                        mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p =
diff --git a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc
new file mode 100644
index 0000000000..317d4cebe2
--- /dev/null
+++ b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc
@@ -0,0 +1,299 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+
+template <typename T>
+class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "ConvTransposeMKLDNN works only for inference!. Set is_test = True");
+
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
+    auto* output = ctx.Output<Tensor>("Output");
+
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != mkldnn::memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != mkldnn::memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4,
+                   "Input must have 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4,
+                   "Filter must have 4 dimensions, i.e. OIHW");
+
+    if (bias) {
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != mkldnn::memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
+    }
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
+
+    // TODO(tpatejko): add support for dilation
+    PADDLE_ENFORCE(
+        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
+        "dilation in convolution is not implemented yet");
+
+    const T* input_data = input->data<T>();
+    const T* filter_data = filter->data<T>();
+
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> iohw_weights_tz =
+        paddle::framework::vectorize2int(filter->dims());
+    std::vector<int> weights_tz = iohw_weights_tz;
+    // IOHW -> OIHW
+    weights_tz[0] = iohw_weights_tz[1];
+    weights_tz[1] = iohw_weights_tz[0];
+
+    // Custom Reorder from IOHW to OIHW
+    auto iohw2oihw_reorder =
+        [&iohw_weights_tz](const T* filter_data) -> std::shared_ptr<T> {
+      int o = iohw_weights_tz[1];
+      int c = iohw_weights_tz[0];
+      int h = iohw_weights_tz[2];
+      int w = iohw_weights_tz[3];
+      std::shared_ptr<T> reordered_filter_data(new T[o * c * h * w](),
+                                               std::default_delete<T[]>());
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < o; ++j) {
+          int in_offset = j * h * w + i * o * h * w;
+          int out_offset = j * c * h * w + i * h * w;
+          std::memcpy(&(reordered_filter_data.get())[out_offset],
+                      &filter_data[in_offset], h * w * sizeof(T));
+        }
+      }
+
+      return reordered_filter_data;
+    };
+
+    int g = std::max(groups, 1);
+    if (g > 1) {
+      int o = weights_tz[0];
+      int i = weights_tz[1];
+      int h = weights_tz[2];
+      int w = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = g;
+      weights_tz[1] = o / g;
+      weights_tz[2] = i;
+      weights_tz[3] = h;
+      weights_tz[4] = w;
+    }
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+
+    // Get unique name for storing MKLDNN primitives
+    const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Output("Output"));
+    const std::string key_conv_transpose_pd = key + "@conv_transpose_pd";
+
+    std::vector<mkldnn::primitive> pipeline;
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md =
+        platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType<T>(),
+                                (g == 1) ? mkldnn::memory::format::oihw
+                                         : mkldnn::memory::format::goihw);
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    auto chosen_memory_format =
+        platform::data_format_to_memory_format(data_format);
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
+                               // Currently used whenever bias != nullptr.
+    auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+
+    // create a deconv (conv transpose) primitive descriptor and save it for
+    // usage in backward
+    std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
+        conv_transpose_pd;
+    auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
+                                 : mkldnn::prop_kind::forward_training;
+    if (bias) {
+      bias_tz = paddle::framework::vectorize2int(bias->dims());
+      auto bias_md = platform::MKLDNNMemDesc(
+          bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x);
+      conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
+          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
+          fuse_relu, fwd_prop_kind);
+    } else {
+      conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
+          src_md, weights_md, dst_md, strides, paddings, mkldnn_engine,
+          fuse_relu, fwd_prop_kind);
+    }
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd);
+
+    platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx,
+                                                 mkldnn_engine, key);
+
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p = handler.AcquireSrcMemory(
+        user_src_md, platform::to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, platform::to_void_cast<T>(filter_data),
+        is_test ? iohw2oihw_reorder : platform::user_function());
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline, is_test);
+
+    std::shared_ptr<mkldnn::memory> dst_memory_p;
+
+    auto output_data = output->mutable_data<T>(
+        ctx.GetPlace(), paddle::memory::Allocator::kDefault,
+        handler.GetDstMemorySize());
+    dst_memory_p = handler.AcquireDstMemoryFromPrimitive(
+        platform::to_void_cast<T>(output_data));
+
+    // create convolution op primitive
+    std::shared_ptr<mkldnn::deconvolution_forward> conv_p;
+    if (bias) {
+      const T* bias_data = bias->data<T>();
+      auto user_bias_md =
+          platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::x);
+      auto user_bias_memory_p = handler.AcquireBiasMemory(
+          user_bias_md, platform::to_void_cast<T>(bias_data));
+
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          bias_memory_p, dst_memory_p);
+    } else {
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          dst_memory_p);
+    }
+
+    // push primitive to stream and wait until it's executed
+    pipeline.push_back(*conv_p);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+  }
+
+ private:
+  mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const {
+    mkldnn::primitive_attr conv_attr;
+    mkldnn::post_ops post_operations;
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    if (fuse_relu) {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 0.0f;
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+
+  std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
+  ConvTransposeFwdPrimitiveDesc(
+      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
+      const mkldnn::memory::desc& dst, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const mkldnn::engine& engine,
+      const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const {
+    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
+    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto deconv_desc = mkldnn::deconvolution_forward::desc(
+        fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst,
+        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
+
+    mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
+
+    auto p_conv_transpose_pd =
+        new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
+                                                          deconv_attr, engine);
+
+    return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
+        p_conv_transpose_pd);
+  }
+
+  std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
+  ConvTransposeFwdPrimitiveDesc(
+      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
+      const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst,
+      const std::vector<int>& strides, const std::vector<int>& paddings,
+      const mkldnn::engine& engine, const bool fuse_relu,
+      mkldnn::prop_kind fwd_prop_kind) const {
+    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
+    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto deconv_desc = mkldnn::deconvolution_forward::desc(
+        fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst,
+        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
+
+    mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
+
+    auto p_conv_transpose_pd =
+        new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
+                                                          deconv_attr, engine);
+
+    return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
+        p_conv_transpose_pd);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(conv2d_transpose, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConvTransposeMKLDNNOpKernel<float>);
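
A detail worth calling out in the kernel above: every MKL-DNN object (memory,
reorder, primitive descriptor, primitive) is cached in the device context
under a string key derived from GetHash over the operator's shapes and
attributes, so a second Compute call with the same shapes reuses the cached
objects instead of recreating them. A minimal sketch of that acquire-or-create
pattern, assuming a plain map in place of the real MKLDNNDeviceContext blob
storage:

#include <memory>
#include <string>
#include <unordered_map>

// Create an object once per key and reuse it on subsequent calls.
template <typename T, typename Factory>
std::shared_ptr<T> AcquireOrCreate(
    std::unordered_map<std::string, std::shared_ptr<void>>* blobs,
    const std::string& key, Factory make) {
  auto it = blobs->find(key);
  if (it != blobs->end()) {
    return std::static_pointer_cast<T>(it->second);  // reuse cached object
  }
  std::shared_ptr<T> obj = make();  // first use: create and cache
  (*blobs)[key] = obj;
  return obj;
}

The handler's Acquire* methods in mkldnn_reuse.h below all follow this shape,
with an extra is_reusing_ check to assert that reuse is all-or-nothing per
operator instance.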
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index a916dd3496..fe09b5c17c 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -16,6 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   bool use_cudnn = ctx.Attr<bool>("use_cudnn");
   use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(ctx.GetPlace())) {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+    if (use_cudnn) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
   }
 #endif
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
+#endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout_, library_);
 }
 
 void Conv2DTransposeOpMaker::Make() {
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution transpose operator. "
@@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() {
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index 01819f53e3..d2b1495354 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <iostream>
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/softmax_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 814012e6c1..7685e35bbb 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -106,170 +106,6 @@ inline mkldnn::memory::format GetMKLDNNFormat(
       memory.dst_primitive_desc().desc().data.format);
 }
 
-class MKLDNNHandler {
- public:
-  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-                const std::string& base_key)
-      : dev_ctx_(dev_ctx),
-        engine_(engine),
-        key_(base_key),
-        is_reusing_(false) {}
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
-      mkldnn::memory::primitive_desc mdp, void* ptr,
-      const std::string& suffix) {
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find mem primitive in device context");
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(const mkldnn::memory::desc& md,
-                                                void* ptr,
-                                                const std::string& suffix) {
-    /*Generate key*/
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find mem primitive in device context");
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(
-          mkldnn::memory::primitive_desc{md, engine_}, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      const std::shared_ptr<mkldnn::memory>& user_memory_p,
-      const std::shared_ptr<mkldnn::memory>& target_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto local_key = key_ + suffix;
-    auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(key_reorder_p));
-
-    if (stored_reorder_p) {
-      pipeline.push_back(*stored_reorder_p);
-    } else {
-      auto reorder_p =
-          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-      pipeline.push_back(*reorder_p);
-    }
-
-    return target_memory_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      mkldnn::memory::primitive_desc& mpd,       // NOLINT
-      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false) {
-    // create reorder primitive if the input format is not the preferred one
-    auto local_key = key_ + suffix;
-    auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto target_memory_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find mem primitive in device context");
-    if (target_memory_p == nullptr) {
-      target_memory_p = user_memory_p;
-      std::shared_ptr<mkldnn::primitive> reorder_p;
-      if (mpd != user_mpd) {
-        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
-
-        auto reorder_p =
-            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-        pipeline.push_back(*reorder_p);
-      }
-      dev_ctx_.SetBlob(local_key, target_memory_p);
-    } else if (!is_persistent) {
-      // Make reorder if needed
-      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-          dev_ctx_.GetBlob(key_reorder_p));
-      if (reorder_p != nullptr) {
-        pipeline.push_back(*reorder_p);
-      }
-      is_reusing_ = true;
-    }
-    return target_memory_p;
-  }
-
-  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
-                             const std::string& suffix) {
-    return dims2str(operand_dims) + suffix;
-  }
-
- protected:
-  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
-    std::string dstr = "";
-    for (size_t i = 0; i < operand_dims.size(); ++i) {
-      dstr += std::to_string(operand_dims[i]) + "-";
-    }
-    return dstr;
-  }
-
- protected:
-  const MKLDNNDeviceContext& dev_ctx_;
-  mkldnn::engine engine_;
-  std::string key_;
-  bool is_reusing_;
-};
-
 inline mkldnn::memory::format MKLDNNFormatForSize(
     size_t dims_size, mkldnn::memory::format data_format) {
   if (dims_size == 1) {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
new file mode 100644
index 0000000000..1c6421f3fa
--- /dev/null
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -0,0 +1,458 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+using user_function = std::function<std::shared_ptr<float>(const float*)>;
+
+class MKLDNNHandler {
+ public:
+  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+                const std::string& base_key)
+      : dev_ctx_(dev_ctx),
+        engine_(engine),
+        key_(base_key),
+        is_reusing_(false) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::desc& md, void* ptr,
+      user_function custom_func = {}) {
+    return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::primitive_desc mdp, void* ptr,
+      const std::string& suffix) {
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from an operator instance
+      // should be reused or none of them, so we check for consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  // This overload of AcquireMemory can call a user function, e.g. a custom
+  // reorder or preprocessing routine, if needed
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      const mkldnn::memory::desc& md, void* ptr, const std::string& suffix,
+      user_function custom_func = {}) {
+    /*Generate key*/
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      // Call custom reorder/preprocessing func if available
+      if (custom_func) {
+        auto reordered_data = custom_func(reinterpret_cast<const float*>(ptr));
+        dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data);
+        ptr = reinterpret_cast<void*>(reordered_data.get());
+      }
+
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from an operator instance
+      // should be reused or none of them, so we check for consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      const std::shared_ptr<mkldnn::memory>& user_memory_p,
+      const std::shared_ptr<mkldnn::memory>& target_memory_p,
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+
+    auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+        dev_ctx_.GetBlob(key_reorder_p));
+
+    if (stored_reorder_p) {
+      pipeline.push_back(*stored_reorder_p);
+    } else {
+      auto reorder_p =
+          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+      pipeline.push_back(*reorder_p);
+    }
+
+    return target_memory_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      mkldnn::memory::primitive_desc& mpd,       // NOLINT
+      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
+    // create reorder primitive if the input format is not the preferred one
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+
+    auto target_memory_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (target_memory_p == nullptr) {
+      target_memory_p = user_memory_p;
+      std::shared_ptr<mkldnn::primitive> reorder_p;
+      if (mpd != user_mpd) {
+        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
+        auto reorder_p =
+            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+        pipeline.push_back(*reorder_p);
+      }
+      dev_ctx_.SetBlob(local_key, target_memory_p);
+    } else if (!is_persistent) {
+      // Make reorder if needed
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx_.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        pipeline.push_back(*reorder_p);
+      }
+      is_reusing_ = true;
+    }
+    return target_memory_p;
+  }
+
+  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
+                             const std::string& suffix) {
+    return dims2str(operand_dims) + suffix;
+  }
+
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  }
+
+ protected:
+  const MKLDNNDeviceContext& dev_ctx_;
+  mkldnn::engine engine_;
+  std::string key_;
+  bool is_reusing_;
+};
+
+template <class forward_t, class backward_data_t, class backward_weights_t>
+class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
+ public:
+  ConvMKLDNNTemplateHandler(
+      std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
+    conv_pd_ = conv_pd;
+  }
+
+  ConvMKLDNNTemplateHandler(
+      std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
+      std::shared_ptr<typename backward_data_t::primitive_desc>
+          conv_bwd_data_pd,
+      std::shared_ptr<typename backward_weights_t::primitive_desc>
+          conv_bwd_weights_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        conv_pd_(conv_pd),
+        conv_bwd_weights_pd_(conv_bwd_weights_pd),
+        conv_bwd_data_pd_(conv_bwd_data_pd) {
+    // If we are in the Grad operator then update the key with a BWD suffix to
+    // distinguish it from FWD memory primitives
+    key_ += "-BWD";
+  }
+
+  size_t GetDstMemorySize() const {
+    return conv_pd_->dst_primitive_desc().get_size();
+  }
+
+  mkldnn::memory::format GetDstFormat() const {
+    return static_cast<mkldnn::memory::format>(
+        conv_pd_->dst_primitive_desc().desc().data.format);
+  }
+
+  size_t GetDiffWeightsMemorySize() const {
+    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
+  }
+
+  size_t GetDiffSourceMemorySize() const {
+    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p,
+                               "@weights-src_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@weights-diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
+      void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
+        "@diff_weights_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@data-diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
+    auto user_pd = user_weights_memory_p->get_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
+                               "@data-weights_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
+      const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
+      void* dst_ptr,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    return this->AcquireMemory(user_residual_memory_p,
+                               this->AcquireDstMemoryFromPrimitive(dst_ptr),
+                               "@residual_data_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
+      void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline, is_persistent);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
+    auto bias_pd = conv_pd_->bias_primitive_desc();
+    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
+                               "@bias_mem_p", pipeline);
+  }
+
+  std::shared_ptr<forward_t> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p =
+        std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<forward_t>(*conv_pd_, *(src_memory_p),
+                                           *(weights_memory_p.get()),
+                                           *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  std::shared_ptr<forward_t> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> bias_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p =
+        std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<forward_t>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(bias_memory_p.get()), *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  std::shared_ptr<backward_weights_t> AcquireConvolutionBackwardWeights(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
+    auto prim_key = key_ + "@conv_bwd_weights_p";
+    auto conv_bwd_weights_p = std::static_pointer_cast<backward_weights_t>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE(
+        (conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
+        "Fail to find convolution bwd weights primitive in device context");
+    if (conv_bwd_weights_p == nullptr) {
+      // create backward conv primitive for weights
+      conv_bwd_weights_p = std::make_shared<backward_weights_t>(
+          *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
+          *diff_weights_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_bwd_weights_p;
+  }
+
+  std::shared_ptr<backward_data_t> AcquireConvolutionBackwardData(
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@conv_bwd_data_p";
+    auto conv_bwd_data_p =
+        std::static_pointer_cast<backward_data_t>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE(
+        (conv_bwd_data_p != nullptr) || (is_reusing_ == false),
+        "Fail to find convolution bwd data primitive in device context");
+    if (conv_bwd_data_p == nullptr) {
+      conv_bwd_data_p = std::make_shared<backward_data_t>(
+          *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
+          *diff_src_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_bwd_data_p;
+  }
+
+  // Generate keys for storing/retrieving primitives for this operator
+  // TODO(jczaja): Make hashing function more optimal
+  static std::string GetHash(mkldnn::memory::dims& input_dims,    // NOLINT
+                             mkldnn::memory::dims& weights_dims,  // NOLINT
+                             std::vector<int>& strides,           // NOLINT
+                             std::vector<int>& paddings,          // NOLINT
+                             std::vector<int>& dilations,         // NOLINT
+                             int groups, const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+
+ private:
+  std::shared_ptr<typename forward_t::primitive_desc> conv_pd_;
+  std::shared_ptr<typename backward_weights_t::primitive_desc>
+      conv_bwd_weights_pd_;
+  std::shared_ptr<typename backward_data_t::primitive_desc> conv_bwd_data_pd_;
+};
+
+using ConvMKLDNNHandler =
+    ConvMKLDNNTemplateHandler<mkldnn::convolution_forward,
+                              mkldnn::convolution_backward_data,
+                              mkldnn::convolution_backward_weights>;
+
+using ConvTransposeMKLDNNHandler =
+    ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
+                              mkldnn::deconvolution_backward_data,
+                              mkldnn::deconvolution_backward_weights>;
+}  // namespace platform
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
new file mode 100644
index 0000000000..b33a2b0bf6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+
+
+class TestMKLDNN(TestConv2dTransposeOp):
+    def init_op_type(self):
+        self.is_test = True
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_op_type(self):
+        self.is_test = True
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_op_type(self):
+        self.is_test = True
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+if __name__ == '__main__':
+    unittest.main()
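
These MKL-DNN cases reuse the reference conv2d_transpose tests wholesale: each
subclass only overrides init_op_type to switch on use_mkldnn, and the gradient
checks return early because only the forward MKL-DNN kernel exists at this
point. Any other configuration from the reference suite could be rerun through
MKL-DNN the same way; a sketch, assuming a hypothetical TestWithGroups case
existed in test_conv2d_transpose_op:

    # hypothetical: assumes the reference file defines a TestWithGroups case
    from test_conv2d_transpose_op import TestWithGroups

    class TestMKLDNNWithGroups(TestWithGroups):
        def init_op_type(self):
            self.is_test = True
            self.use_mkldnn = True
            self.data_format = "NCHW"
            self.op_type = "conv2d_transpose"

        def test_check_grad(self):
            return  # no MKL-DNN backward kernel yet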
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 5bb769b168..3b820f6ad7 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -68,8 +68,11 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
 class TestConv2dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
+        self.is_test = False
         self.use_cudnn = False
+        self.use_mkldnn = False
         self.output_size = None
+        self.data_format = "AnyLayout"
         self.init_op_type()
         self.init_test_case()
 
@@ -83,7 +86,9 @@ class TestConv2dTransposeOp(OpTest):
             'groups': self.groups,
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
+            'is_test': self.is_test,
+            'use_mkldnn': self.use_mkldnn,
+            'data_format': self.data_format
         }
         if self.output_size is not None:
             self.attrs['output_size'] = self.output_size

From 06d8777ec2857020da34b60ba359cf9b8e963b4e Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Mon, 26 Nov 2018 18:00:56 +0100
Subject: [PATCH 21/90] Coding style fixes

test=develop
---
 .../fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
index b33a2b0bf6..01a7cd6ca1 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
@@ -52,6 +52,7 @@ class TestMKLDNNWithPad(TestWithPad):
     def test_check_grad_no_filter(self):
         return
 
+
 class TestMKLDNNWithStride(TestWithStride):
     def init_op_type(self):
         self.is_test = True

From d98c59fd2ca226be9f078a7d98a30c8265dcab86 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 27 Nov 2018 09:53:58 +0800
Subject: [PATCH 22/90] support non-sliced variables

---
 .../operators/distributed/parameter_prefetch.cc |  4 ++--
 .../operators/distributed/parameter_prefetch.h  |  2 +-
 paddle/fluid/operators/lookup_table_op.cc       |  6 ++++++
 paddle/fluid/operators/lookup_table_op.h        |  3 ++-
 .../fluid/transpiler/distribute_transpiler.py   | 17 ++++++++++++++---
 5 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index f409b13f01..d2b514b7b4 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -154,7 +154,7 @@ inline void MergeMultipleVarsIntoOneBySection(
 }
 
 void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& table_name,
+              const std::vector<std::string>& table_names,
               const std::vector<std::string>& epmap,
               const std::vector<int64_t>& height_sections,
               const framework::ExecutionContext& context) {
@@ -190,7 +190,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
                << " to get " << out_var_names[i] << " back";
       rets.push_back(rpc_client->AsyncPrefetchVar(
           epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i],
-          table_name + ".block" + std::to_string(i)));
+          table_names[i]));
     } else {
       VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
     }
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
index 9e680ec20b..0693cfc1fd 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -24,7 +24,7 @@ namespace operators {
 namespace distributed {
 
 void prefetch(const std::string& id_name, const std::string& out_name,
-              const std::string& table_name,
+              const std::vector<std::string>& table_names,
               const std::vector<std::string>& epmap,
               const std::vector<int64_t>& height_sections,
               const framework::ExecutionContext& context);
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index faf91775e4..ab6518641b 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -99,6 +99,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
         "(string vector, default 127.0.0.1:6164)"
         "Server endpoints in the order of input variables for mapping")
         .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "table_names",
+        "(string vector, the splited table names that will be fetched from "
+        "parameter server)"
+        "in the order of input variables for mapping")
+        .SetDefault({});
 
     AddComment(R"DOC(
 Lookup Table Operator.
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 223de413b2..12c5f8f1eb 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -54,13 +54,14 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto remote_prefetch = context.Attr<bool>("remote_prefetch");
     auto height_sections =
         context.Attr<std::vector<int64_t>>("height_sections");
+    auto table_names = context.Attr<std::vector<std::string>>("table_names");
 
     if (remote_prefetch) {
 // if emap is not empty, then the parameter will be fetched from remote
 // parameter
 // server
 #ifdef PADDLE_WITH_DISTRIBUTE
-      operators::distributed::prefetch(id_name, out_name, table_name, epmap,
+      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
                                        height_sections, context);
 #else
       PADDLE_THROW(
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 59f89e331d..a1ccb704b2 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -247,10 +247,11 @@ class DistributeTranspiler(object):
         return sparse_update_ops
 
     def _update_remote_sparse_update_op(self, param_varname, height_sections,
-                                        endpint_map):
+                                        endpint_map, table_names):
         for op in self.sparse_update_ops:
             if param_varname in op.input_arg_names:
                 op._set_attr('epmap', endpint_map)
+                op._set_attr('table_names', table_names)
                 op._set_attr('height_sections', height_sections)
                 op._set_attr('trainer_id', self.trainer_id)
 
@@ -326,6 +327,7 @@ class DistributeTranspiler(object):
         # get all sparse update ops
         self.sparse_update_ops = self._get_all_remote_sparse_update_op(
             self.origin_program)
+        # map: sparse update param name -> height sections of its split vars
         self.sparse_param_to_height_sections = dict()
 
         # add distributed attrs to program
@@ -365,6 +367,13 @@ class DistributeTranspiler(object):
                 splited_grad_varname = splited_vars[0].name
                 index = find_op_by_output_arg(
                     program.global_block(), splited_grad_varname, reverse=True)
+                if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
+                    sparse_param_name = self.grad_name_to_param_name[
+                        splited_grad_varname]
+                    if self._is_input_of_remote_sparse_update_op(
+                            sparse_param_name):
+                        self.sparse_param_to_height_sections[
+                            sparse_param_name] = [splited_vars[0].shape[0]]
             elif len(splited_vars) > 1:
                 orig_var = program.global_block().vars[splited_grad_varname]
                 index = find_op_by_output_arg(
@@ -435,9 +444,11 @@ class DistributeTranspiler(object):
         all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             eps = []
+            table_names = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
                 eps.append(eplist[index])
+                table_names.append(var.name)
             if self.sync_mode:
                 recv_dep_in = send_barrier_out
             else:
@@ -457,8 +468,8 @@ class DistributeTranspiler(object):
             if param_varname in self.sparse_param_to_height_sections:
                 height_sections = self.sparse_param_to_height_sections[
                     param_varname]
-                self._update_remote_sparse_update_op(param_varname,
-                                                     height_sections, eps)
+                self._update_remote_sparse_update_op(
+                    param_varname, height_sections, eps, table_names)
             else:
                 all_recv_outputs.extend(splited_var)
                 program.global_block().append_op(
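
The effect of this hunk: the transpiler now records the actual names of the
split variables, index-aligned with the endpoint list, instead of letting the
lookup_table kernel reconstruct remote table names as
table_name + ".block" + i. A small self-contained illustration with made-up
names:

    # hypothetical split of one embedding table across two pservers
    splited_var = ["emb.block0", "emb.block1"]     # names of the split vars
    eplist = ["127.0.0.1:6174", "127.0.0.1:6175"]  # one endpoint per block

    eps, table_names = [], []
    for i, name in enumerate(splited_var):
        eps.append(eplist[i])
        table_names.append(name)  # the var name doubles as the table name

    # eps and table_names become the index-aligned 'epmap' and 'table_names'
    # attributes of the sparse update op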

From 87e4edd2eaebb8b0dc3259ff82797b9551448a34 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 27 Nov 2018 10:40:46 +0800
Subject: [PATCH 23/90] fix grad_varname in remote prefetch

---
 paddle/fluid/operators/lookup_table_op.h                | 6 +++---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 12c5f8f1eb..c1a1ea87a0 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -49,14 +49,14 @@ class LookupTableKernel : public framework::OpKernel<T> {
 
     auto id_name = context.Inputs("Ids").front();
     auto out_name = context.Outputs("Out").front();
-    auto table_name = context.Inputs("W").front();
+
+    // for remote prefetch
     auto epmap = context.Attr<std::vector<std::string>>("epmap");
-    auto remote_prefetch = context.Attr<bool>("remote_prefetch");
     auto height_sections =
         context.Attr<std::vector<int64_t>>("height_sections");
     auto table_names = context.Attr<std::vector<std::string>>("table_names");
 
-    if (remote_prefetch) {
+    if (!height_sections.empty()) {
 // if emap is not empty, then the parameter will be fetched from remote
 // parameter
 // server
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index a1ccb704b2..d163357401 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -369,7 +369,7 @@ class DistributeTranspiler(object):
                     program.global_block(), splited_grad_varname, reverse=True)
                 if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
                     sparse_param_name = self.grad_name_to_param_name[
-                        splited_grad_varname]
+                        grad_varname]
                     if self._is_input_of_remote_sparse_update_op(
                             sparse_param_name):
                         self.sparse_param_to_height_sections[
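
The bug being fixed: grad_name_to_param_name is keyed by the original gradient
name, while splited_grad_varname may already carry a ".blockN" suffix, so the
previous lookup could miss. A toy reproduction with hypothetical names:

    grad_name_to_param_name = {"emb@GRAD": "emb"}  # keyed by original name
    grad_varname = "emb@GRAD"
    splited_grad_varname = "emb@GRAD.block0"       # split name, not a key

    assert grad_name_to_param_name[grad_varname] == "emb"       # new lookup
    assert splited_grad_varname not in grad_name_to_param_name  # old one fails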

From 05917c3c7901e00266e4e57c0c043bcc5beecdf4 Mon Sep 17 00:00:00 2001
From: liuhongyu <phliuhongyu@126.com>
Date: Tue, 27 Nov 2018 15:35:18 +0800
Subject: [PATCH 24/90] add cudnn lstm; test=develop

---
 paddle/fluid/API.spec                      |   1 +
 paddle/fluid/framework/operator.h          |   8 +
 paddle/fluid/operators/cudnn_lstm_op.cc    | 204 +++++++++
 paddle/fluid/operators/cudnn_lstm_op.cu.cc | 491 +++++++++++++++++++++
 paddle/fluid/operators/cudnn_lstm_op.h     |  42 ++
 paddle/fluid/platform/dynload/cudnn.h      |  18 +-
 python/paddle/fluid/layers/nn.py           | 152 +++++++
 7 files changed, 915 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/cudnn_lstm_op.cc
 create mode 100644 paddle/fluid/operators/cudnn_lstm_op.cu.cc
 create mode 100644 paddle/fluid/operators/cudnn_lstm_op.h

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 0a71f15343..550c7ddeb4 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -187,6 +187,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.cudnn_lstm ArgSpec(args=['input', 'init_h', 'init_c', 'batch_size', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'fix_seed', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index bfdfdc56b3..06d3ee9e72 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -174,6 +174,14 @@ class ExecutionContext {
     return op_.Inputs(name).size();
   }
 
+  const std::string InputVarName(const std::string& name) const {
+    return op_.Input(name);
+  }
+
+  const std::string OutputVarName(const std::string& name) const {
+    return op_.Output(name);
+  }
+
   size_t OutputSize(const std::string& name) const {
     return op_.Outputs(name).size();
   }
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
new file mode 100644
index 0000000000..cadc5b8830
--- /dev/null
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -0,0 +1,204 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cudnn_lstm_op.h"
+#include <string>
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+class CudnnLSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(Weight) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("InitH"),
+                   "Input(init_h) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InitC"),
+                   "Input(init_c) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Cache"),
+                   "Input(Cache) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("last_h"),
+                   "Output(last_h) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("last_c"),
+                   "Output(last_c) of LSTM should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3.");
+
+    ctx->SetOutputDim("Out", ctx->GetInputDim("Input"));
+    ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH"));
+    ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC"));
+  }
+};
+
+class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "(Tensor) RNN input tensor, which support variable-time length input "
+        "sequence."
+        "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)"
+        "seq_len is the total time step in this mini-batch (CAN be change in "
+        "different batch)"
+        "batch_size is the instance number of this batch"
+        "input_size is the hidden size of the input."
+        "input_hidden_size and the hidden_size in the next may not be same");
+    AddInput("InitH",
+             "(Tensor) the initial hidden state of the LSTM"
+             "input. This is a tensor with shape (num_layers x batch_size x "
+             "hidden_size)"
+             "and When is_bidirec is True, the shape will be (num_layers*2 x "
+             "batch_size x hidden_size)");
+    AddInput("InitC",
+             "(Tensor) the initial cell state of the LSTm "
+             "input. This is a tensor with shape (num_layers x batch_size x "
+             "hidden_size)"
+             "and When is_bidirec is True, the shape will be (num_layers*2 x "
+             "batch_size x hidden_size)");
+    AddInput("W",
+             "(Tensor) the learnable hidden-hidden weights."
+             " The shape is (N), where N is total weight size of the LSTM. "
+             " cudnn concatenate all the weight to one Tensor");
+    AddInput("Cache",
+             "The cache of dropout op, a RAW type variable including random "
+             "number generator states and some descriptors, which is used in "
+             "cudnn kernel.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor) the hidden state of LSTM operator. "
+              "The shape is ( seq_len x batch_size x hidden_size) if "
+              "is_bidirec is False"
+              "and When is_bidirec is True, the shape will be ( seq_len x "
+              "batch_size x hidden_size * 2) ");
+    AddOutput("last_h",
+              "(Tensor) the hidden state of the last step. "
+              "The shape is ( num_layers x batch_size x hidden_size) if "
+              "is_bidirec is False"
+              "and When is_bidirec is True, the shape will be (num_layers*2 x "
+              "batch_size x hidden_size)");
+    AddOutput("last_c",
+              "(Tensor) the cell state of the last step"
+              "The shape is ( num_layers x batch_size x hidden_size) if "
+              "is_bidirec is False"
+              "and When is_bidirect is True, the shape will be (num_layers*2 x "
+              "batch_size x hidden_size*2)");
+    AddAttr<int>("max_len",
+                 "max length of the LSTM op"
+                 "the first dim of the Input can NOT be greater than max_len")
+        .SetDefault(20);
+    AddAttr<float>(
+        "dropout_prob",
+        "dropout prob of the dropout op"
+        "the dropout ONLY work between lstm layers, not between time steps"
+        "There is no dropout work on the Out tensor")
+        .SetDefault(0.0);
+    AddAttr<bool>("is_bidirec",
+                  "is_bidirec"
+                  "if it is bidirection rnn"
+                  "The will affect the shape of the Out, last_h, and last_c")
+        .SetDefault(false);
+    AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
+    AddAttr<int>("batch_size", "the instance number the batch").SetDefault(10);
+    AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
+    AddAttr<int>("num_layers", "the total layer number of the LSTM")
+        .SetDefault(1);
+    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("fix_seed", "True if it fix dropout seed").SetDefault(false);
+    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
+    AddComment(R"DOC(
+CUDNN LSTM implementation
+
+A four-gate Long Short-Term Memory network with no peephole connections.
+In the forward pass the output h_t and cell output c_t for a given iteration
+can be computed from the recurrent input h_{t-1}, the cell input c_{t-1} and
+the previous layer input x_t, given matrices W, R and biases bW, bR, from the
+following equations:
+
+i_t = σ(W_i · x_t + R_i · h_{t-1} + bW_i + bR_i)
+f_t = σ(W_f · x_t + R_f · h_{t-1} + bW_f + bR_f)
+o_t = σ(W_o · x_t + R_o · h_{t-1} + bW_o + bR_o)
+c'_t = tanh(W_c · x_t + R_c · h_{t-1} + bW_c + bR_c)
+c_t = f_t * c_{t-1} + i_t * c'_t
+h_t = o_t * tanh(c_t)
+
+where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * denotes
+element-wise multiplication, · denotes matrix multiplication, and tanh is the
+hyperbolic tangent function. i_t, f_t, o_t and c'_t denote the input, forget,
+output and new gates respectively.
+
+
+)DOC");
+  }
+};
+
+class CudnnLSTMGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("last_h"),
+                   "Input(last_h) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("last_c"),
+                   "Input(last_c) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("Cache"),
+                   "Input(last_c) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InitH"),
+                   "Input(init_h) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("InitC"),
+                   "Input(init_c) of LSTM should not be null.");
+
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name)) {
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+      }
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("W");
+    SetOutGradDim("InitH");
+    SetOutGradDim("InitC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    cudnn_lstm,
+    ops::CudnnLSTMKernel<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OP_CPU_KERNEL(
+    cudnn_lstm_grad,
+    ops::CudnnLSTMGradKernel<paddle::platform::CPUDeviceContext, float>);
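
The gate equations in the DOC block above can be checked numerically outside
cuDNN. A minimal NumPy sketch of one forward step for a single unidirectional
layer (shapes are arbitrary, and cuDNN actually packs W, R, bW and bR into the
one flat "W" tensor the op takes, so this illustrates the math, not the cuDNN
weight layout):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x_t, h_prev, c_prev, W, R, bW, bR):
        # W: (4*hidden, input), R: (4*hidden, hidden); gate order i, f, o, c'
        gates = W.dot(x_t) + R.dot(h_prev) + bW + bR
        i, f, o, g = np.split(gates, 4)
        i, f, o, g = sigmoid(i), sigmoid(f), sigmoid(o), np.tanh(g)
        c_t = f * c_prev + i * g    # c_t = f_t * c_{t-1} + i_t * c'_t
        h_t = o * np.tanh(c_t)      # h_t = o_t * tanh(c_t)
        return h_t, c_t

    hidden, inp = 4, 3
    rng = np.random.RandomState(0)
    h, c = lstm_step(rng.randn(inp), np.zeros(hidden), np.zeros(hidden),
                     rng.randn(4 * hidden, inp), rng.randn(4 * hidden, hidden),
                     np.zeros(4 * hidden), np.zeros(4 * hidden))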
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
new file mode 100644
index 0000000000..9caf65b53f
--- /dev/null
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -0,0 +1,491 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cudnn_lstm_op.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+struct CudnnRNNCache {
+  CudnnRNNCache() {
+    x_desc_ = NULL;
+    y_desc_ = NULL;
+    dx_desc_ = NULL;
+    dy_desc_ = NULL;
+  }
+  ~CudnnRNNCache() { release(); }
+
+  cudnnRNNDescriptor_t rnn_desc_;
+  cudnnTensorDescriptor_t *x_desc_;
+  cudnnTensorDescriptor_t *y_desc_;
+  cudnnTensorDescriptor_t *dx_desc_;
+  cudnnTensorDescriptor_t *dy_desc_;
+
+  cudnnTensorDescriptor_t hx_desc_;
+  cudnnTensorDescriptor_t cx_desc_;
+  cudnnTensorDescriptor_t hy_desc_;
+  cudnnTensorDescriptor_t cy_desc_;
+
+  cudnnTensorDescriptor_t dhx_desc_;
+  cudnnTensorDescriptor_t dcx_desc_;
+  cudnnTensorDescriptor_t dhy_desc_;
+  cudnnTensorDescriptor_t dcy_desc_;
+
+  cudnnTensorDescriptor_t output_x_desc_;
+  cudnnTensorDescriptor_t output_y_desc_;
+
+  cudnnDropoutDescriptor_t dropout_desc_;
+
+  size_t weights_size_;
+  cudnnFilterDescriptor_t w_desc_;
+  cudnnFilterDescriptor_t dw_desc_;
+
+  size_t workspace_size_;
+  size_t reserve_size_;
+  Tensor reserve_data_;
+  Tensor workspace_data_;
+
+  Tensor dropout_state_;
+
+  size_t max_length_;
+
+  float dropout_prob_;
+  bool is_bidirec_;
+
+  int batch_size_;
+  int input_size_;
+  int hidden_size_;
+  int num_layers_;
+  int seed_;
+
+  void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx,
+            size_t max_len, int batch_size, int input_size, int hidden_size,
+            int num_layers, float dropout_prob, bool is_bidirec, int seed,
+            int weight_numel) {
+    max_length_ = max_len;
+    batch_size_ = batch_size;
+    input_size_ = input_size;
+    hidden_size_ = hidden_size;
+    num_layers_ = num_layers;
+    dropout_prob_ = dropout_prob;
+    is_bidirec_ = is_bidirec;
+    seed_ = seed;
+
+    x_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    y_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    int dim_a[3];
+    int stride_a[3];
+
+    for (size_t i = 0; i < max_length_; ++i) {
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
+      dim_a[0] = batch_size_;
+      dim_a[1] = input_size_;
+      dim_a[2] = 1;
+
+      stride_a[0] = dim_a[2] * dim_a[1];
+      stride_a[1] = dim_a[2];
+      stride_a[2] = 1;
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+
+      dim_a[0] = batch_size_;
+      dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
+      dim_a[2] = 1;
+
+      stride_a[0] = dim_a[2] * dim_a[1];
+      stride_a[1] = dim_a[2];
+      stride_a[2] = 1;
+
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    }
+
+    dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
+    dim_a[1] = batch_size_;
+    dim_a[2] = hidden_size_;
+
+    stride_a[0] = dim_a[2] * dim_a[1];
+    stride_a[1] = dim_a[2];
+    stride_a[2] = 1;
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
+
+    size_t state_size;
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+    dropout_state_.Resize({static_cast<int64_t>(state_size)});
+    auto *dropout_state_data =
+        dropout_state_.mutable_data<uint8_t>(ctx.GetPlace());
+    CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
+        dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
+        seed_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
+        handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
+        CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
+        handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
+
+    PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
+                      "cudnn lstm weight size should be SAME");
+    int dim_w[3];
+    dim_w[0] = weights_size_ / sizeof(float);
+    dim_w[1] = 1;
+    dim_w[2] = 1;
+    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+        w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+        dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
+        handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
+    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
+        handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
+
+    reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
+    reserve_data_.mutable_data<uint8_t>(ctx.GetPlace());
+
+    workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
+    workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
+  }
+
+  void release() {
+    for (size_t i = 0; i < max_length_; ++i) {
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
+    }
+
+    delete[] x_desc_;
+    delete[] y_desc_;
+    delete[] dx_desc_;
+    delete[] dy_desc_;
+
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
+
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const Tensor *x = ctx.Input<Tensor>("Input");
+    const Tensor *init_h = ctx.Input<Tensor>("InitH");
+    const Tensor *init_c = ctx.Input<Tensor>("InitC");
+
+    auto w = ctx.Input<Tensor>("W");
+
+    Tensor *out = ctx.Output<Tensor>("Out");
+    Tensor *last_h = ctx.Output<Tensor>("last_h");
+    Tensor *last_c = ctx.Output<Tensor>("last_c");
+
+    const T *x_data = x->data<T>();
+    const T *init_h_data = init_h->data<T>();
+    const T *init_c_data = init_c->data<T>();
+
+    const T *w_data = w->data<T>();
+
+    T *out_data = out->mutable_data<T>(ctx.GetPlace());
+    T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
+    T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
+
+    size_t max_len = ctx.Attr<int>("max_len");
+    float dropout_prob = ctx.Attr<float>("dropout_prob");
+    bool is_bidirec = ctx.Attr<bool>("is_bidirec");
+    int batch_size = ctx.Attr<int>("batch_size");
+    int input_size = ctx.Attr<int>("input_size");
+    int hidden_size = ctx.Attr<int>("hidden_size");
+    int num_layers = ctx.Attr<int>("num_layers");
+    bool is_test = ctx.Attr<bool>("is_test");
+
+    /*
+    if (is_test) {
+      TensorCopy(*x, ctx.GetPlace(), out);
+      return;
+    }*/
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto *cache_var = ctx.InputVar("Cache");
+    if (!cache_var) {
+      // The RAW type cache variable wouldn't be created and broadcasted on
+      // multi-devices before the first running.
+      // use parent scope to make cache persistable
+      auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
+      auto cache_var_name = ctx.InputVarName("Cache");
+      cache_var = scope->Var(cache_var_name);
+    }
+    CudnnRNNCache *cudnn_rnn_cache = nullptr;
+    if (cache_var->IsInitialized()) {
+      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
+                            ->GetMutable<CudnnRNNCache>();
+    } else {
+      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
+                            ->GetMutable<CudnnRNNCache>();
+      std::random_device rnd;
+      int seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : rnd();
+
+      auto input_w_numel = w->numel();
+      cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
+                            hidden_size, num_layers, dropout_prob, is_bidirec,
+                            seed, input_w_numel);
+    }
+
+    auto run_seq_len = x->dims()[0];
+
+    if (is_test) {
+      // for inference
+      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
+          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
+          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
+          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
+          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
+          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          cudnn_rnn_cache->workspace_size_));
+    } else {
+      // for train
+      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
+          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
+          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
+          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
+          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
+          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          cudnn_rnn_cache->workspace_size_,
+          cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
+          cudnn_rnn_cache->reserve_size_));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *input = ctx.Input<Tensor>("Input");
+    auto *weight = ctx.Input<Tensor>("W");
+    auto *init_h = ctx.Input<Tensor>("InitH");
+    auto *init_c = ctx.Input<Tensor>("InitC");
+    // auto * last_h = ctx.Input<Tensor>("last_h");
+    // auto * last_c = ctx.Input<Tensor>("last_c");
+    auto *out = ctx.Input<Tensor>("Out");
+    auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h"));
+    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c"));
+
+    // auto* init_h = ctx.Input<Tensor>("init_h");
+    // auto* init_c = ctx.Input<Tensor>("init_c");
+
+    auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
+    auto *init_h_grad = ctx.Output<Tensor>(framework::GradVarName("InitH"));
+    auto *init_c_grad = ctx.Output<Tensor>(framework::GradVarName("InitC"));
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto *cache_var = ctx.InputVar("Cache");
+    PADDLE_ENFORCE(cache_var->IsInitialized());
+    CudnnRNNCache *cudnn_rnn_cache =
+        const_cast<framework::Variable *>(cache_var)
+            ->GetMutable<CudnnRNNCache>();
+
+    auto input_dims = input->dims();
+    auto weight_dims = weight->dims();
+    auto init_h_dims = init_h->dims();
+    auto init_c_dims = init_c->dims();
+    in_grad->mutable_data<T>(ctx.GetPlace());
+    weight_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<DeviceContext, T> zero;
+    zero(dev_ctx, in_grad, static_cast<T>(0.0));
+    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
+
+    T *init_h_grad_data = NULL;
+    if (init_h_grad == nullptr) {
+      Tensor init_h_grad_temp;
+      init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
+      zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
+
+      init_h_grad_data = init_h_grad_temp.data<T>();
+    } else {
+      init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
+      zero(dev_ctx, init_h_grad, static_cast<T>(0.0));
+      init_h_grad_data = init_h_grad->data<T>();
+    }
+
+    T *init_c_grad_data = NULL;
+    if (init_c_grad == nullptr) {
+      Tensor init_c_grad_temp;
+      init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
+      zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
+
+      init_c_grad_data = init_c_grad_temp.data<T>();
+    } else {
+      init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
+      zero(dev_ctx, init_c_grad, static_cast<T>(0.0));
+      init_c_grad_data = init_c_grad->data<T>();
+    }
+
+    const T *last_h_grad_data = NULL;
+    if (last_h_grad == nullptr) {
+      Tensor last_h_grad_temp;
+      last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
+      zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
+
+      last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
+    } else {
+      last_h_grad_data = last_h_grad->data<T>();
+    }
+
+    const T *last_c_grad_data = NULL;
+    if (last_c_grad == nullptr) {
+      Tensor last_c_grad_temp;
+      last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
+      zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
+
+      last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
+    } else {
+      last_c_grad_data = last_c_grad->data<T>();
+    }
+
+    const T *out_grad_data = NULL;
+    if (out_grad == nullptr) {
+      Tensor out_grad_temp;
+      out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
+      zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
+
+      out_grad_data = (const T *)out_grad_temp.data<T>();
+    } else {
+      out_grad_data = out_grad->data<T>();
+    }
+
+    // zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
+    // zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
+
+    auto out_data = out->data<T>();
+    // auto out_grad_data = out_grad->data<T>();
+    auto weight_data = weight->data<T>();
+    auto init_h_data = init_h->data<T>();
+    auto init_c_data = init_c->data<T>();
+    auto in_grad_data = in_grad->data<T>();
+
+    auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
+    auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>();
+
+    auto run_seq_len = input_dims[0];
+    PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
+                      "cudnn running seq_len CAN not greater max_lengh");
+    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
+        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+        cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
+        out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
+        cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_,
+        weight_data, cudnn_rnn_cache->hx_desc_, init_h_data,
+        cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_,
+        in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data,
+        cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data,
+        cudnn_rnn_cache->workspace_size_, reserve_data,
+        cudnn_rnn_cache->reserve_size_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
+        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+        cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
+        init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
+        cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_,
+        weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
+        cudnn_rnn_cache->reserve_size_));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    cudnn_lstm,
+    ops::CudnnLSTMGPUKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    cudnn_lstm_grad,
+    ops::CudnnLSTMGPUGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h
new file mode 100644
index 0000000000..fb4b37e46e
--- /dev/null
+++ b/paddle/fluid/operators/cudnn_lstm_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class CudnnLSTMKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {}
+};
+
+template <typename DeviceContext, typename T>
+class CudnnLSTMGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index db62377898..213cd8a9ce 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
   __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
   __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
-  __macro(cudnnGetErrorString);
+  __macro(cudnnGetErrorString);                           \
+  __macro(cudnnCreateDropoutDescriptor);                  \
+  __macro(cudnnDropoutGetStatesSize);                     \
+  __macro(cudnnSetDropoutDescriptor);                     \
+  __macro(cudnnCreateRNNDescriptor);                      \
+  __macro(cudnnSetRNNDescriptor);                         \
+  __macro(cudnnGetRNNParamsSize);                         \
+  __macro(cudnnGetRNNWorkspaceSize);                      \
+  __macro(cudnnGetRNNTrainingReserveSize);                \
+  __macro(cudnnRNNForwardTraining);                       \
+  __macro(cudnnRNNBackwardData);                          \
+  __macro(cudnnRNNBackwardWeights);                       \
+  __macro(cudnnRNNForwardInference);                      \
+  __macro(cudnnDestroyDropoutDescriptor);                 \
+  __macro(cudnnDestroyRNNDescriptor);                     \
+  __macro(cudnnSetRNNDescriptor_v6);
+
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 7af1f380e7..abb82e7505 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -169,6 +169,7 @@ __all__ = [
     'log_loss',
     'add_position_encoding',
     'bilinear_tensor_product',
+    'cudnn_lstm',
 ]
 
 
@@ -466,6 +467,157 @@ def dynamic_lstm(input,
     return hidden, cell
 
 
+def cudnn_lstm(input,
+               init_h,
+               init_c,
+               batch_size,
+               max_len,
+               dropout_prob,
+               input_size,
+               hidden_size,
+               num_layers,
+               is_bidirec=False,
+               dtype='float32',
+               is_test=False,
+               name=None,
+               default_initializer=None,
+               fix_seed=False,
+               seed=0):
+    """
+    CUDNN LSTM implementation
+
+    A four-gate Long Short-Term Memory network with no peephole connections.
+    In the forward pass the output h_t and cell output c_t for a given
+    iteration can be computed from the recurrent input h_{t-1}, the cell input
+    c_{t-1} and the previous layer input x_t, given matrices W, R and biases
+    bW, bR, from the following equations:
+
+    i_t = sigmoid(W_i · x_t + R_i · h_{t-1} + bW_i + bR_i)
+    f_t = sigmoid(W_f · x_t + R_f · h_{t-1} + bW_f + bR_f)
+    o_t = sigmoid(W_o · x_t + R_o · h_{t-1} + bW_o + bR_o)
+    c'_t = tanh(W_c · x_t + R_c · h_{t-1} + bW_c + bR_c)
+    c_t = f_t * c_{t-1} + i_t * c'_t
+    h_t = o_t * tanh(c_t)
+
+    where sigmoid(x) = 1 / (1 + e^-x), * denotes element-wise multiplication,
+    · denotes matrix multiplication, and tanh is the hyperbolic tangent
+    function. i_t, f_t, o_t and c'_t denote the input, forget, output and new
+    gates respectively.
+
+
+    Args:
+        input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
+        init_h(Variable): The initial hidden state of the LSTM                       
+                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
+                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
+        init_c(Variable): The initial cell state of the LSTM.
+                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
+                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
+        batch_size (int): total instance number of the batch
+        max_len (int): max length of the LSTM; the first dim of the input tensor CAN NOT be greater than max_len
+        dropout_prob(float): dropout probability; dropout ONLY works between rnn layers, NOT between time steps.
+                             No dropout is applied to the output of the last rnn layer.
+        input_size (int): hidden size of the input tensor
+        hidden_size (int): hidden size of the LSTM
+        num_layers (int): total layers number of the LSTM
+        is_bidirec (bool): whether the LSTM is bidirectional
+        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
+        is_test (bool): whether it is in the test phase
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
+        default_initializer(Initializer|None): the initializer used to initialize the weight.
+                         If set None, the default initializer will be used.
+
+
+    Returns:
+        rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
+                         if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
+        last_h(Tensor): the hidden state of the last step of LSTM
+                        shape is ( num_layers x batch_size x hidden_size )
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+        last_c(Tensor): the cell state of the last step of LSTM
+                        shape is ( num_layers x batch_size x hidden_size )
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+
+
+    Examples:
+        .. code-block:: python
+
+            input = embedding
+            batch_size = 20
+            max_len = 100
+            dropout_prob = 0.2
+            input_size = 100
+            hidden_size = 150
+            num_layers = 1
+            init_h = layers.fill_constant(
+                [num_layers, batch_size, hidden_size], 'float32', 0.0)
+            init_c = layers.fill_constant(
+                [num_layers, batch_size, hidden_size], 'float32', 0.0)
+
+            rnn_out, last_h, last_c = layers.cudnn_lstm(input, init_h, init_c, batch_size, \
+                    max_len, dropout_prob, input_size, hidden_size, \
+                    num_layers)
+    """
+
+    helper = LayerHelper('cudnn_lstm', **locals())
+
+    weight_size = 0
+    for i in range(num_layers):
+        if i == 0:
+            input_weight_size = (input_size * hidden_size) * 4
+        else:
+            if is_bidirec:
+                input_weight_size = (hidden_size * 2 * hidden_size) * 4
+            else:
+                input_weight_size = (hidden_size * hidden_size) * 4
+
+        hidden_weight_size = (hidden_size * hidden_size) * 4
+
+        if is_bidirec:
+            weight_size += (input_weight_size + hidden_weight_size) * 2
+            weight_size += hidden_size * 8 * 2
+        else:
+            weight_size += input_weight_size + hidden_weight_size
+            weight_size += hidden_size * 8
+
+    weight = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[weight_size],
+        dtype=dtype,
+        default_initializer=default_initializer)
+
+    out = helper.create_variable_for_type_inference(dtype)
+    last_h = helper.create_variable_for_type_inference(dtype)
+    last_c = helper.create_variable_for_type_inference(dtype)
+
+    cache = helper.create_variable(
+        persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
+
+    helper.append_op(
+        type='cudnn_lstm',
+        inputs={
+            'Input': input,
+            'InitH': init_h,
+            'InitC': init_c,
+            'W': weight,
+            'Cache': cache,
+        },
+        outputs={
+            'Out': out,
+            'last_h': last_h,
+            'last_c': last_c,
+        },
+        attrs={
+            'max_len': max_len,
+            'is_bidirec': is_bidirec,
+            'input_size': input_size,
+            'batch_size': batch_size,
+            'hidden_size': hidden_size,
+            'num_layers': num_layers,
+            'is_test': is_test,
+            'dropout_prob': dropout_prob,
+            'fix_seed': fix_seed,
+            'seed': seed,
+        })
+    return out, last_h, last_c
+
+
 def dynamic_lstmp(input,
                   size,
                   proj_size,

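The recurrence above is just a handful of dense operations per time step. Below is a minimal NumPy sketch of a single forward step, assuming unpacked per-gate weights; names like W['i'] and bR['f'] are illustrative only, since cuDNN packs all gate weights and biases into the single flat buffer sized by the weight_size loop in cudnn_lstm (four input matrices, four recurrent matrices and eight bias vectors per layer).

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    # One step of the four-gate, no-peephole LSTM described in the docstring.
    # W/R map gate names to input/recurrent matrices; bW/bR hold the biases.
    def lstm_step(x_t, h_prev, c_prev, W, R, bW, bR):
        i_t = sigmoid(W['i'] @ x_t + R['i'] @ h_prev + bW['i'] + bR['i'])
        f_t = sigmoid(W['f'] @ x_t + R['f'] @ h_prev + bW['f'] + bR['f'])
        o_t = sigmoid(W['o'] @ x_t + R['o'] @ h_prev + bW['o'] + bR['o'])
        c_hat = np.tanh(W['c'] @ x_t + R['c'] @ h_prev + bW['c'] + bR['c'])
        c_t = f_t * c_prev + i_t * c_hat
        h_t = o_t * np.tanh(c_t)
        return h_t, c_t
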
From 084ff6574d8122afa256651aec7abb55260220f0 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 27 Nov 2018 16:40:26 +0800
Subject: [PATCH 25/90] add cudnn lstm; test=develop

---
 python/paddle/fluid/layers/nn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ef46bbb271..d2df819c08 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -525,6 +525,9 @@ def cudnn_lstm(input,
         default_initializer (Initializer|None): the initializer used to initialize the weight.
                          If set None, the default initializer will be used.
 
+        fix_seed (bool): if True, a fixed seed will be used for dropout in the LSTM.
+        seed (int): the dropout seed used in the LSTM when fix_seed is True.
+
 
     Returns:
         rnn_out (Tensor): result of the LSTM hidden; shape is (seq_len x batch_size x hidden_size).

From 97cbec9b74bc9ddf137608e95cecb93f352153d4 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 27 Nov 2018 19:02:29 +0800
Subject: [PATCH 26/90] clean code

---
 .../operators/distributed/parameter_prefetch.cc      | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index d2b514b7b4..13fc0bc0cd 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -37,9 +37,7 @@ using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;
 
-constexpr int64_t kNoPadding = -1;
-
-inline size_t GetSectionIndex(int64_t id,
+static size_t GetSectionIndex(int64_t id,
                               const std::vector<int64_t>& abs_sections) {
   for (size_t i = 1; i < abs_sections.size(); ++i) {
     if (id < abs_sections[i]) {
@@ -49,7 +47,7 @@ inline size_t GetSectionIndex(int64_t id,
   return abs_sections.size() - 1;
 }
 
-inline std::vector<int64_t> ToAbsoluteSection(
+static std::vector<int64_t> ToAbsoluteSection(
     const std::vector<int64_t>& height_sections) {
   std::vector<int64_t> abs_sections;
   abs_sections.resize(height_sections.size());
@@ -60,7 +58,7 @@ inline std::vector<int64_t> ToAbsoluteSection(
   return abs_sections;
 }
 
-inline std::vector<std::vector<int64_t>> SplitIds(
+static std::vector<std::vector<int64_t>> SplitIds(
     const std::string& id_name, const std::vector<int64_t>& height_section,
     framework::Scope* scope) {
   auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
@@ -79,7 +77,7 @@ inline std::vector<std::vector<int64_t>> SplitIds(
   return splited_ids;
 }
 
-inline void SplitIdsIntoMultipleVarsBySection(
+static void SplitIdsIntoMultipleVarsBySection(
     const std::string& id_name, const std::vector<std::string>& in_var_names,
     const std::vector<int64_t>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
@@ -100,7 +98,7 @@ inline void SplitIdsIntoMultipleVarsBySection(
   }
 }
 
-inline void MergeMultipleVarsIntoOneBySection(
+static void MergeMultipleVarsIntoOneBySection(
     const std::string& id_name, const std::string& out_name,
     const std::vector<std::string>& out_var_names,
     const std::vector<int64_t>& height_section,

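The helpers made static above implement the id-to-shard routing used by parameter prefetch: ToAbsoluteSection turns per-shard heights into absolute row offsets, and GetSectionIndex finds the shard that owns a given id. A rough Python mirror of the two, for illustration only:

.. code-block:: python

    # height_sections lists each parameter shard's row count.
    def to_absolute_section(height_sections):
        abs_sections = [0]
        for height in height_sections[:-1]:
            abs_sections.append(abs_sections[-1] + height)
        return abs_sections

    # Returns the index of the shard whose row range contains `id`.
    def get_section_index(id, abs_sections):
        for i in range(1, len(abs_sections)):
            if id < abs_sections[i]:
                return i - 1
        return len(abs_sections) - 1

    # e.g. height_sections = [10, 20] gives abs_sections = [0, 10]:
    # id 7 lives on shard 0, id 13 on shard 1 (local row 3).
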
From 18fd2d01b787b1a3a8d305eb7214f09bac5a59fa Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 27 Nov 2018 19:36:39 +0800
Subject: [PATCH 27/90] update embedding api

---
 paddle/fluid/API.spec            | 2 +-
 python/paddle/fluid/layers/nn.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 0a71f15343..96b38902e8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -50,7 +50,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform
 paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
-paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
+paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype', 'remote_prefetch'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32', False))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d62429e7a8..6b5a55a662 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -285,10 +285,10 @@ def embedding(input,
               size,
               is_sparse=False,
               is_distributed=False,
-              remote_prefetch=False,
               padding_idx=None,
               param_attr=None,
-              dtype='float32'):
+              dtype='float32',
+              remote_prefetch=False):
     """
     **Embedding Layer**
 

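Appending remote_prefetch after dtype, instead of leaving it in the middle of the signature, keeps existing positional calls to embedding working unchanged; callers that want prefetching opt in, ideally by keyword. An illustrative call (shapes and variable names assumed):

.. code-block:: python

    emb = fluid.layers.embedding(
        input=ids,
        size=[10000, 64],
        is_sparse=True,
        remote_prefetch=True)
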
From 953c79caecf66760236767349401b2b595094da1 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 27 Nov 2018 19:49:17 +0800
Subject: [PATCH 28/90] use dist_ctr to test remote_prefetch

---
 python/paddle/fluid/tests/unittests/dist_ctr.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py
index 902dc6544e..088f16a8ac 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
@@ -59,7 +59,8 @@ class TestDistCTR2x2(TestDistRunnerBase):
             param_attr=fluid.ParamAttr(
                 name="deep_embedding",
                 initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=IS_SPARSE)
+            is_sparse=IS_SPARSE,
+            remote_prefetch=True)
         dnn_pool = fluid.layers.sequence_pool(
             input=dnn_embedding, pool_type="sum")
         dnn_out = dnn_pool
@@ -81,7 +82,8 @@ class TestDistCTR2x2(TestDistRunnerBase):
             param_attr=fluid.ParamAttr(
                 name="wide_embedding",
                 initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=IS_SPARSE)
+            is_sparse=IS_SPARSE,
+            remote_prefetch=True)
         lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
 
         merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)

From 1540df51cf5b88a08afec4e94d5fc86d4ddfe5c5 Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Tue, 27 Nov 2018 11:41:53 +0100
Subject: [PATCH 29/90] - Fix to test_conv2d_transpose_mkldnn for GPU

test=develop
---
 paddle/fluid/operators/conv_transpose_op.cc                    | 3 +++
 .../fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index fe09b5c17c..2fdfc40d19 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -256,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() {
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
index 01a7cd6ca1..deefdd09ab 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
@@ -25,6 +25,7 @@ class TestMKLDNN(TestConv2dTransposeOp):
         self.use_mkldnn = True
         self.data_format = "NCHW"
         self.op_type = "conv2d_transpose"
+        self._cpu_only = True
 
     def test_check_grad(self):
         return
@@ -42,6 +43,7 @@ class TestMKLDNNWithPad(TestWithPad):
         self.use_mkldnn = True
         self.data_format = "NCHW"
         self.op_type = "conv2d_transpose"
+        self._cpu_only = True
 
     def test_check_grad(self):
         return
@@ -59,6 +61,7 @@ class TestMKLDNNWithStride(TestWithStride):
         self.use_mkldnn = True
         self.data_format = "NCHW"
         self.op_type = "conv2d_transpose"
+        self._cpu_only = True
 
     def test_check_grad(self):
         return

From 40f68b1349ed10654e27621a7ff39f594e9f8512 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 12:45:02 +0800
Subject: [PATCH 30/90] unit test ready

---
 .../distributed/parameter_prefetch.cc         |  10 +-
 .../distributed/parameter_prefetch.h          |   2 +-
 paddle/fluid/operators/lookup_table_op.cc     |   6 +-
 paddle/fluid/operators/lookup_table_op.h      |   3 +-
 .../unittests/test_lookup_remote_table_op.py  | 146 ++++++++++++++++++
 5 files changed, 156 insertions(+), 11 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 13fc0bc0cd..77cae35313 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -48,7 +48,7 @@ static size_t GetSectionIndex(int64_t id,
 }
 
 static std::vector<int64_t> ToAbsoluteSection(
-    const std::vector<int64_t>& height_sections) {
+    const std::vector<int>& height_sections) {
   std::vector<int64_t> abs_sections;
   abs_sections.resize(height_sections.size());
   abs_sections[0] = 0;
@@ -59,7 +59,7 @@ static std::vector<int64_t> ToAbsoluteSection(
 }
 
 static std::vector<std::vector<int64_t>> SplitIds(
-    const std::string& id_name, const std::vector<int64_t>& height_section,
+    const std::string& id_name, const std::vector<int>& height_section,
     framework::Scope* scope) {
   auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
   auto* id_data = id_tensor.data<int64_t>();
@@ -79,7 +79,7 @@ static std::vector<std::vector<int64_t>> SplitIds(
 
 static void SplitIdsIntoMultipleVarsBySection(
     const std::string& id_name, const std::vector<std::string>& in_var_names,
-    const std::vector<int64_t>& height_section,
+    const std::vector<int>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
     framework::Scope* scope) {
   PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");
@@ -101,7 +101,7 @@ static void SplitIdsIntoMultipleVarsBySection(
 static void MergeMultipleVarsIntoOneBySection(
     const std::string& id_name, const std::string& out_name,
     const std::vector<std::string>& out_var_names,
-    const std::vector<int64_t>& height_section,
+    const std::vector<int>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
     const framework::ExecutionContext& context, framework::Scope* scope) {
   PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
@@ -154,7 +154,7 @@ static void MergeMultipleVarsIntoOneBySection(
 void prefetch(const std::string& id_name, const std::string& out_name,
               const std::vector<std::string>& table_names,
               const std::vector<std::string>& epmap,
-              const std::vector<int64_t>& height_sections,
+              const std::vector<int>& height_sections,
               const framework::ExecutionContext& context) {
   auto& local_scope = context.scope().NewScope();
 
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
index 0693cfc1fd..53b0fbfb51 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -26,7 +26,7 @@ namespace distributed {
 void prefetch(const std::string& id_name, const std::string& out_name,
               const std::vector<std::string>& table_names,
               const std::vector<std::string>& epmap,
-              const std::vector<int64_t>& height_sections,
+              const std::vector<int>& height_sections,
               const framework::ExecutionContext& context);
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index ab6518641b..658b586e4c 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -91,9 +91,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
     // for parameter prefetch
     AddAttr<bool>("remote_prefetch", "").SetDefault(false);
     AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
-    AddAttr<std::vector<int64_t>>("height_sections",
-                                  "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int64_t>({}));
+    AddAttr<std::vector<int>>("height_sections",
+                              "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int>({}));
     AddAttr<std::vector<std::string>>(
         "epmap",
         "(string vector, default 127.0.0.1:6164)"
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index c1a1ea87a0..17286191c2 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -52,8 +52,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
 
     // for remote prefetch
     auto epmap = context.Attr<std::vector<std::string>>("epmap");
-    auto height_sections =
-        context.Attr<std::vector<int64_t>>("height_sections");
+    auto height_sections = context.Attr<std::vector<int>>("height_sections");
     auto table_names = context.Attr<std::vector<std::string>>("table_names");
 
     if (!height_sections.empty()) {
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
new file mode 100644
index 0000000000..95a266540d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@@ -0,0 +1,146 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import signal
+import time
+import unittest
+from multiprocessing import Process
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle.fluid.framework import Program, program_guard
+
+
+def run_pserver(use_cuda, sync_mode):
+    scope = fluid.core.Scope()
+    program = Program()
+    with fluid.scope_guard(scope):
+        with program_guard(program, startup_program=Program()):
+            # create table parameter in scope
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            # create and initialize Param Variable
+            param = scope.var('table').get_tensor()
+            param_array = np.full((10, 8), 5.0).astype("float32")
+            param.set(param_array, place)
+
+            optimize_block = program._create_block(program.global_block().idx)
+            program.global_block().append_op(
+                type="listen_and_serv",
+                inputs={'X': []},
+                outputs={},
+                attrs={
+                    "optimize_blocks": [optimize_block],
+                    "endpoint": '127.0.0.1:0',
+                    "Fanin": 1,
+                    "sync_mode": True,
+                    "grad_to_block_id": []
+                })
+
+            exe = fluid.Executor(place)
+            exe.run(program)
+
+
+class TestListenAndServOp(unittest.TestCase):
+    def setUp(self):
+        self.ps_timeout = 5
+
+    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
+        p = Process(target=pserver_func, args=(use_cuda, sync_mode))
+        p.daemon = True
+        p.start()
+        return p
+
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op writes a file containing the listen port
+                # under /tmp once it is ready to process all RPC calls.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
+    def _get_pserver_port(self, pid):
+        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
+            port = int(f.read().strip())
+        return port
+
+    def _run_lookup_table_op(self, place, port):
+        scope = fluid.core.Scope()
+        program = Program()
+        with fluid.scope_guard(scope):
+            with program_guard(program, startup_program=Program()):
+                # create and initialize Param Variable
+                param = scope.var('W').get_tensor()
+                param_array = np.full((10, 8), 1.0).astype("float32")
+                param.set(param_array, place)
+
+                ids = scope.var('Ids').get_tensor()
+                ids_array = np.array([[1.0], [2.0]]).astype("int64")
+                ids.set(ids_array, place)
+                ids.set_lod([[0, 1, 2]])
+
+                out = scope.var('Out').get_tensor()
+
+                emaps = ['127.0.0.1:' + str(port)]
+                table_names = ['table']
+                height_sections = [10]
+                # create and run lookup_table operator
+                lookup_table_op = Operator(
+                    "lookup_table",
+                    W='W',
+                    Ids='Ids',
+                    Out='Out',
+                    remote_prefetch=True,
+                    epmap=emaps,
+                    table_names=table_names,
+                    height_sections=height_sections)
+                lookup_table_op.run(scope, place)
+
+                # get and compare result
+                result_array = np.array(out)
+
+                print(result_array)
+
+                self.assertTrue((result_array[0] == 5).all())
+                self.assertTrue((result_array[1] == 5).all())
+
+    def test_lookup_remote_table(self):
+        # run pserver on CPU in sync mode
+        p1 = self._start_pserver(False, True, run_pserver)
+        self._wait_ps_ready(p1.pid)
+        port = self._get_pserver_port(p1.pid)
+
+        places = [core.CPUPlace()]
+        # if core.is_compiled_with_cuda():
+        #     places.append(core.CUDAPlace(0))
+        for place in places:
+            self._run_lookup_table_op(place, port)
+
+        # send SIGINT to the pserver
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+
+if __name__ == '__main__':
+    unittest.main()

From b2c9efef2bb10a46b88a99f5daeef20a292b18bc Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 13:35:07 +0800
Subject: [PATCH 31/90] add more unit test for lookup_remote_table test=develop

---
 .../unittests/test_lookup_remote_table_op.py  | 82 ++++++++++++++++---
 1 file changed, 69 insertions(+), 13 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
index 95a266540d..01e9eaf3c8 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@@ -27,7 +27,7 @@ from paddle.fluid.op import Operator
 from paddle.fluid.framework import Program, program_guard
 
 
-def run_pserver(use_cuda, sync_mode):
+def run_pserver(pserver_id, use_cuda, sync_mode):
     scope = fluid.core.Scope()
     program = Program()
     with fluid.scope_guard(scope):
@@ -36,7 +36,10 @@ def run_pserver(use_cuda, sync_mode):
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             # create and initialize Param Variable
             param = scope.var('table').get_tensor()
-            param_array = np.full((10, 8), 5.0).astype("float32")
+
+            param_array = np.ones((10, 8)).astype("float32")
+            for i in range(len(param_array)):
+                param_array[i] *= param_array[i] * i + pserver_id * 10
             param.set(param_array, place)
 
             optimize_block = program._create_block(program.global_block().idx)
@@ -60,8 +63,8 @@ class TestListenAndServOp(unittest.TestCase):
     def setUp(self):
         self.ps_timeout = 5
 
-    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
-        p = Process(target=pserver_func, args=(use_cuda, sync_mode))
+    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
+        p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
         p.daemon = True
         p.start()
         return p
@@ -85,7 +88,7 @@ class TestListenAndServOp(unittest.TestCase):
             port = int(f.read().strip())
         return port
 
-    def _run_lookup_table_op(self, place, port):
+    def _run_lookup_table_op_one_pserver(self, place, port):
         scope = fluid.core.Scope()
         program = Program()
         with fluid.scope_guard(scope):
@@ -96,15 +99,17 @@ class TestListenAndServOp(unittest.TestCase):
                 param.set(param_array, place)
 
                 ids = scope.var('Ids').get_tensor()
-                ids_array = np.array([[1.0], [2.0]]).astype("int64")
+                ids_array = np.array([[1], [2], [5]]).astype("int64")
                 ids.set(ids_array, place)
-                ids.set_lod([[0, 1, 2]])
+                ids_lod = [[0, 1, 2, 3]]
+                ids.set_lod(ids_lod)
 
                 out = scope.var('Out').get_tensor()
 
                 emaps = ['127.0.0.1:' + str(port)]
                 table_names = ['table']
                 height_sections = [10]
+
                 # create and run lookup_table operator
                 lookup_table_op = Operator(
                     "lookup_table",
@@ -120,24 +125,75 @@ class TestListenAndServOp(unittest.TestCase):
                 # get and compare result
                 result_array = np.array(out)
 
-                print(result_array)
+                self.assertEqual(out.lod(), ids_lod)
+                self.assertEqual(list(result_array.shape), [len(ids_array), 8])
+                for i in range(len(ids_array)):
+                    id = ids_array[i][0]
+                    self.assertTrue((result_array[i] == id).all())
 
-                self.assertTrue((result_array[0] == 5).all())
-                self.assertTrue((result_array[1] == 5).all())
+    def _run_lookup_table_op_two_pserver(self, place, port0, port1):
+        scope = fluid.core.Scope()
+        program = Program()
+        with fluid.scope_guard(scope):
+            with program_guard(program, startup_program=Program()):
+                # create and initialize Param Variable
+                param = scope.var('W').get_tensor()
+                param_array = np.full((10, 8), 1.0).astype("float32")
+                param.set(param_array, place)
+
+                ids = scope.var('Ids').get_tensor()
+                ids_array = np.array([[1], [2], [11], [13]]).astype("int64")
+                ids.set(ids_array, place)
+                ids_lod = [[0, 2, 3, 4]]
+                ids.set_lod(ids_lod)
+
+                out = scope.var('Out').get_tensor()
+
+                emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
+                table_names = ['table', 'table']
+                height_sections = [10, 20]
+
+                # create and run lookup_table operator
+                lookup_table_op = Operator(
+                    "lookup_table",
+                    W='W',
+                    Ids='Ids',
+                    Out='Out',
+                    remote_prefetch=True,
+                    epmap=emaps,
+                    table_names=table_names,
+                    height_sections=height_sections)
+                lookup_table_op.run(scope, place)
+
+                # get and compare result
+                result_array = np.array(out)
+                self.assertEqual(out.lod(), ids_lod)
+                self.assertEqual(list(result_array.shape), [len(ids_array), 8])
+                for i in range(len(ids_array)):
+                    id = ids_array[i][0]
+                    self.assertTrue((result_array[i] == id).all())
 
     def test_lookup_remote_table(self):
         # run pserver on CPU in sync mode
-        p1 = self._start_pserver(False, True, run_pserver)
+        p0 = self._start_pserver(0, False, True, run_pserver)
+        self._wait_ps_ready(p0.pid)
+        port0 = self._get_pserver_port(p0.pid)
+
+        p1 = self._start_pserver(1, False, True, run_pserver)
         self._wait_ps_ready(p1.pid)
-        port = self._get_pserver_port(p1.pid)
+        port1 = self._get_pserver_port(p1.pid)
 
         places = [core.CPUPlace()]
         # if core.is_compiled_with_cuda():
         #     places.append(core.CUDAPlace(0))
+
         for place in places:
-            self._run_lookup_table_op(place, port)
+            self._run_lookup_table_op_one_pserver(place, port0)
+            self._run_lookup_table_op_two_pserver(place, port0, port1)
 
         # send SIGINT to the pserver
+        os.kill(p0.pid, signal.SIGINT)
+        p0.join()
         os.kill(p1.pid, signal.SIGINT)
         p1.join()
 

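The assertion result_array[i] == id in the two-pserver test holds because the initialization and the sharding line up: pserver p fills its local row i with i + p * 10, and with height_sections = [10, 20] the second pserver serves absolute rows 10 and above. A small sanity check of that arithmetic, under the sharding layout the test assumes:

.. code-block:: python

    def expected_row_value(abs_id, first_section_height=10):
        pserver_id = 0 if abs_id < first_section_height else 1
        local_row = abs_id - pserver_id * first_section_height
        # run_pserver fills local row i with i + pserver_id * 10
        return local_row + pserver_id * 10

    # every looked-up row should equal its absolute id
    for abs_id in [1, 2, 11, 13]:
        assert expected_row_value(abs_id) == abs_id
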
From c8b0241da22f2745b53f03e65107927b4aa912a8 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 13:44:41 +0800
Subject: [PATCH 32/90] fix code style test=develop

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 7990d3ee0c..5d348f0995 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1287,8 +1287,9 @@ to transpile() call.")
         # create table param and grad var in pserver program
         # create table optimize block in pserver program
         table_opt_op = [
-            op for op in self.optimize_ops if 'Param' in op.input_names and
-            op.input("Param")[0] == self.table_name
+            op for op in self.optimize_ops
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
         ][0]
 
         origin_param_var = self.origin_program.global_block().vars[

From b9d3d75fc41533d80753810d53e05132bcca5619 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 14:23:19 +0800
Subject: [PATCH 33/90] fix prefetch dependency test=develop

---
 paddle/fluid/operators/CMakeLists.txt | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index a824fec1e4..8c8dc7026e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -37,7 +37,13 @@ if (WITH_GPU)
     SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
 endif()
 
-register_operators(EXCLUDES warpctc_op conv_fusion_op lookup_table_op DEPS ${OP_HEADER_DEPS})
+SET(OP_PREFETCH_DEPS "")
+if (WITH_DISTRIBUTE)
+    SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
+endif()
+
+register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+
 
 # warpctc_op needs cudnn 7 above
 if (WITH_GPU AND NOT WIN32)
@@ -55,8 +61,6 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-op_library(lookup_table_op DEPS parameter_prefetch)
-
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)

From 5a660aee7d0102e1c3ca26c6c6ab20ea32a88cdf Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 16:02:39 +0800
Subject: [PATCH 34/90] update log level in parameter prefetch test=develop

---
 paddle/fluid/operators/distributed/parameter_prefetch.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 77cae35313..859ea77a51 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -146,7 +146,7 @@ static void MergeMultipleVarsIntoOneBySection(
         }
       }
     } else {
-      VLOG(30) << "ids in this section is empty";
+      VLOG(3) << "ids in this section is empty";
     }
   }
 }
@@ -184,13 +184,13 @@ void prefetch(const std::string& id_name, const std::string& out_name,
   std::vector<distributed::VarHandlePtr> rets;
   for (size_t i = 0; i < in_var_names.size(); i++) {
     if (NeedSend(local_scope, in_var_names[i])) {
-      VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i]
+      VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
                << " to get " << out_var_names[i] << " back";
       rets.push_back(rpc_client->AsyncPrefetchVar(
           epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i],
           table_names[i]));
     } else {
-      VLOG(30) << "don't send no-initialied variable: " << out_var_names[i];
+      VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
     }
   }
 

From d32de7e6e1d6a37b48d50c6570dd15f14413b674 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 16:14:32 +0800
Subject: [PATCH 35/90] fix code format test=develop

---
 paddle/fluid/operators/distributed/parameter_prefetch.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 859ea77a51..9db5345153 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -185,7 +185,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
   for (size_t i = 0; i < in_var_names.size(); i++) {
     if (NeedSend(local_scope, in_var_names[i])) {
       VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
-               << " to get " << out_var_names[i] << " back";
+              << " to get " << out_var_names[i] << " back";
       rets.push_back(rpc_client->AsyncPrefetchVar(
           epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i],
           table_names[i]));

From 900fbb83f920b0492c7b7aca37f8fc6f8e58295a Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Wed, 28 Nov 2018 11:23:05 +0000
Subject: [PATCH 36/90] add params sync pass

---
 .../inference/analysis/passes/CMakeLists.txt  |  3 +-
 .../passes/ir_analysis_compose_pass.cc        |  1 +
 .../analysis/passes/ir_graph_build_pass.cc    |  7 +-
 .../ir_params_sync_among_devices_pass.cc      | 86 +++++++++++++++++++
 .../ir_params_sync_among_devices_pass.h       | 42 +++++++++
 .../fluid/inference/analysis/passes/passes.cc |  4 +
 .../fluid/inference/api/paddle_pass_builder.h |  6 +-
 7 files changed, 137 insertions(+), 12 deletions(-)
 create mode 100644 paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h

diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index a30c27b118..98334760a6 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -1,6 +1,7 @@
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
+cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager analysis_helper)
+cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
 
 set(analysis_deps ${analysis_deps}
         ir_graph_build_pass
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
index 108cb6f74b..c3a2b3ca1d 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
 void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
   std::vector<std::string> passes({
       "ir_graph_build_pass", "ir_analysis_pass",
+      "ir_params_sync_among_devices_pass",
   });
   for (const auto &pass : passes) {
     VLOG(2) << "Run pass " << pass;
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index d5e0d90de1..740030c3a8 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
   // so that the parameters will be on the same device, or they will keep copying
   // between different devices.
   platform::Place place;
-  if (argument->use_gpu()) {
-    PADDLE_ENFORCE(argument->gpu_device_id_valid());
-    place = platform::CUDAPlace(argument->gpu_device_id());
-  } else {
-    place = platform::CPUPlace();
-  }
+  place = platform::CPUPlace();
 
   if (argument->model_dir_valid()) {
     auto program =
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
new file mode 100644
index 0000000000..e42f135052
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace {
+bool IsPersistable(const framework::VarDesc *var) {
+  if (var->Persistable() &&
+      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
+    return true;
+  }
+  return false;
+}
+}  // namespace
+namespace inference {
+namespace analysis {
+
+void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
+  PADDLE_ENFORCE(argument->scope_valid());
+  PADDLE_ENFORCE(argument->use_gpu_valid());
+
+  platform::Place place;
+
+  // The parameters are on the CPU, so no synchronization is necessary.
+  if (!argument->use_gpu()) return;
+
+  LOG(INFO) << "Sync params from CPU to GPU";
+
+  PADDLE_ENFORCE(argument->gpu_device_id_valid());
+  place = platform::CUDAPlace(argument->gpu_device_id());
+
+  auto *scope = argument->scope_ptr();
+  // Get the program which has been processed by several passes.
+  analysis_program_.reset(
+      new framework::ProgramDesc(argument->ir_analyzed_program()));
+
+  const auto &global_block = analysis_program_->Block(0);
+
+  // sync the params from cpu to gpu.
+  for (auto &var : global_block.AllVars()) {
+    if (IsPersistable(var)) {
+      std::string var_name = var->Name();
+      LOG(INFO) << var_name;
+      auto &t = inference::analysis::GetFromScope<framework::LoDTensor>(
+          *scope, var_name);
+
+      platform::CPUPlace cpu_place;
+      framework::LoDTensor temp_tensor;
+      temp_tensor.Resize(t.dims());
+      temp_tensor.mutable_data<float>(cpu_place);
+
+      // Copy the parameter data to a tmp tensor.
+      TensorCopySync(t, cpu_place, &temp_tensor);
+      // Reallocate the space on the GPU.
+      t.mutable_data<float>(place);
+
+      // Copy parameter data to newly allocated GPU space.
+      TensorCopySync(temp_tensor, place, &t);
+    }
+  }
+}
+
+std::string IrParamsSyncAmongDevicesPass::repr() const {
+  return "ir-params-sync-among-devices-pass";
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
new file mode 100644
index 0000000000..6818887b96
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Sync parameter from CPU to GPU.
+ */
+class IrParamsSyncAmongDevicesPass : public AnalysisPass {
+ public:
+  void RunImpl(Argument *argument) override;
+  std::string repr() const override;
+
+ private:
+  std::unique_ptr<framework::ProgramDesc> analysis_program_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index 2ef515f45f..9245e32cee 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
 
 namespace paddle {
 namespace inference {
@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
                   std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
   passes_.emplace("ir_analysis_compose_pass",
                   std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+  passes_.emplace(
+      "ir_params_sync_among_devices_pass",
+      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 12e3a6f42e..825bee833b 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
 class GpuPassStrategy : public PassStrategy {
  public:
   GpuPassStrategy() : PassStrategy({}) {
-    // TODO(NHZlX) Problem with Data synchronization between GPU and CPU
-    // When running in GPU mode, the parameters are all on GPU. But the
-    // opearations of "conv_bn_fuse_pass" are on CPU.
     passes_.assign({
-        "infer_clean_graph_pass",
-        // "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass", "conv_bn_fuse_pass",
     });
   }
 

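The sync loop in the new pass follows a stage-reallocate-copy pattern: parameter data is first staged in a temporary CPU tensor, the variable's buffer is re-allocated on the CUDA place, and the staged data is copied into the new GPU buffer. A toy sketch of that pattern with NumPy arrays standing in for tensors (the real pass uses TensorCopySync and mutable_data):

.. code-block:: python

    import numpy as np

    # `scope` maps var names to host buffers; `gpu` stands in for device memory.
    def sync_param_to_gpu(scope, var_name, gpu):
        t = scope[var_name]
        temp = np.array(t, copy=True)        # 1) stage data in a temp CPU buffer
        gpu[var_name] = np.empty_like(temp)  # 2) re-allocate space on the device
        gpu[var_name][...] = temp            # 3) copy the staged data over
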
From d666c8eb1de356903bf91c69df4a4045dabbd933 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Wed, 28 Nov 2018 11:25:52 +0000
Subject: [PATCH 37/90] fix benchmark

---
 paddle/fluid/inference/utils/benchmark.cc | 2 +-
 paddle/fluid/inference/utils/benchmark.h  | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc
index 021edc2de5..d03aa11b75 100644
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
   ss << batch_size_ << "\t";
   ss << num_threads_ << "\t";
   ss << latency_ << "\t";
-  ss << 1000 / latency_;
+  ss << 1000.0 / latency_;
   ss << '\n';
   return ss.str();
 }
diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h
index 80e8f77adb..76a3dd2c29 100644
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -11,9 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#pragma once
 
 #include <fstream>
 #include <iostream>
+#include <string>
 
 namespace paddle {
 namespace inference {
@@ -31,8 +33,8 @@ struct Benchmark {
   bool use_gpu() const { return use_gpu_; }
   void SetUseGpu() { use_gpu_ = true; }
 
-  int latency() const { return latency_; }
-  void SetLatency(int x) { latency_ = x; }
+  float latency() const { return latency_; }
+  void SetLatency(float x) { latency_ = x; }
 
   const std::string& name() const { return name_; }
   void SetName(const std::string& name) { name_ = name; }
@@ -43,7 +45,7 @@ struct Benchmark {
  private:
   bool use_gpu_{false};
   int batch_size_{0};
-  int latency_;
+  float latency_;
   int num_threads_{1};
   std::string name_;
 };

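Besides widening latency_ to float, the serialization fix replaces the literal 1000 with 1000.0: with both operands integral, C++ evaluates 1000 / latency_ as integer division and the reported throughput silently loses its fractional part. The effect, mimicked in Python:

.. code-block:: python

    latency_ms = 7
    qps_truncated = 1000 // latency_ms   # 142, what the old integer code produced
    qps_exact = 1000.0 / latency_ms      # 142.857..., what the fix now reports
    print(qps_truncated, round(qps_exact, 3))
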
From 4b9082a4cddbd8cbc10c061569d481c2ac8a21fa Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 28 Nov 2018 22:16:33 +0800
Subject: [PATCH 38/90] follow comment

---
 paddle/fluid/operators/lookup_table_op.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 17286191c2..3a73a7637c 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -24,9 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/blas.h"
 
 #ifdef PADDLE_WITH_DISTRIBUTE
-
 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
-
 #endif
 
 namespace paddle {
@@ -55,8 +53,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto height_sections = context.Attr<std::vector<int>>("height_sections");
     auto table_names = context.Attr<std::vector<std::string>>("table_names");
 
-    if (!height_sections.empty()) {
-// if emap is not empty, then the parameter will be fetched from remote
+    if (!epmap.empty()) {
+// if epmap is not empty, then the parameter will be fetched from remote
 // parameter
 // server
 #ifdef PADDLE_WITH_DISTRIBUTE

From 1d19eb2bd440bd391d292d207fe7c8351fce4d99 Mon Sep 17 00:00:00 2001
From: baojun-nervana <baojun.liu@intel.com>
Date: Thu, 15 Nov 2018 16:48:01 -0800
Subject: [PATCH 39/90] Implemented ngraph engine test=develop

---
 paddle/fluid/framework/CMakeLists.txt     |   3 +-
 paddle/fluid/framework/ngraph_bridge.cc   |  88 +++++-
 paddle/fluid/framework/ngraph_bridge.h    |  14 +-
 paddle/fluid/framework/ngraph_operator.cc | 363 +++++++++++++++++++++-
 paddle/fluid/framework/ngraph_operator.h  |   7 +-
 5 files changed, 449 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 52946c7f11..94b3af9159 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -126,8 +126,9 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+
 if(NOT WIN32)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
 cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
   shape_inference data_transform lod_tensor profiler)
 endif(NOT WIN32)
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index 8177436d0b..45ef0211ad 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -15,23 +15,105 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
+#include <vector>
 
 #include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
 
 #include "ngraph/ngraph.hpp"
 
 namespace paddle {
 namespace framework {
 
+static std::shared_ptr<ngraph::Node> GetNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const VariableNameMap& var_map,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = var_map.at(prm);
+  PADDLE_ENFORCE_EQ(var_names.size(), 1,
+                    "op %s prm %s expects one associated var", op->Type(), prm);
+  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
+    return (*ngb_node_map)[var_names[0]];
+  } else {
+    return nullptr;
+  }
+}
+
+static std::shared_ptr<ngraph::Node> GetInputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+}
+
+static std::shared_ptr<ngraph::Node> GetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+}
+
+static void SetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    std::shared_ptr<ngraph::Node> node,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = op->Outputs().at(prm);
+  if (var_names.size() == 1) {
+    (*ngb_node_map)[var_names[0]] = node;
+  } else if (var_names.size() == 0) {
+    (*ngb_node_map)[""] = node;
+  } else {
+    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+  }
+}
+
+static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
+                      const std::string prm) {
+  auto& outputs = op->Outputs();
+  if (outputs.find(prm) == outputs.end()) return false;
+  return outputs.at(prm).size() > 0;
+}
+
+template <typename T>
+static void BuildBinaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = GetInputNode(op, "X", ngb_node_map);
+  auto y = GetInputNode(op, "Y", ngb_node_map);
+  auto out = std::make_shared<T>(x, y);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+template <typename T>
+static void BuildUnaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto input = GetInputNode(op, "X", ngb_node_map);
+  auto out = std::make_shared<T>(input);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
 std::map<std::string,
          std::function<void(const std::shared_ptr<OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {};
+    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
+                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
 
-void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgGraph(const std::shared_ptr<OperatorBase>& op) {
   auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map);
+  NG_NODE_MAP[op_type](op, ngb_node_map_);
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
index 55bf0d21f3..3cf62b6daa 100644
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -20,16 +20,14 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <unordered_map>
-#include <vector>
 
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#include "ngraph/ngraph.hpp"
+#include "ngraph/node.hpp"
 
 namespace paddle {
 namespace framework {
 
+class OperatorBase;
+
 class NgraphBridge {
  public:
   static std::map<
@@ -43,14 +41,14 @@ class NgraphBridge {
       std::shared_ptr<
           std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
           var_node_map)
-      : ngb_node_map(var_node_map) {}
+      : ngb_node_map_(var_node_map) {}
 
-  void build_graph(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgGraph(const std::shared_ptr<OperatorBase>& op);
 
  private:
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      ngb_node_map;
+      ngb_node_map_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index d967b2780c..e9ff051355 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -19,14 +19,29 @@ limitations under the License. */
 #include <map>
 
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/ngraph_operator.h"
-#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/framework/var_type.h"
 
+#include "ngraph/ngraph.hpp"
+
 namespace paddle {
 namespace framework {
 
+static ngraph::Shape Ddim2Shape(const DDim& dims) {
+  ngraph::Shape sp;
+  for (int i = 0; i < dims.size(); ++i) {
+    int k = dims[i];
+    k = k == 0 ? 1 : k;
+    sp.push_back(k);
+  }
+  return sp;
+}
+
 static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
     {proto::VarType::FP32, ngraph::element::f32},
     {proto::VarType::FP64, ngraph::element::f64},
@@ -59,13 +74,23 @@ class NgraphOperator {
         persistables_(persist),
         fetches_(fetches),
         post_op_inputs_(post_op_inputs),
-        ng_op_state_(ng_op_state) {}
+        ng_op_state_(ng_op_state) {
+    var_in_node_map_ = std::make_shared<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
+
+    var_node_map_ = std::make_shared<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
+
+    BuildNgIO();
+
+    GetNgFunction();
+  }
 
   void Run(const Scope& scope, const platform::Place& place) const;
 
  private:
   static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-      func_cache;
+      func_cache_;
   const Scope& scope_;
   const platform::Place& place_;
   std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
@@ -74,6 +99,35 @@ class NgraphOperator {
   std::unordered_set<std::string> fetches_;
   std::unordered_set<std::string> post_op_inputs_;
   op_state ng_op_state_;
+
+  static std::shared_ptr<ngraph::runtime::Backend> backend_;
+
+  std::shared_ptr<ngraph::Function> ngraph_function_;
+  // var_name of inputs
+  std::vector<std::string> var_in_;
+  // var_name of outputs, kept in fetch order
+  std::vector<std::string> var_out_;
+
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      var_in_node_map_;
+
+  // map each var name to an ngraph node
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      var_node_map_;
+
+  std::shared_ptr<std::string> GetCacheKey();
+
+  void GetNgInputShape(std::shared_ptr<OperatorBase> op);
+
+  void BuildNgNode();
+
+  void BuildNgIO();
+
+  void BuildNgFunction();
+
+  void GetNgFunction();
 };
 
 std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
@@ -86,7 +140,7 @@ FusedOperator::FusedOpIntervals(
   }
   size_t size = ops->size();
   size_t left = 0;
-  while (left < size && ops.at(left)->Type() != kFeedOpType) {
+  while (left < size && ops->at(left)->Type() != kFeedOpType) {
     ++left;
   }
   if (left == size) {
@@ -116,7 +170,7 @@ FusedOperator::FusedOpIntervals(
       size_t start = pivot, end = start;
       while (pivot < right &&
              (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
-                  ops.at(pivot)->Type()) !=
+                  ops->at(pivot)->Type()) !=
               paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
         ++pivot;
         ++end;
@@ -136,7 +190,9 @@ FusedOperator::FusedOperator(
     std::vector<std::unique_ptr<OperatorBase>>::iterator end,
     const std::string& type, const VariableNameMap& inputs,
     const VariableNameMap& outputs, const AttributeMap& attrs)
-    : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
+    : OperatorBase(type, inputs, outputs, attrs),
+      pdesc_(prog),
+      block_(block_id) {
   for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
        it != end; ++it) {
     fused_ops_.push_back(std::move(*it));
@@ -152,7 +208,7 @@ FusedOperator::FusedOperator(
   }
 
   if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
-    is_complete = true;
+    is_full_ = true;
   }
 
   Process();
@@ -205,7 +261,7 @@ void FusedOperator::RunImpl(const Scope& scope,
     }
   }
 
-  if (is_full) {
+  if (is_full_) {
     ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
   }
 
@@ -215,6 +271,297 @@ void FusedOperator::RunImpl(const Scope& scope,
   ngraph_op.Run(scope, place);
 }
 
+std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+    NgraphOperator::func_cache_ = {};
+
+std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
+    ngraph::runtime::Backend::create("CPU");
+
+void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
+  RuntimeInferShapeContext infer_shape_ctx(*op, scope_);
+  std::shared_ptr<OperatorWithKernel> op_k =
+      std::dynamic_pointer_cast<OperatorWithKernel>(op);
+  op_k->InferShape(&infer_shape_ctx);
+
+  for (auto& var_name_item : op->Inputs()) {
+    std::vector<ngraph::Shape> vshape;
+    auto& var_prm_name = var_name_item.first;
+    auto var_name_size = var_name_item.second.size();
+    if (var_name_size == 1) {
+      auto dim = infer_shape_ctx.GetInputDim(var_prm_name);
+      vshape.push_back(Ddim2Shape(dim));
+    } else if (var_name_item.second.size() > 1) {
+      auto vdim = infer_shape_ctx.GetInputsDim(var_prm_name);
+      PADDLE_ENFORCE_EQ(vdim.size(), var_name_item.second.size(),
+                        "Need dim info for each var");
+      for (auto& dim : vdim) {
+        vshape.push_back(Ddim2Shape(dim));
+      }
+    } else {
+      // 0-sized input slot, e.g. an absent conv2d Bias
+    }
+
+    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+      auto var_name = var_name_item.second.at(i);
+      if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
+          var_in_.end()) {
+        if (var_node_map_->find(var_name) == var_node_map_->end()) {
+          auto ng_type = var_type_map_.at(var_name);
+          auto prm = std::make_shared<ngraph::op::Parameter>(
+              ng_type, vshape.at(i), true);
+          (*var_node_map_)[var_name] = prm;
+          (*var_in_node_map_)[var_name] = prm;
+        }
+      }
+    }
+  }
+}
+
+void NgraphOperator::BuildNgNode() {
+  for (auto& var_name : var_out_) {
+    if (var_node_map_->find(var_name) == var_node_map_->end()) {
+      auto* var = scope_.FindVar(var_name);
+      if (var && VarIsTensor(*var)) {
+        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+        auto& ddim = tensor_pd->dims();
+        auto ng_shape = Ddim2Shape(ddim);
+        auto ng_type = var_type_map_.at(var_name);
+        auto prm =
+            std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
+        (*var_node_map_)[var_name] = prm;
+      }
+    }
+  }
+
+  paddle::framework::NgraphBridge ngb(var_node_map_);
+  for (auto& op : fused_ops_) {
+    ngb.BuildNgGraph(op);
+  }
+}
+
+void NgraphOperator::BuildNgIO() {
+  std::unordered_set<std::string> inputs;
+  std::unordered_set<std::string> outputs;
+
+  for (auto& op : fused_ops_) {
+    for (auto& var_name_item : op->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        inputs.insert(var_name);
+        const bool is_output = outputs.find(var_name) != outputs.end();
+        if (!is_output &&
+            std::find(var_in_.begin(), var_in_.end(), var_name) ==
+                var_in_.end()) {
+          // fill var_in here to keep lhs and rhs order
+          var_in_.push_back(var_name);
+        }
+      }
+    }
+
+    if (op->Type() != "fill_constant") {
+      GetNgInputShape(op);
+    }
+
+    for (auto& var_name_item : op->Outputs()) {
+      PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
+                        "op %s has more than 1 output - Not handling yet",
+                        op->Type());
+      for (auto& var_name : var_name_item.second) {
+        outputs.insert(var_name);
+      }
+    }
+  }
+
+  for (auto& op : fused_ops_) {
+    for (auto& var_name_item : op->Outputs()) {
+      PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
+                        "op %s has more than 1 output - Not handling yet",
+                        op->Type());
+      for (auto& var_name : var_name_item.second) {
+        switch (ng_op_state_) {
+          case PARTIAL_TEST:
+            if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
+                fetches_.find(var_name) != fetches_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          case FULL_TEST:
+            if (fetches_.find(var_name) != fetches_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          case PARTIAL_TRAIN:
+            if (fetches_.find(var_name) != fetches_.end() ||
+                post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
+                persistables_.find(var_name) != persistables_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          case FULL_TRAIN:
+            if (fetches_.find(var_name) != fetches_.end() ||
+                persistables_.find(var_name) != persistables_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          default:
+            var_out_.push_back(var_name);
+        }
+      }
+    }
+  }
+}
+
+void NgraphOperator::BuildNgFunction() {
+  BuildNgNode();
+  ngraph_function_ = nullptr;
+  ngraph::NodeVector func_outputs;
+  ngraph::op::ParameterVector func_inputs;
+
+  for (auto& vo : var_out_) {
+    func_outputs.push_back(var_node_map_->at(vo));
+  }
+
+  for (auto& vi : var_in_) {
+    std::shared_ptr<ngraph::op::Parameter> prm =
+        std::dynamic_pointer_cast<ngraph::op::Parameter>(
+            var_in_node_map_->at(vi));
+    func_inputs.push_back(prm);
+  }
+
+  ngraph_function_ =
+      std::make_shared<ngraph::Function>(func_outputs, func_inputs);
+}
+
+std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
+  auto cache_key = std::make_shared<std::string>("");
+  *cache_key += std::to_string(fused_ops_.size());
+  for (auto& op : fused_ops_) {
+    *cache_key += op->Type();
+  }
+  for (auto& var_name : var_in_) {
+    auto shape = var_node_map_->at(var_name)->get_shape();
+    *cache_key += var_name;
+    *cache_key += var_type_map_.at(var_name).c_type_string();
+    for (size_t i = 0; i < shape.size(); ++i) {
+      *cache_key += std::to_string(shape.at(i));
+    }
+  }
+
+  for (auto& var_name : var_out_) {
+    auto* var = scope_.FindVar(var_name);
+    if (var && VarIsTensor(*var)) {
+      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      auto& ddim = tensor_pd->dims();
+      for (int i = 0; i < ddim.size(); ++i) {
+        *cache_key += std::to_string(ddim[i]);
+      }
+    }
+  }
+  return cache_key;
+}
+
+void NgraphOperator::GetNgFunction() {
+  bool cache_on = true;
+  if (cache_on) {
+    std::string cache_key_val = *GetCacheKey();
+    if (func_cache_.find(cache_key_val) != func_cache_.end()) {
+      ngraph_function_ = func_cache_.at(cache_key_val);
+    } else {
+      BuildNgFunction();
+      func_cache_[cache_key_val] = ngraph_function_;
+    }
+  } else {
+    BuildNgFunction();
+  }
+}
+
+void NgraphOperator::Run(const Scope& scope,
+                         const platform::Place& place) const {
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
+
+  for (size_t i = 0; i < var_in_.size(); ++i) {
+    auto vi = var_in_.at(i);
+    auto sp = var_node_map_->at(vi)->get_shape();
+    std::shared_ptr<ngraph::runtime::Tensor> ti;
+    auto* var = scope.FindVar(vi);
+    if (var && VarIsTensor(*var)) {
+      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
+                     "Ensure ngraph tensor layout align with paddle tensor");
+      if (tensor_pd->type().hash_code() ==
+          typeid(float).hash_code()) {  // NOLINT
+        const float* arr = tensor_pd->data<float>();
+        ti = backend_->create_tensor(ngraph::element::f32, sp,
+                                     const_cast<float*>(arr));
+      } else if (tensor_pd->type().hash_code() ==
+                 typeid(int).hash_code()) {  // NOLINT
+        const int* arr = tensor_pd->data<int>();
+        ti = backend_->create_tensor(ngraph::element::i32, sp,
+                                     const_cast<int*>(arr));
+      } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
+        const int64_t* arr = tensor_pd->data<int64_t>();
+        ti = backend_->create_tensor(ngraph::element::i64, sp,
+                                     const_cast<int64_t*>(arr));
+      } else if (tensor_pd->type().hash_code() ==
+                 typeid(double).hash_code()) {  // NOLINT
+        const double* arr = tensor_pd->data<double>();
+        ti = backend_->create_tensor(ngraph::element::f64, sp,
+                                     const_cast<double*>(arr));
+      } else if (tensor_pd->type().hash_code() ==
+                 typeid(bool).hash_code()) {  // NOLINT
+        const bool* arr = tensor_pd->data<bool>();
+        ti = backend_->create_tensor(ngraph::element::boolean, sp,
+                                     const_cast<bool*>(arr));
+      } else {
+        PADDLE_THROW("Data type not handling for var %s", vi);
+      }
+    } else {
+      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+    }
+    bool is_test =
+        (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST);
+    bool is_persistable = persistables_.find(vi) != persistables_.end();
+    if (is_test && is_persistable) {
+      ti->set_stale(false);
+    }
+    t_in.push_back(ti);
+  }
+
+  for (size_t i = 0; i < var_out_.size(); ++i) {
+    auto var_name = var_out_[i];
+    auto* var = scope.FindVar(var_name);
+    std::shared_ptr<ngraph::runtime::Tensor> to;
+    if (var && VarIsTensor(*var)) {
+      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+      auto dd = tensor_pd->dims();
+      ngraph::Shape sp = Ddim2Shape(dd);
+      auto ng_type = var_type_map_.at(var_name);
+      if (ng_type == ngraph::element::f32) {
+        auto pd_arr = tensor_pd->mutable_data<float>(place);
+        to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i64) {
+        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
+        to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::f64) {
+        auto pd_arr = tensor_pd->mutable_data<double>(place);
+        to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::boolean) {
+        auto pd_arr = tensor_pd->mutable_data<bool>(place);
+        to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
+      } else {
+        PADDLE_THROW("Data type not handled in for var %s", var_name);
+      }
+      t_out.push_back(to);
+    } else {
+      PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
+    }
+  }
+
+  backend_->call(ngraph_function_, t_out, t_in);
+}  // NgraphOperator::Run
 }  // namespace framework
 }  // namespace paddle
 #endif
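
One detail worth calling out in the file above: Ddim2Shape promotes any 0-sized Paddle dimension (e.g. one that has not been set yet) to 1, so nGraph never receives a zero-extent shape. A standalone sketch of just that conversion, with std::vector standing in for DDim and ngraph::Shape:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // promote 0-sized dims to 1, as Ddim2Shape does before handing
    // the shape to ngraph
    std::vector<size_t> DimsToShape(const std::vector<int64_t>& dims) {
      std::vector<size_t> shape;
      for (int64_t d : dims) {
        shape.push_back(d == 0 ? 1 : static_cast<size_t>(d));
      }
      return shape;
    }

    int main() {
      for (size_t d : DimsToShape({8, 0, 32})) std::cout << d << " ";  // 8 1 32
    }
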
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
index 0f655cef1d..3ca023e111 100644
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -17,24 +17,19 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH
 
 #include <algorithm>
-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/variant.h"
 
-#include "ngraph/ngraph.hpp"
+#include "ngraph/type/element_type.hpp"
 
 namespace paddle {
 namespace framework {

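This patch memoizes compiled nGraph functions: GetNgFunction derives a key from the fused op sequence plus each input's name, element type, and shape, and calls BuildNgFunction only on a cache miss. A self-contained sketch of that memoization, with an illustrative CompiledFunc standing in for ngraph::Function:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    struct CompiledFunc { std::string desc; };  // stand-in for ngraph::Function

    using FuncCache =
        std::unordered_map<std::string, std::shared_ptr<CompiledFunc>>;

    std::shared_ptr<CompiledFunc> GetOrBuild(FuncCache* cache,
                                             const std::string& key) {
      auto it = cache->find(key);
      if (it != cache->end()) return it->second;  // cache hit: reuse
      auto fn = std::make_shared<CompiledFunc>(CompiledFunc{"built:" + key});
      (*cache)[key] = fn;                         // cache miss: build and store
      return fn;
    }

    int main() {
      FuncCache cache;
      // key mirrors GetCacheKey: op sequence + input names, types, shapes
      std::string key = "2relutanh|x:f32:8x32";
      auto a = GetOrBuild(&cache, key);
      auto b = GetOrBuild(&cache, key);
      std::cout << (a == b) << "\n";  // 1: the second call hits the cache
    }
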
From caf4b937b35aa3cb504ac0d6b76d663945b59cb7 Mon Sep 17 00:00:00 2001
From: baojun-nervana <baojun.liu@intel.com>
Date: Tue, 27 Nov 2018 11:02:10 -0800
Subject: [PATCH 40/90] Added RunInferShape test=develop

---
 paddle/fluid/framework/ngraph_operator.cc | 47 ++++++++---------------
 paddle/fluid/framework/operator.cc        |  8 +++-
 paddle/fluid/framework/operator.h         |  6 +++
 3 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index e9ff051355..8878917da1 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -278,39 +278,22 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
 void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  RuntimeInferShapeContext infer_shape_ctx(*op, scope_);
-  std::shared_ptr<OperatorWithKernel> op_k =
-      std::dynamic_pointer_cast<OperatorWithKernel>(op);
-  op_k->InferShape(&infer_shape_ctx);
-
+  op->RunInferShape(scope_, place_);
   for (auto& var_name_item : op->Inputs()) {
-    std::vector<ngraph::Shape> vshape;
-    auto& var_prm_name = var_name_item.first;
-    auto var_name_size = var_name_item.second.size();
-    if (var_name_size == 1) {
-      auto dim = infer_shape_ctx.GetInputDim(var_prm_name);
-      vshape.push_back(Ddim2Shape(dim));
-    } else if (var_name_item.second.size() > 1) {
-      auto vdim = infer_shape_ctx.GetInputsDim(var_prm_name);
-      PADDLE_ENFORCE_EQ(vdim.size(), var_name_item.second.size(),
-                        "Need dim info for each var");
-      for (auto& dim : vdim) {
-        vshape.push_back(Ddim2Shape(dim));
-      }
-    } else {
-      // 0 size : conv2d Bias
-    }
-
-    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-      auto var_name = var_name_item.second.at(i);
-      if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
-          var_in_.end()) {
-        if (var_node_map_->find(var_name) == var_node_map_->end()) {
-          auto ng_type = var_type_map_.at(var_name);
-          auto prm = std::make_shared<ngraph::op::Parameter>(
-              ng_type, vshape.at(i), true);
-          (*var_node_map_)[var_name] = prm;
-          (*var_in_node_map_)[var_name] = prm;
+    for (auto& var_name : var_name_item.second) {
+      auto* var = scope_.FindVar(var_name);
+      if (var && VarIsTensor(*var)) {
+        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+        auto sp = Ddim2Shape(tensor_pd->dims());
+        if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
+            var_in_.end()) {
+          if (var_node_map_->find(var_name) == var_node_map_->end()) {
+            auto ng_type = var_type_map_.at(var_name);
+            auto prm =
+                std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
+            (*var_node_map_)[var_name] = prm;
+            (*var_in_node_map_)[var_name] = prm;
+          }
         }
       }
     }
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 8bfdf38912..a816aa94c0 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -355,7 +355,7 @@ void OperatorBase::GenerateTemporaryNames() {
   }
 }
 
-static bool VarIsTensor(const Variable& var) {
+bool VarIsTensor(const Variable& var) {
   return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
 }
 
@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
                  "Tensor %s contains NAN", name);
 }
 
+void OperatorWithKernel::RunInferShape(const Scope& scope,
+                                       const platform::Place& place) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 5bd68f9ac2..fcf889f3db 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -64,6 +64,7 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+bool VarIsTensor(const Variable& var);
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 
@@ -128,6 +129,8 @@ class OperatorBase {
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
 
   void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+  virtual void RunInferShape(const Scope& scope,
+                             const platform::Place& place) const {}
 
  protected:
   std::string type_;
@@ -348,6 +351,9 @@ class OperatorWithKernel : public OperatorBase {
     OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
   }
 
+  void RunInferShape(const Scope& scope,
+                     const platform::Place& place) const override;
+
  protected:
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
   virtual OpKernelType GetKernelTypeForVar(

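The patch above uses the no-op-virtual-hook pattern: OperatorBase supplies an empty RunInferShape so the nGraph code can call it on any fused op, and OperatorWithKernel overrides it to run real shape inference against the runtime scope. A compilable sketch of that API shape, with the Scope and Place parameters omitted:

    #include <iostream>

    struct OperatorBase {
      virtual ~OperatorBase() = default;
      virtual void RunInferShape() const {}  // default: nothing to infer
    };

    struct OperatorWithKernel : OperatorBase {
      void RunInferShape() const override {
        std::cout << "InferShape against the runtime scope\n";
      }
    };

    int main() {
      OperatorWithKernel op;
      const OperatorBase& base = op;
      base.RunInferShape();  // dispatches to the kernel override
    }
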
From d6125a5eec073c595f84dfbc987bd0daad6742c6 Mon Sep 17 00:00:00 2001
From: Sang Ik Lee <sang.ik.lee@intel.com>
Date: Mon, 19 Nov 2018 15:44:38 -0800
Subject: [PATCH 41/90] Include ngraph in inference demo build. test=develop

---
 cmake/inference_lib.cmake                       |  9 +++++++++
 .../fluid/inference/api/demo_ci/CMakeLists.txt  | 17 ++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 0b95a78072..c679d8507d 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
             )
 endif ()
 
+if (WITH_NGRAPH)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
+    copy(ngraph_lib
+            SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
+            DSTS ${dst_dir} ${dst_dir}
+            DEPS ngraph
+            )
+endif ()
+
 if (NOT WIN32)
     if (NOT MOBILE_INFERENCE AND NOT RPI)
         set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 8fb464c0f5..d58486c5f0 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -79,6 +79,21 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
 link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
 
+if (NOT WIN32)
+    set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
+    if(EXISTS ${NGRAPH_PATH})
+        include_directories("${NGRAPH_PATH}/include")
+        if(UNIX AND NOT APPLE)
+            include(GNUInstallDirs)
+            link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
+            set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
+        else()
+            link_directories("${NGRAPH_PATH}/lib")
+            set(NGRAPH_LIB ${NGRAPH_PATH}/lib/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
+        endif()
+    endif()
+endif()
+
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
 if(WITH_MKL)
@@ -106,7 +121,7 @@ endif()
 if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB}
+    ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
     glog gflags protobuf snappystream snappy z xxhash
     ${EXTERNAL_LIB})
 else()

From a29696146cb9a403700faa0662e0a7f3c589a79e Mon Sep 17 00:00:00 2001
From: baojun-nervana <baojun.liu@intel.com>
Date: Mon, 19 Nov 2018 16:31:16 -0800
Subject: [PATCH 42/90] Added annotation test=develop

---
 paddle/fluid/framework/ngraph_operator.cc | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 8878917da1..99c4cf0da6 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -57,6 +57,7 @@ typedef enum {                /* nGraph support state on ops          */
                PARTIAL_TEST   /* Support partial list of ops for test */
 } op_state;
 
+// performs the graph build through the bridge and executes the computation
 class NgraphOperator {
  public:
   explicit NgraphOperator(const Scope& scope, const platform::Place& place,
@@ -100,33 +101,33 @@ class NgraphOperator {
   std::unordered_set<std::string> post_op_inputs_;
   op_state ng_op_state_;
 
+  // ngraph backend, e.g. CPU
   static std::shared_ptr<ngraph::runtime::Backend> backend_;
-
+  // ngraph function to call and execute
   std::shared_ptr<ngraph::Function> ngraph_function_;
   // var_name of inputs
   std::vector<std::string> var_in_;
   // var_name of outputs, kept in fetch order
   std::vector<std::string> var_out_;
-
+  // map input vars to nodes
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
       var_in_node_map_;
-
   // map each var name to an ngraph node
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
       var_node_map_;
-
+  // build the cache key used to look up a compiled ngraph function
   std::shared_ptr<std::string> GetCacheKey();
-
+  // get ngraph input shapes and define ngraph input parameters
   void GetNgInputShape(std::shared_ptr<OperatorBase> op);
-
+  // Call ngraph bridge to map ops
   void BuildNgNode();
-
+  // get the ngraph input and output var list
   void BuildNgIO();
-
+  // build ngraph function call
   void BuildNgFunction();
-
+  // Check cache for ngraph function or otherwise build the function
   void GetNgFunction();
 };
 

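The BuildNgIO annotation above compresses the most state-dependent logic in the class: whether an op output becomes an output of the fused nGraph function depends on the run state, per the switch added in the earlier ngraph_operator.cc patch. A standalone sketch of that selection rule, with plain std::set standing in for Paddle's containers:

    #include <iostream>
    #include <set>
    #include <string>

    enum OpState { FULL_TRAIN, PARTIAL_TRAIN, FULL_TEST, PARTIAL_TEST };

    bool KeepAsOutput(const std::string& var, OpState state,
                      const std::set<std::string>& fetches,
                      const std::set<std::string>& post_op_inputs,
                      const std::set<std::string>& persistables) {
      switch (state) {
        case PARTIAL_TEST:
          return post_op_inputs.count(var) || fetches.count(var);
        case FULL_TEST:
          return fetches.count(var) > 0;
        case PARTIAL_TRAIN:
          return fetches.count(var) || post_op_inputs.count(var) ||
                 persistables.count(var);
        case FULL_TRAIN:
          return fetches.count(var) || persistables.count(var);
      }
      return true;  // default state: keep everything
    }

    int main() {
      std::set<std::string> fetches{"loss"}, post{"fc_out"}, persist{"w"};
      std::cout << KeepAsOutput("fc_out", PARTIAL_TEST, fetches, post, persist)
                << KeepAsOutput("fc_out", FULL_TEST, fetches, post, persist)
                << "\n";  // 10: intermediates survive only if later ops need them
    }
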
From 24e70920db7f034c4d6c7f85456dcf3115fbac6d Mon Sep 17 00:00:00 2001
From: Sang Ik Lee <sang.ik.lee@intel.com>
Date: Tue, 20 Nov 2018 15:47:49 -0800
Subject: [PATCH 43/90] Refactor some build settings. test=develop

---
 cmake/external/ngraph.cmake                   | 18 +++++----------
 .../inference/api/demo_ci/CMakeLists.txt      | 11 +++------
 python/setup.py.in                            | 23 +++++++++++--------
 3 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index 2e335579f3..e66459fa3a 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
     return()
 ENDIF()
 
+INCLUDE(GNUInstallDirs)
+
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT         "extern_ngraph")
@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
+SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
+SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
+SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
+SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
 
 ExternalProject_Add(
     ${NGRAPH_PROJECT}
@@ -63,18 +69,6 @@ ExternalProject_Add(
     CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
 )
 
-if(UNIX AND NOT APPLE)
-    include(GNUInstallDirs)
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-else()
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
-endif()
-MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
-
-SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
-SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
-SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
-
 # Workaround for nGraph expecting mklml to be in mkldnn install directory.
 ExternalProject_Add_Step(
     ${NGRAPH_PROJECT}
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index d58486c5f0..ec93729cd2 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -82,15 +82,10 @@ link_directories("${PADDLE_LIB}/paddle/lib")
 if (NOT WIN32)
     set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
     if(EXISTS ${NGRAPH_PATH})
+        include(GNUInstallDirs)
         include_directories("${NGRAPH_PATH}/include")
-        if(UNIX AND NOT APPLE)
-            include(GNUInstallDirs)
-            link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
-            set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
-        else()
-            link_directories("${NGRAPH_PATH}/lib")
-            set(NGRAPH_LIB ${NGRAPH_PATH}/lib/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
-        endif()
+        link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
+        set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
     endif()
 endif()
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 200b96ec54..5aee26b638 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_LIB}', libs_path)
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
     package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
-if '${CMAKE_BUILD_TYPE}' == 'Release':
-    # only change rpath in Release mode.
-    if '${WITH_MKLDNN}' == 'ON':
+if '${WITH_MKLDNN}' == 'ON':
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        # only change rpath in Release mode.
         # TODO(typhoonzero): use install_name_tool to patch mkl libs once
         # we can support mkl on mac.
         #
@@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
         command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
         if os.system(command) != 0:
             raise Exception("patch libmkldnn.so failed, command: %s" % command)
-        package_data['paddle.libs']+=['libmkldnn.so.0']
-        shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+    package_data['paddle.libs']+=['libmkldnn.so.0']
+    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 if '${WITH_NGRAPH}' == 'ON':
+    # only change rpath in Release mode,
+    # since in Debug mode, the nGraph lib may be too large to patch.
     if '${CMAKE_BUILD_TYPE}' == 'Release':
-        # only change rpath in Release mode.
-        command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
-        if os.system(command) != 0:
-            raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
+        if os.name != 'nt':
+            if "@APPLE@" == "1":
+                command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}"
+            else:
+                command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
     shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
     shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
     shutil.copy('${NGRAPH_TBB_LIB}', libs_path)

From e6bd53be60676c8f73e1cc80705f6e599db1985c Mon Sep 17 00:00:00 2001
From: baojun-nervana <baojun.liu@intel.com>
Date: Tue, 27 Nov 2018 21:38:54 -0800
Subject: [PATCH 44/90] Renamed to RuntimeInferShape test=develop

---
 paddle/fluid/framework/ngraph_operator.cc | 2 +-
 paddle/fluid/framework/operator.cc        | 4 ++--
 paddle/fluid/framework/operator.h         | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 99c4cf0da6..61bae1aba4 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -279,7 +279,7 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
 void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  op->RunInferShape(scope_, place_);
+  op->RuntimeInferShape(scope_, place_);
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
       auto* var = scope_.FindVar(var_name);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a816aa94c0..f3d225df69 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -695,8 +695,8 @@ static void CheckTensorNANOrInf(const std::string& name,
                  "Tensor %s contains NAN", name);
 }
 
-void OperatorWithKernel::RunInferShape(const Scope& scope,
-                                       const platform::Place& place) const {
+void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
+                                           const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
 }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index fcf889f3db..efc9a1b6f5 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -129,8 +129,8 @@ class OperatorBase {
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
 
   void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
-  virtual void RunInferShape(const Scope& scope,
-                             const platform::Place& place) const {}
+  virtual void RuntimeInferShape(const Scope& scope,
+                                 const platform::Place& place) const {}
 
  protected:
   std::string type_;
@@ -351,8 +351,8 @@ class OperatorWithKernel : public OperatorBase {
     OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
   }
 
-  void RunInferShape(const Scope& scope,
-                     const platform::Place& place) const override;
+  void RuntimeInferShape(const Scope& scope,
+                         const platform::Place& place) const override;
 
  protected:
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;

From d5ee05e6c376929e416854f8864a672c6cc84958 Mon Sep 17 00:00:00 2001
From: baojun-nervana <baojun.liu@intel.com>
Date: Tue, 27 Nov 2018 23:06:17 -0800
Subject: [PATCH 45/90] Replaced VarIsTensor test=develop

---
 paddle/fluid/framework/ngraph_operator.cc | 10 +++++-----
 paddle/fluid/framework/operator.cc        |  2 +-
 paddle/fluid/framework/operator.h         |  1 -
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 61bae1aba4..1c770a2370 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -283,7 +283,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
       auto* var = scope_.FindVar(var_name);
-      if (var && VarIsTensor(*var)) {
+      if (var && var->IsType<LoDTensor>()) {
         auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
         auto sp = Ddim2Shape(tensor_pd->dims());
         if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
@@ -305,7 +305,7 @@ void NgraphOperator::BuildNgNode() {
   for (auto& var_name : var_out_) {
     if (var_node_map_->find(var_name) == var_node_map_->end()) {
       auto* var = scope_.FindVar(var_name);
-      if (var && VarIsTensor(*var)) {
+      if (var && var->IsType<LoDTensor>()) {
         auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
         auto& ddim = tensor_pd->dims();
         auto ng_shape = Ddim2Shape(ddim);
@@ -433,7 +433,7 @@ std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
 
   for (auto& var_name : var_out_) {
     auto* var = scope_.FindVar(var_name);
-    if (var && VarIsTensor(*var)) {
+    if (var && var->IsType<LoDTensor>()) {
       auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       auto& ddim = tensor_pd->dims();
       for (int i = 0; i < ddim.size(); ++i) {
@@ -469,7 +469,7 @@ void NgraphOperator::Run(const Scope& scope,
     auto sp = var_node_map_->at(vi)->get_shape();
     std::shared_ptr<ngraph::runtime::Tensor> ti;
     auto* var = scope.FindVar(vi);
-    if (var && VarIsTensor(*var)) {
+    if (var && var->IsType<LoDTensor>()) {
       auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
                      "Ensure ngraph tensor layout align with paddle tensor");
@@ -518,7 +518,7 @@ void NgraphOperator::Run(const Scope& scope,
     auto var_name = var_out_[i];
     auto* var = scope.FindVar(var_name);
     std::shared_ptr<ngraph::runtime::Tensor> to;
-    if (var && VarIsTensor(*var)) {
+    if (var && var->IsType<LoDTensor>()) {
       auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
       auto dd = tensor_pd->dims();
       ngraph::Shape sp = Ddim2Shape(dd);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f3d225df69..c6f3254e9f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -355,7 +355,7 @@ void OperatorBase::GenerateTemporaryNames() {
   }
 }
 
-bool VarIsTensor(const Variable& var) {
+static bool VarIsTensor(const Variable& var) {
   return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
 }
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index efc9a1b6f5..0a6a28a5bc 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -64,7 +64,6 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
-bool VarIsTensor(const Variable& var);
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 

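The patch above narrows the check from VarIsTensor, which also accepted SelectedRows, to var->IsType<LoDTensor>(), so only LoDTensor variables reach the nGraph paths. A minimal sketch of the type-erased IsType<T>() idiom itself; this simplified Variable is illustrative, not the real paddle::framework::Variable:

    #include <iostream>
    #include <typeinfo>

    struct LoDTensor {};
    struct SelectedRows {};

    // simplified type-erased holder; the real Variable also owns the payload
    class Variable {
     public:
      template <typename T>
      void Reset() { type_ = &typeid(T); }
      template <typename T>
      bool IsType() const { return type_ != nullptr && *type_ == typeid(T); }

     private:
      const std::type_info* type_ = nullptr;
    };

    int main() {
      Variable var;
      var.Reset<LoDTensor>();
      std::cout << var.IsType<LoDTensor>() << var.IsType<SelectedRows>()
                << "\n";  // 10
    }
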
From fe0dee88d85f2710ba3d585816aefc69868ab72c Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Thu, 29 Nov 2018 10:44:19 +0800
Subject: [PATCH 46/90] Change pip version to correct version when installing
 wheel package

test=develop
---
 paddle/scripts/paddle_build.sh | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index a6720fa798..dbb73f7a27 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -440,11 +440,29 @@ EOF
         ctest --output-on-failure -j $1
         # make install should also be tested when running unittests
         make install -j 8
-        pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        if [ "$1" == "cp27-cp27m" ]; then
+            pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp35-cp35m" ]; then
+            pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp36-cp36m" ]; then
+            pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp37-cp37m" ]; then
+            pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        fi
+
         if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
             paddle version
         fi
-        pip uninstall -y paddlepaddle
+
+        if [ "$1" == "cp27-cp27m" ]; then
+            pip uninstall -y paddlepaddle
+        elif [ "$1" == "cp35-cp35m" ]; then
+            pip3.5 uninstall -y paddlepaddle
+        elif [ "$1" == "cp36-cp36m" ]; then
+            pip3.6 uninstall -y paddlepaddle
+        elif [ "$1" == "cp37-cp37m" ]; then
+            pip3.7 uninstall -y paddlepaddle
+        fi
     fi
 }
 

From 3a3cfc2d8df13b906951cca47e44bad53a716fc2 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 29 Nov 2018 09:29:23 +0800
Subject: [PATCH 47/90] prefetch support gpu test=develop

---
 .../distributed/parameter_prefetch.cc         | 89 +++++++++++++------
 .../unittests/test_lookup_remote_table_op.py  |  4 +-
 2 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 9db5345153..36f4f0eefd 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -59,14 +59,13 @@ static std::vector<int64_t> ToAbsoluteSection(
 }
 
 static std::vector<std::vector<int64_t>> SplitIds(
-    const std::string& id_name, const std::vector<int>& height_section,
-    framework::Scope* scope) {
-  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
+    const std::vector<int64_t>& ids_vector,
+    const std::vector<int>& height_section, framework::Scope* scope) {
   std::set<int64_t> all_ids;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    all_ids.insert(id_data[i]);
+  for (auto id : ids_vector) {
+    all_ids.insert(id);
   }
+
   auto abs_sections = ToAbsoluteSection(height_section);
   std::vector<std::vector<int64_t>> splited_ids;
   splited_ids.resize(height_section.size() + 1);
@@ -78,7 +77,7 @@ static std::vector<std::vector<int64_t>> SplitIds(
 }
 
 static void SplitIdsIntoMultipleVarsBySection(
-    const std::string& id_name, const std::vector<std::string>& in_var_names,
+    const std::vector<std::string>& in_var_names,
     const std::vector<int>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
     framework::Scope* scope) {
@@ -99,8 +98,8 @@ static void SplitIdsIntoMultipleVarsBySection(
 }
 
 static void MergeMultipleVarsIntoOneBySection(
-    const std::string& id_name, const std::string& out_name,
-    const std::vector<std::string>& out_var_names,
+    const std::string& id_name, const std::vector<int64_t>& ids_vector,
+    const std::string& out_name, const std::vector<std::string>& out_var_names,
     const std::vector<int>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
     const framework::ExecutionContext& context, framework::Scope* scope) {
@@ -109,16 +108,20 @@ static void MergeMultipleVarsIntoOneBySection(
   auto cpu_place = platform::CPUPlace();
 
   auto abs_sections = ToAbsoluteSection(height_section);
-  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
-  auto* id_data = id_tensor.data<int64_t>();
   std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
-  for (size_t i = 0; i < id_tensor.numel(); ++i) {
-    id_to_offset[id_data[i]].push_back(i);
+  for (size_t i = 0; i < ids_vector.size(); ++i) {
+    id_to_offset[ids_vector[i]].push_back(i);
   }
 
+  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
   auto* out_tensor =
       scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
-  auto* out_tensor_data = out_tensor->mutable_data<float>(context.GetPlace());
+  auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
+
+  bool is_on_cpu_place = platform::is_cpu_place(id_tensor.place());
 
   for (size_t section_idx = 0; section_idx < out_var_names.size();
        ++section_idx) {
@@ -140,9 +143,20 @@ static void MergeMultipleVarsIntoOneBySection(
         auto& offsets = id_to_offset[origin_id];
         for (auto& offset : offsets) {
           // should support GPU tensor
-          memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
-                       cpu_place, out_var_data + i * row_numel,
-                       sizeof(float) * row_numel);
+          if (is_on_cpu_place) {
+            memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
+                         cpu_place, out_var_data + i * row_numel,
+                         sizeof(float) * row_numel);
+          } else {
+#ifndef PADDLE_WITH_CUDA
+            PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+            memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),
+                         out_tensor_data + offset * row_numel, cpu_place,
+                         out_var_data + i * row_numel,
+                         sizeof(float) * row_numel);
+#endif
+          }
         }
       }
     } else {
@@ -159,7 +173,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
   auto& local_scope = context.scope().NewScope();
 
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(context.GetPlace());
+  auto& cpu_ctx = *pool.Get(platform::CPUPlace());
 
   distributed::RPCClient* rpc_client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>(
@@ -172,9 +186,34 @@ void prefetch(const std::string& id_name, const std::string& out_name,
     out_var_names.push_back(out_name + "@" + epmap[i]);
   }
 
-  auto splited_ids = SplitIds(id_name, height_sections, &local_scope);
-  SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections,
-                                    splited_ids, &local_scope);
+  auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  std::vector<int64_t> ids_vector;
+  if (platform::is_cpu_place(id_tensor.place())) {
+    auto* id_data = id_tensor.data<int64_t>();
+    for (size_t i = 0; i < id_tensor.numel(); ++i) {
+      ids_vector.push_back(id_data[i]);
+    }
+  } else {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+    auto cpu_place = platform::CPUPlace();
+    framework::Tensor cpu_tensor;
+    auto* cpu_tensor_data =
+        cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
+    memory::Copy(cpu_place, cpu_tensor_data,
+                 boost::get<platform::CUDAPlace>(id_tensor.place()),
+                 id_tensor.data<int64_t>(),
+                 sizeof(int64_t) * id_tensor.numel());
+    for (size_t i = 0; i < id_tensor.numel(); ++i) {
+      ids_vector.push_back(cpu_tensor_data[i]);
+    }
+#endif
+  }
+
+  auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope);
+  SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
+                                    &local_scope);
 
   // create output var in local scope
   for (auto& name : out_var_names) {
@@ -187,7 +226,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
       VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
               << " to get " << out_var_names[i] << " back";
       rets.push_back(rpc_client->AsyncPrefetchVar(
-          epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i],
+          epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i],
           table_names[i]));
     } else {
       VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
@@ -198,9 +237,9 @@ void prefetch(const std::string& id_name, const std::string& out_name,
     PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
   }
 
-  MergeMultipleVarsIntoOneBySection(id_name, out_name, out_var_names,
-                                    height_sections, splited_ids, context,
-                                    &local_scope);
+  MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
+                                    out_var_names, height_sections, splited_ids,
+                                    context, &local_scope);
 
   context.scope().DeleteScope(&local_scope);
 }
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
index 01e9eaf3c8..b46e61d2ef 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@@ -184,8 +184,8 @@ class TestListenAndServOp(unittest.TestCase):
         port1 = self._get_pserver_port(p1.pid)
 
         places = [core.CPUPlace()]
-        # if core.is_compiled_with_cuda():
-        #     places.append(core.CUDAPlace(0))
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
 
         for place in places:
             self._run_lookup_table_op_one_pserver(place, port0)

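SplitIds above routes each lookup id to the parameter shard whose absolute row range contains it: height_sections holds per-shard row counts, and ToAbsoluteSection turns them into boundaries. A simplified, self-contained sketch of that routing (the real code also de-duplicates ids through a std::set and keeps one extra bucket):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    std::vector<std::vector<int64_t>> SplitIds(
        const std::vector<int64_t>& ids, const std::vector<int>& sections) {
      std::vector<int64_t> bounds;  // exclusive upper bound of each shard
      int64_t sum = 0;
      for (int s : sections) bounds.push_back(sum += s);
      std::vector<std::vector<int64_t>> shards(sections.size());
      for (int64_t id : ids) {
        size_t shard = std::upper_bound(bounds.begin(), bounds.end(), id) -
                       bounds.begin();
        if (shard < shards.size()) shards[shard].push_back(id);
      }
      return shards;
    }

    int main() {
      auto shards = SplitIds({0, 5, 12, 13}, {10, 10});  // rows [0,10), [10,20)
      std::cout << shards[0].size() << " " << shards[1].size() << "\n";  // 2 2
    }
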
From 3c83a2f72003be15d16faaf02e1fcc708d0e8075 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Thu, 29 Nov 2018 06:12:47 +0000
Subject: [PATCH 48/90] fix comments

---
 .../inference/analysis/passes/CMakeLists.txt  |  2 +-
 .../ir_params_sync_among_devices_pass.cc      | 40 +++++++------------
 .../ir_params_sync_among_devices_pass.h       |  5 +--
 3 files changed, 16 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index 98334760a6..d3ea511d8f 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager analysis_helper)
+cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
 
 set(analysis_deps ${analysis_deps}
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index e42f135052..8be2d3ac0b 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -19,16 +19,6 @@
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
-namespace {
-bool IsPersistable(const framework::VarDesc *var) {
-  if (var->Persistable() &&
-      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST) {
-    return true;
-  }
-  return false;
-}
-}  // namespace
 namespace inference {
 namespace analysis {
 
@@ -47,32 +37,30 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   place = platform::CUDAPlace(argument->gpu_device_id());
 
   auto *scope = argument->scope_ptr();
-  // Get the program which has been processed by several passes.
-  analysis_program_.reset(
-      new framework::ProgramDesc(argument->ir_analyzed_program()));
-
-  const auto &global_block = analysis_program_->Block(0);
+  std::vector<std::string> all_vars = scope->LocalVarNames();
 
-  // sync the params from cpu to gpu.
-  for (auto &var : global_block.AllVars()) {
-    if (IsPersistable(var)) {
-      std::string var_name = var->Name();
-      LOG(INFO) << var_name;
-      auto &t = inference::analysis::GetFromScope<framework::LoDTensor>(
-          *scope, var_name);
+  // We get all the vars from local_scope instead of the ProgramDesc,
+  // because in some cases new parameter variables are not added to
+  // the program during the analysis passes.
+  for (auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE(var != nullptr);
+    if (var->IsType<framework::LoDTensor>() ||
+        var->IsType<framework::Tensor>()) {
+      auto *t = var->GetMutable<framework::LoDTensor>();
 
       platform::CPUPlace cpu_place;
       framework::LoDTensor temp_tensor;
-      temp_tensor.Resize(t.dims());
+      temp_tensor.Resize(t->dims());
       temp_tensor.mutable_data<float>(cpu_place);
 
       // Copy the parameter data to a tmp tensor.
-      TensorCopySync(t, cpu_place, &temp_tensor);
+      TensorCopySync(*t, cpu_place, &temp_tensor);
+      // Reallocate the space on GPU
-      t.mutable_data<float>(place);
+      t->mutable_data<float>(place);
 
       // Copy parameter data to newly allocated GPU space.
-      TensorCopySync(temp_tensor, place, &t);
+      TensorCopySync(temp_tensor, place, t);
     }
   }
 }
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
index 6818887b96..a95f460df6 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -15,10 +15,10 @@
 #pragma once
 
 #include <string>
+#include <vector>
 
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -32,9 +32,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
  public:
   void RunImpl(Argument *argument) override;
   std::string repr() const override;
-
- private:
-  std::unique_ptr<framework::ProgramDesc> analysis_program_;
 };
 
 }  // namespace analysis

From 731d45a39ab1a076519cedb0167c04801d1e7c84 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Thu, 29 Nov 2018 15:18:56 +0800
Subject: [PATCH 49/90] Enable BatchNorm to use global mean and variance during
 training (#14630)

* Enable BatchNorm to use global mean and variance during training
* Update doc and follow comments.
---
 paddle/fluid/API.spec                         |   2 +-
 .../fluid/operators/batch_norm_mkldnn_op.cc   |  17 +-
 paddle/fluid/operators/batch_norm_op.cc       | 176 +++++++++----
 .../{batch_norm_op.cu.cc => batch_norm_op.cu} | 234 +++++++++++++-----
 python/paddle/fluid/layers/nn.py              |  30 ++-
 .../tests/unittests/test_batch_norm_op.py     | 133 ++++++++--
 .../fluid/tests/unittests/test_layers.py      |   9 +
 7 files changed, 456 insertions(+), 145 deletions(-)
 rename paddle/fluid/operators/{batch_norm_op.cu.cc => batch_norm_op.cu} (57%)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c40f603341..6b5ed10244 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -69,7 +69,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
-paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
+paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index de641cb08e..29d950967f 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const float epsilon = ctx.Attr<float>("epsilon");
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
     const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+    bool global_stats = is_test || use_global_stats;
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto *mean = ctx.Input<Tensor>("Mean");
@@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     T *batch_mean_data = nullptr;
     T *batch_variance_data = nullptr;
 
-    if (!is_test) {
+    if (!global_stats) {
       batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
       batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
     }
 
-    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
-                                       : mkldnn::prop_kind::forward_training;
+    auto propagation = global_stats ? mkldnn::prop_kind::forward_scoring
+                                    : mkldnn::prop_kind::forward_training;
 
     auto src_tz = paddle::framework::vectorize2int(x->dims());
     auto scale_tz = paddle::framework::vectorize2int(scale->dims());
@@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                     shift->data<T>() + ic, &scaleshift_data);
 
     unsigned flags = mkldnn::use_scale_shift;
-    if (is_test) flags |= mkldnn::use_global_stats;
+    if (global_stats) flags |= mkldnn::use_global_stats;
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
@@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     // keys for backward pass
     const std::string key = BatchNormMKLDNNHandler::GetHash(
-        src_tz, epsilon, flags, is_test, input_format,
+        src_tz, epsilon, flags, global_stats, input_format,
         ctx.op().Output("SavedMean"));
     const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
 
@@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
 
     std::shared_ptr<batch_norm_fwd> batch_norm_p;
-    if (is_test) {
+    if (global_stats) {
       // create mkldnn memory for stats (as input)
       std::shared_ptr<memory> mean_memory =
           handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
@@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*batch_norm_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
-    if (!is_test) {
+    if (!global_stats) {
       // mkldnn only compute stats for current batch
       // so we need compute momentum stats via Eigen lib
       EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 2463c939bc..f66813989c 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("fuse_with_relu",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
+    AddAttr<bool>("use_global_stats",
+                  "(bool, default false) Whether to use global mean and "
+                  "variance. In inference or test mode, setting "
+                  "use_global_stats to true is equivalent to setting "
+                  "is_test to true. In training mode, when use_global_stats "
+                  "is true, the global mean and variance are used instead "
+                  "of the mini-batch statistics, so BN acts as pure scaling "
+                  "and shifting.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Batch Normalization.
 
@@ -190,6 +198,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     const float epsilon = ctx.Attr<float>("epsilon");
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+
+    bool global_stats = is_test || use_global_stats;
+
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
@@ -217,7 +229,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     saved_mean->mutable_data<T>(ctx.GetPlace());
     saved_variance->mutable_data<T>(ctx.GetPlace());
 
-    if (!is_test) {
+    if (!global_stats) {
       // saved_xx is use just in this batch of data
       EigenVectorArrayMap<T> saved_mean_e(
           saved_mean->mutable_data<T>(ctx.GetPlace()), C);
@@ -234,7 +246,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
       if ((N * sample_size) == 1) {
         LOG(WARNING) << "Only 1 element in normalization dimension, "
                      << "we skip the batch norm calculation, let y = x.";
-        framework::TensorCopySync(*x, ctx.GetPlace(), y);
+        framework::TensorCopy(*x, ctx.GetPlace(), y);
         return;
       }
 
@@ -277,7 +289,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
 
     // use SavedMean and SavedVariance to do normalize
     Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
-    if (is_test) {
+    if (global_stats) {
       ConstEigenVectorArrayMap<T> var_arr(
           ctx.Input<Tensor>("Variance")->data<T>(), C);
       inv_std = (var_arr + epsilon).sqrt().inverse();
@@ -289,8 +301,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
       inv_std = saved_inv_std;
     }
     ConstEigenVectorArrayMap<T> mean_arr(
-        is_test ? ctx.Input<Tensor>("Mean")->data<T>()
-                : ctx.Output<Tensor>("SavedMean")->data<T>(),
+        global_stats ? ctx.Input<Tensor>("Mean")->data<T>()
+                     : ctx.Output<Tensor>("SavedMean")->data<T>(),
         C);
 
     //   ((x - est_mean) * (inv_var) * scale + bias
@@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     // check input
     PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
-    PADDLE_ENFORCE(ctx->HasInput("SavedMean"), "");
-    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(Scale) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
+                   "Input(SavedMean) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
+                   "Input(SavedVariance) should not be null.");
 
     // check output
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), "");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                     "Output(Scale@GRAD) and Output(Bias@GRAD) must be "
+                     "either both set or both null.");
+    }
+    const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
+    if (use_global_stats) {
+      PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
+                     "Using global stats during training is not supported "
+                     "in the MKLDNN gradient kernel of batch_norm yet.");
+    }
 
     const auto x_dims = ctx->GetInputDim("X");
     const DataLayout data_layout = framework::StringToDataLayout(
@@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
                                           : x_dims[x_dims.size() - 1]);
 
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
-    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+      ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+    }
   }
 
  protected:
@@ -405,6 +431,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
     // SavedVariance have been reverted in forward operator
     const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const float epsilon = ctx.Attr<float>("epsilon");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
 
@@ -419,38 +447,60 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
                                           : x_dims[x_dims.size() - 1]);
     const int sample_size = x->numel() / N / C;
 
-    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
-    ConstEigenVectorArrayMap<T> mean_arr(saved_mean->data<T>(), C);
-    ConstEigenVectorArrayMap<T> inv_var_arr(saved_inv_variance->data<T>(), C);
-
     // init output
     auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
+
+    const T *mean_data = saved_mean->data<T>();
+    const T *inv_var_data = saved_inv_variance->data<T>();
+    Tensor inv_var_tensor;
+    if (use_global_stats) {
+      const auto *running_mean = ctx.Input<Tensor>("Mean");
+      const auto *running_variance = ctx.Input<Tensor>("Variance");
+      mean_data = running_mean->data<T>();
+      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
+      EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
+      ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
+
+      inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval();
+      inv_var_data = running_inv_var_data;
+    }
+
+    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
+    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
+
+    T *d_bias_data = nullptr;
+    T *d_scale_data = nullptr;
+    if (d_scale && d_bias) {
+      d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
+      d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
+    }
 
     // d_bias = np.sum(d_y, axis=0)
     // d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
     // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
     //   - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
+    EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
+    EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
 
-    EigenVectorArrayMap<T> d_bias_arr(d_bias->mutable_data<T>(ctx.GetPlace()),
-                                      C);
-    EigenVectorArrayMap<T> d_scale_arr(d_scale->mutable_data<T>(ctx.GetPlace()),
-                                       C);
-
-    d_bias_arr.setZero();
-    d_scale_arr.setZero();
+    if (d_scale && d_bias) {
+      d_bias_arr.setZero();
+      d_scale_arr.setZero();
+    }
 
-    if ((N * sample_size) == 1) {
-      framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
+    if ((N * sample_size) == 1 && !use_global_stats) {
+      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
       return;
     }
 
-    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
+    int scale_coeff = use_global_stats ? 1 : N * sample_size;
+    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coeff;
 
     switch (data_layout) {
       case DataLayout::kNCHW: {
@@ -460,19 +510,29 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
                                  sample_size, N * C);
         d_x_arr.setZero();
 
-        for (int nc = 0; nc < N * C; ++nc) {
-          int c = nc % C;
-          d_bias_arr(c) += d_y_arr.col(nc).sum();
-          d_scale_arr(c) +=
-              ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
-                  .sum();
+        if (d_scale && d_bias) {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_bias_arr(c) += d_y_arr.col(nc).sum();
+            d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) *
+                               d_y_arr.col(nc))
+                                  .sum();
+          }
         }
-        for (int nc = 0; nc < N * C; ++nc) {
-          int c = nc % C;
-          d_x_arr.col(nc) +=
-              scale_inv_var_nhw(c) *
-              (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
-               (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c));
+        if (!use_global_stats) {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_x_arr.col(nc) +=
+                scale_inv_var_nhw(c) *
+                (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
+                 (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) *
+                     inv_var_arr(c));
+          }
+        } else {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc);
+          }
         }
         break;
       }
@@ -488,15 +548,27 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
         const auto d_y_mul_x_minus_mean_row_sum =
             (d_y_arr * x_minus_mean).rowwise().sum();
         const auto inv_var_sqr = inv_var_arr * inv_var_arr;
-        for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-          d_bias_arr += d_y_arr.col(nhw);
-          d_scale_arr +=
-              (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
-          d_x_arr.col(nhw) +=
-              scale_inv_var_nhw *
-              (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
-               x_minus_mean.col(nhw) * inv_var_sqr *
-                   d_y_mul_x_minus_mean_row_sum);
+
+        if (d_scale && d_bias) {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_bias_arr += d_y_arr.col(nhw);
+            d_scale_arr +=
+                (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
+          }
+        }
+
+        if (!use_global_stats) {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_x_arr.col(nhw) +=
+                scale_inv_var_nhw *
+                (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
+                 x_minus_mean.col(nhw) * inv_var_sqr *
+                     d_y_mul_x_minus_mean_row_sum);
+          }
+        } else {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw);
+          }
         }
         break;
       }
@@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("SavedMean", Output("SavedMean"));
     op->SetInput("SavedVariance", Output("SavedVariance"));
 
+    // used when setting use_global_stats True during training
+    op->SetInput("Mean", Output("MeanOut"));
+    op->SetInput("Variance", Output("VarianceOut"));
+
     op->SetAttrMap(Attrs());
 
     op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu
similarity index 57%
rename from paddle/fluid/operators/batch_norm_op.cu.cc
rename to paddle/fluid/operators/batch_norm_op.cu
index aaed335c90..1c45746a92 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/batch_norm_op.h"
+#include <algorithm>
 #include <cfloat>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
 #include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
@@ -59,6 +63,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
@@ -121,7 +126,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     auto handle = dev_ctx.cudnn_handle();
 
     // Now, depending on whether we are running test or not, we have two paths.
-    if (is_test) {
+    if (is_test || use_global_stats) {
       // only when test we use input to do computation.
       const auto *est_mean = ctx.Input<Tensor>("Mean");
       const auto *est_var = ctx.Input<Tensor>("Variance");
@@ -163,7 +168,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       if ((N * H * W * D) == 1) {
         LOG(WARNING) << "Only 1 element in normalization dimension, "
                      << "we skip the batch norm calculation, let y = x.";
-        framework::TensorCopySync(*x, ctx.GetPlace(), y);
+        framework::TensorCopy(*x, ctx.GetPlace(), y);
       } else {
         double this_factor = 1. - momentum;
 
@@ -191,6 +196,58 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
   }
 };
 
+template <typename T, framework::DataLayout layout>
+static __global__ void KeBNBackwardData(const T *dy,
+                                        const BatchNormParamType<T> *scale,
+                                        const BatchNormParamType<T> *variance,
+                                        const double epsilon, const int C,
+                                        const int HxW, const int num, T *dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
+    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
+                           scale[c] * inv_var);
+  }
+}
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+static __global__ void KeBNBackwardScaleBias(
+    const T *dy, const T *x, const BatchNormParamType<T> *mean,
+    const BatchNormParamType<T> *variance, const double epsilon, const int N,
+    const int C, const int HxW, BatchNormParamType<T> *dscale,
+    BatchNormParamType<T> *dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
+
+    BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
+    BatchNormParamType<T> mean_i = mean[i];
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == framework::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
+                (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
+      db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[i] = ds_sum * inv_var_i;
+      dbias[i] = db_sum;
+    }
+    __syncthreads();
+  }
+}
+
 template <typename T>
 class BatchNormGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -200,6 +257,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                    "It must use CUDAPlace.");
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
     const auto *x = ctx.Input<Tensor>("X");
@@ -219,42 +278,13 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    if ((N * H * W * D) == 1) {
-      framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
-      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-          functor;
-      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
-      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
-      return;
+    if (d_scale && d_bias) {
+      d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
     }
-
     PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
     PADDLE_ENFORCE_EQ(scale->dims()[0], C);
 
-    // ------------------- cudnn descriptors ---------------------
-    cudnnTensorDescriptor_t data_desc_;
-    cudnnTensorDescriptor_t bn_param_desc_;
-    cudnnBatchNormMode_t mode_;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
-    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-      LOG(ERROR) << "Provided epsilon is smaller than "
-                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                 << "CUDNN_BN_MIN_EPSILON instead.";
-    }
-    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
-    mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-#else
-    mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
-
     std::vector<int> dims;
     std::vector<int> strides;
     if (data_layout == DataLayout::kNCHW) {
@@ -264,34 +294,114 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
       dims = {N, C, H, W, D};
       strides = {H * W * C * D, 1, W * D * C, D * C, C};
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        data_desc_, CudnnDataType<T>::type,
-        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
-
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const void *saved_mean_data =
-        saved_mean->template data<BatchNormParamType<T>>();
-    const void *saved_var_data =
-        saved_var->template data<BatchNormParamType<T>>();
-
-    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-        dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
-        data_desc_, d_y->template data<T>(), data_desc_,
-        d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-        scale->template data<BatchNormParamType<T>>(),
-        d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
-        d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
-        epsilon, saved_mean_data, saved_var_data));
 
-    // clean when exit.
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    if (!use_global_stats) {
+      if ((N * H * W * D) == 1) {
+        framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
+        math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+            functor;
+        functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
+        functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
+        return;
+      }
+
+      // ------------------- cudnn descriptors ---------------------
+      cudnnTensorDescriptor_t data_desc_;
+      cudnnTensorDescriptor_t bn_param_desc_;
+      cudnnBatchNormMode_t mode_;
+
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+      if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+        LOG(ERROR) << "Provided epsilon is smaller than "
+                   << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                   << "CUDNN_BN_MIN_EPSILON instead.";
+      }
+      epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+#if CUDNN_VERSION_MIN(7, 0, 0)
+      mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+#else
+      mode_ = CUDNN_BATCHNORM_SPATIAL;
+#endif
+
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          data_desc_, CudnnDataType<T>::type,
+          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+      CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
+          bn_param_desc_, data_desc_, mode_));
+
+      const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+      const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
+      const void *saved_mean_data =
+          saved_mean->template data<BatchNormParamType<T>>();
+      const void *saved_var_data =
+          saved_var->template data<BatchNormParamType<T>>();
+
+      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
+          dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+          data_desc_, d_y->template data<T>(), data_desc_,
+          d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
+          scale->template data<BatchNormParamType<T>>(),
+          d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+          d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+          epsilon, saved_mean_data, saved_var_data));
+
+      // clean when exit.
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+    } else {
+      const auto *running_mean = ctx.Input<Tensor>("Mean");
+      const auto *running_var = ctx.Input<Tensor>("Variance");
+
+      const auto *running_mean_data =
+          running_mean->template data<BatchNormParamType<T>>();
+      const auto *running_var_data =
+          running_var->template data<BatchNormParamType<T>>();
+
+      const int num = x->numel();
+      const int block = 512;
+      int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+      const int max_blocks = std::max(max_threads / block, 1);
+      int grid1 = (num + block - 1) / block;
+      int grid2 = std::min(C, max_blocks);
+
+      if (data_layout == framework::DataLayout::kNCHW) {
+        if (d_x) {
+          KeBNBackwardData<T, framework::DataLayout::kNCHW><<<
+              grid1, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
+              running_var_data, epsilon, C, H * W, num, d_x->data<T>());
+        }
+        if (d_scale && d_bias) {
+          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
+              grid2, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
+              epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
+              d_bias->data<BatchNormParamType<T>>());
+        }
+      } else {
+        if (d_x) {
+          KeBNBackwardData<T, framework::DataLayout::kNHWC><<<
+              grid1, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
+              running_var_data, epsilon, C, H * W, num, d_x->data<T>());
+        }
+        if (d_scale && d_bias) {
+          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNHWC><<<
+              grid2, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
+              epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
+              d_bias->data<BatchNormParamType<T>>());
+        }
+      }
+    }
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4df74edfce..b1fc6b808c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2300,7 +2300,8 @@ def batch_norm(input,
                moving_mean_name=None,
                moving_variance_name=None,
                do_model_average_for_mean_and_var=False,
-               fuse_with_relu=False):
+               fuse_with_relu=False,
+               use_global_stats=False):
     """
     **Batch Normalization Layer**
 
@@ -2327,6 +2328,19 @@ def batch_norm(input,
         \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
+
+    When use_global_stats = True, :math:`\\mu_{\\beta}` and
+    :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch
+    but the global (or running) statistics, usually obtained from a
+    pre-trained model. Training and testing (or inference) then share the
+    same behavior:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta
+
     Args:
         input(variable): The input variable which is a LoDTensor.
         act(string, Default None): Activation type, linear|relu|prelu|...
@@ -2349,6 +2363,11 @@ def batch_norm(input,
         moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
         do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
         fuse_with_relu (bool): if True, this OP performs relu after batch norm.
+        use_global_stats(bool, Default False): Whether to use global mean and
+            variance. In inference or test mode, setting use_global_stats to
+            True is equivalent to setting is_test to True. In training mode,
+            when use_global_stats is True, the global mean and variance are
+            used instead of the mini-batch statistics.
 
     Returns:
         Variable: A tensor variable which is the result after applying batch normalization on the input.
@@ -2381,9 +2400,15 @@ def batch_norm(input,
         shape=param_shape,
         dtype=dtype,
         default_initializer=Constant(1.0))
+    # setting stop_gradient=True to reduce computation
+    if use_global_stats and helper.param_attr.learning_rate == 0.:
+        scale.stop_gradient = True
 
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+    # setting stop_gradient=True to reduce computation
+    if use_global_stats and helper.bias_attr.learning_rate == 0.:
+        bias.stop_gradient = True
 
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -2439,7 +2464,8 @@ def batch_norm(input,
             "epsilon": epsilon,
             "is_test": is_test,
             "use_mkldnn": False,
-            "fuse_with_relu": fuse_with_relu
+            "fuse_with_relu": fuse_with_relu,
+            "use_global_stats": use_global_stats
         })
 
     return helper.append_activation(batch_norm_out)
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 80261eff4e..2869a6ba53 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
     return y
 
 
+def _cal_mean_variance(x, epsilon, data_format):
+    assert data_format in ['NCHW', 'NHWC']
+    x_square = x * x
+    axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2)
+    C = x.shape[1] if data_format == 'NCHW' else x.shape[-1]
+    x_square_sum = np.sum(x_square, axis)
+    x_sum = np.sum(x, axis=axis)
+    element_count = np.size(x) / C
+    mean = x_sum / element_count
+    var = x_square_sum / element_count - mean * mean
+    return mean, var
+
+
 def _reference_training(x, scale, offset, epsilon, data_format):
     x_shape = x.shape
 
@@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase):
         self.use_mkldnn = False
         self.fuse_with_relu = False
         self.data_formats = ["NCHW", "NHWC"]
+        self.momentum = 0.9
+        self.epsilon = 0.00001
         self.init_kernel_type()
+        self.init_test_case()
+
+    def init_test_case(self):
+        self.use_global_stats = False
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD',
+            'scale@GRAD', 'bias@GRAD'
+        ]
 
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         np.allclose(np.array(tensor), np_array, atol=atol)
@@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase):
 
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
+    def set_mean_variance(self, scale_shape, x, data_layout):
+        mean = np.zeros(scale_shape).astype(np.float32)
+        variance = np.ones(scale_shape).astype(np.float32)
+        # computing global mean/variance for one step
+        if self.use_global_stats:
+            mom = self.momentum
+            x_mean, x_var = _cal_mean_variance(x, self.epsilon, data_layout)
+            mean = x_mean * (1. - mom) + mom * mean
+            variance = x_var * (1. - mom) + mom * variance
+        return mean, variance
+
     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
             # attr
-            epsilon = 0.00001
-            momentum = 0.9
+            epsilon = self.epsilon
+            momentum = self.momentum
             if data_layout == "NCHW":
                 n, c, h, w = shape[0], shape[1], shape[2], shape[3]
             else:
@@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
             x = np.random.random_sample(shape).astype(np.float32)
             scale = np.random.random_sample(scale_shape).astype(np.float32)
             bias = np.random.random_sample(scale_shape).astype(np.float32)
-            mean = np.zeros(scale_shape).astype(np.float32)
-            variance = np.ones(scale_shape).astype(np.float32)
-
+            mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
             y_grad = np.random.random_sample(shape).astype(np.float32)
 
             y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
@@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase):
 
             var_dict = locals()
             var_dict['y@GRAD'] = y_grad
+            var_dict['x@GRAD'] = x_grad
+            var_dict['scale@GRAD'] = scale_grad
+            var_dict['bias@GRAD'] = bias_grad
 
             var_names = [
                 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
@@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
                     },
                     outputs={
                         "Y": block.var('y'),
-                        "MeanOut": block.var('mean'),  # share the same memory
-                        "VarianceOut":
-                        block.var('variance'),  # share the same memory
+                        "MeanOut": block.var('mean'),  # share memory
+                        "VarianceOut": block.var('variance'),  # share memory
                         "SavedMean": block.var('saved_mean'),
                         "SavedVariance": block.var('saved_variance')
                     },
@@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase):
                         "is_test": False,
                         "data_layout": data_layout,
                         "use_mkldnn": self.use_mkldnn,
-                        "fuse_with_relu": self.fuse_with_relu
+                        "fuse_with_relu": self.fuse_with_relu,
+                        "use_global_stats": self.use_global_stats
                     })
                 block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
 
                 # generate backward op_desc
                 grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    bn_op.desc, set(), [])
+                    bn_op.desc, self.no_grad_set, [])
                 grad_op_desc = grad_op_desc_list[0]
                 new_op_desc = block.desc.append_op()
                 new_op_desc.copy_from(grad_op_desc)
@@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase):
                         for name in
                         ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
                     },
-                    fetch_list=[
-                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
-                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
-                    ])
-
-            self.__assert_close(y, out[0], "y")
-            self.__assert_close(mean_out, out[1], "mean")
-            self.__assert_close(variance_out, out[2], "variance", 1e-3)
-            self.__assert_close(saved_mean, out[3], "saved_mean")
-            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
-            self.__assert_close(x_grad, out[5], "x_grad")
-            self.__assert_close(scale_grad, out[6], "scale_grad")
-            self.__assert_close(bias_grad, out[7], "bias_grad")
+                    fetch_list=self.fetch_list)
 
+            for idx, name in enumerate(self.fetch_list):
+                self.__assert_close(var_dict[name], out[idx], name)
             print("op test forward passed: ", str(place), data_layout)
 
         places = [core.CPUPlace()]
@@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase):
         pass
 
 
+class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
+    def init_test_case(self):
+        self.use_global_stats = True
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+        ]
+
+    def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format):
+        if data_format == "NCHW":
+            x = np.transpose(x, (0, 2, 3, 1))
+            y_grad = np.transpose(y_grad, (0, 2, 3, 1))
+
+        x_grad = scale * y_grad / np.sqrt(var + epsilon)
+        grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
+                            axis=(0, 1, 2))
+        grad_offset = np.sum(y_grad, axis=(0, 1, 2))
+
+        # transfer back to N, C, H, W
+        if data_format == "NCHW":
+            x_grad = np.transpose(x_grad, (0, 3, 1, 2))
+            x = np.transpose(x, (0, 3, 1, 2))
+            y_grad = np.transpose(y_grad, (0, 3, 1, 2))
+
+        return x_grad, grad_scale, grad_offset
+
+    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
+                             epsilon, momentum, shape, data_layout):
+        if data_layout != "NCHW" and data_layout != "NHWC":
+            raise ValueError("Unknown data order.")
+
+        if data_layout == "NCHW":
+            x = np.transpose(x, (0, 2, 3, 1))
+
+        # run normalization
+        normalized = (x - mean) / np.sqrt(variance + epsilon)
+        y = normalized * scale + bias
+
+        # transfer back to N, C, H, W
+        if data_layout == "NCHW":
+            x = np.transpose(x, (0, 3, 1, 2))
+            y = np.transpose(y, (0, 3, 1, 2))
+
+        mean_out = mean
+        variance_out = variance
+        saved_variance = 1. / np.sqrt(variance + epsilon)
+        # run backward
+        x_grad, scale_grad, bias_grad = self.reference_grad(
+            x, y_grad, scale, mean, variance, epsilon, data_layout)
+
+        return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad
+
+
+class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
+        TestBatchNormOpFreezeStatsTraining):
+    def init_test_case(self):
+        self.use_global_stats = True
+        self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
+        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 5411607711..2004c91793 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -955,6 +955,15 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_batch_norm(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(
+                name='data', shape=[32, 128, 128], dtype="float32")
+            out = layers.batch_norm(data)
+
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()

From 3e45a5a5ec1a3bcc6d94d279ba7861fed24f2a53 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 29 Nov 2018 16:22:00 +0800
Subject: [PATCH 50/90] lookup_table gpu kernel support prefetch test=develop

---
 .../operators/distributed/CMakeLists.txt      |  4 +-
 .../distributed/parameter_prefetch.cc         | 18 ++++--
 paddle/fluid/operators/lookup_table_op.cu     | 63 ++++++++++++-------
 3 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 0858ec6a22..36979de68f 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -23,7 +23,7 @@ if(WITH_GRPC)
   cc_test(rpc_server_test SRCS rpc_server_test.cc
     DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
   cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
 else()
   set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
       brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -33,7 +33,7 @@ else()
     PROTO send_recv.proto
     DEPS lod_tensor selected_rows memory)
 
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory)
 
   set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
 
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index 36f4f0eefd..cf14538b1c 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -102,7 +102,8 @@ static void MergeMultipleVarsIntoOneBySection(
     const std::string& out_name, const std::vector<std::string>& out_var_names,
     const std::vector<int>& height_section,
     const std::vector<std::vector<int64_t>>& splited_ids,
-    const framework::ExecutionContext& context, framework::Scope* scope) {
+    const framework::ExecutionContext& context, framework::Scope* scope,
+    platform::DeviceContext* actual_ctx) {
   PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
 
   auto cpu_place = platform::CPUPlace();
@@ -151,10 +152,12 @@ static void MergeMultipleVarsIntoOneBySection(
 #ifndef PADDLE_WITH_CUDA
             PADDLE_THROW("paddle is not compiled with CUDA!");
 #else
+            auto stream =
+                static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();
             memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),
                          out_tensor_data + offset * row_numel, cpu_place,
                          out_var_data + i * row_numel,
-                         sizeof(float) * row_numel);
+                         sizeof(float) * row_numel, stream);
 #endif
           }
         }
@@ -174,6 +177,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
 
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& cpu_ctx = *pool.Get(platform::CPUPlace());
+  auto& actual_ctx = *pool.Get(context.GetPlace());
 
   distributed::RPCClient* rpc_client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>(
@@ -201,11 +205,13 @@ void prefetch(const std::string& id_name, const std::string& out_name,
     framework::Tensor cpu_tensor;
     auto* cpu_tensor_data =
         cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
+    auto stream =
+        static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
     memory::Copy(cpu_place, cpu_tensor_data,
                  boost::get<platform::CUDAPlace>(id_tensor.place()),
-                 id_tensor.data<int64_t>(),
-                 sizeof(int64_t) * id_tensor.numel());
-    for (size_t i = 0; i < id_tensor.numel(); ++i) {
+                 id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
+                 stream);
+    for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
       ids_vector.push_back(cpu_tensor_data[i]);
     }
 #endif
@@ -239,7 +245,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
 
   MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
                                     out_var_names, height_sections, splited_ids,
-                                    context, &local_scope);
+                                    context, &local_scope, &actual_ctx);
 
   context.scope().DeleteScope(&local_scope);
 }
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index abd5dce8f7..36156a1f61 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -78,27 +78,47 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
     auto *output_t = context.Output<LoDTensor>("Out");
     int64_t padding_idx = context.Attr<int64_t>("padding_idx");
 
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t->numel();
-
-    auto *ids = ids_t->data<int64_t>();
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-
-    if (padding_idx == -1)
-      LookupTable<
-          T, 128, 8, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-    else
-      LookupTable<
-          T, 128, 8, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+    auto id_name = context.Inputs("Ids").front();
+    auto out_name = context.Outputs("Out").front();
+
+    // for remote prefetch
+    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto height_sections = context.Attr<std::vector<int>>("height_sections");
+    auto table_names = context.Attr<std::vector<std::string>>("table_names");
+
+    if (!epmap.empty()) {
+// if epmap is not empty, the parameter will be fetched from the remote
+// parameter server
+#ifdef PADDLE_WITH_DISTRIBUTE
+      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+                                       height_sections, context);
+#else
+      PADDLE_THROW(
+          "paddle is not compiled with distribute support, can not do "
+          "parameter prefetch!");
+#endif
+    } else {
+      size_t N = table_t->dims()[0];
+      size_t D = table_t->dims()[1];
+      size_t K = ids_t->numel();
+
+      auto *ids = ids_t->data<int64_t>();
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
+
+      if (padding_idx == -1)
+        LookupTable<T, 128, 8, 8, false><<<
+            grids, threads, 0, context.cuda_device_context().stream()>>>(
+            output, table, ids, N, K, D, padding_idx);
+      else
+        LookupTable<T, 128, 8, 8, true><<<
+            grids, threads, 0, context.cuda_device_context().stream()>>>(
+            output, table, ids, N, K, D, padding_idx);
+    }
   }
 };
 
@@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     bool is_sparse = context.Attr<bool>("is_sparse");
+
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {

From 9450048acb141151895a4405c7d5d118165677de Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 29 Nov 2018 16:48:43 +0800
Subject: [PATCH 51/90] add PADDLE_ENABLE_REMOTE_PREFETCH to enable remote
 prefetch test=develop

---
 paddle/fluid/API.spec                                       | 2 +-
 python/paddle/fluid/layers/nn.py                            | 6 ++++--
 python/paddle/fluid/tests/unittests/dist_ctr.py             | 2 ++
 python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 2 ++
 .../fluid/tests/unittests/test_lookup_remote_table_op.py    | 1 +
 5 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 59605a7e16..c40f603341 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -50,7 +50,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform
 paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
-paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype', 'remote_prefetch'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32', False))
+paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
 paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None))
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 2051a1ea01..29a0de29dc 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -287,8 +287,7 @@ def embedding(input,
               is_distributed=False,
               padding_idx=None,
               param_attr=None,
-              dtype='float32',
-              remote_prefetch=False):
+              dtype='float32'):
     """
     **Embedding Layer**
 
@@ -327,6 +326,9 @@ def embedding(input,
     """
 
     helper = LayerHelper('embedding', **locals())
+    remote_prefetch = False
+    if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
+        remote_prefetch = True
     if remote_prefetch:
         assert is_sparse is True and is_distributed is False
     w = helper.create_parameter(
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py
index 088f16a8ac..d7ddafdb17 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
@@ -16,11 +16,13 @@ from __future__ import print_function
 
 import paddle
 import paddle.fluid as fluid
+import os
 
 import dist_ctr_reader
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 IS_SPARSE = True
+os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
 
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 3a905d4a02..26c596ba54 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -792,6 +792,8 @@ class TestNCCL2Transpile(TranspilerTest):
 # test for remote prefetch
 class TestRemoteLookupTable(TestDistLookupTableBase):
     def net_conf(self):
+        import os
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
         self.network_with_table(
             is_sparse=True, is_distributed=False, remote_prefetch=True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
index b46e61d2ef..47830fb56b 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@@ -174,6 +174,7 @@ class TestListenAndServOp(unittest.TestCase):
                     self.assertTrue((result_array[i] == id).all())
 
     def test_lookup_remote_table(self):
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
         # run pserver on CPU in sync mode
         p0 = self._start_pserver(0, False, True, run_pserver)
         self._wait_ps_ready(p0.pid)

From d1a17cadd48be68a3abde522e6dc94a608db207d Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Thu, 29 Nov 2018 17:17:52 +0800
Subject: [PATCH 52/90] fix cudnn rnn; test=develop

---
 paddle/fluid/operators/cudnn_lstm_op.cc    | 38 +++++++----
 paddle/fluid/operators/cudnn_lstm_op.cu.cc |  9 ++-
 paddle/fluid/operators/cudnn_lstm_op.h     |  7 ++-
 python/paddle/fluid/layers/nn.py           | 73 ++++++++++++----------
 4 files changed, 78 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index cadc5b8830..c73c64f4a8 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -122,13 +122,11 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
                   "The will affect the shape of the Out, last_h, and last_c")
         .SetDefault(false);
     AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
-    AddAttr<int>("batch_size", "the instance number the batch").SetDefault(10);
     AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
     AddAttr<int>("num_layers", "the total layer number of the LSTM")
         .SetDefault(1);
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
-    AddAttr<bool>("fix_seed", "True if it fix dropout seed").SetDefault(false);
-    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
+    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
     AddComment(R"DOC(
 CUDNN LSTM implementation
 
@@ -136,16 +134,32 @@ A four-gate Long Short-Term Memory network with no peephole connections.
 In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
 the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
-it = σ(Wi X xt + Ri X ht-1 + bWi + bRi)
-ft = σ(Wf X xt + Rf X ht-1 + bWf + bRf)
-ot = σ(Wo X xt + Ro X ht-1 + bWo + bRo)
-c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
-ct = ft * ct-1 + it * c't
-ht = ot * tanh(ct)
+$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
 
-Where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
+$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+
+$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+
+$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+
+$$ h_t = o_t \\odot tanh(c_t) $$
+
+- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+  of weights from the input gate to the input)
+- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors).
+- sigmoid is the logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- $\odot$ denotes the element-wise product of the vectors.
+- `tanh` is the activation function.
+- $\tilde{c_t}$ is also called the candidate hidden state,
+  which is computed based on the current input and the previous hidden state.
+
+Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
 X represents a matrix multiplication.
-and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively.
 
 
 )DOC");
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 9caf65b53f..cadd3772af 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -273,7 +273,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     size_t max_len = ctx.Attr<int>("max_len");
     float dropout_prob = ctx.Attr<float>("dropout_prob");
     bool is_bidirec = ctx.Attr<bool>("is_bidirec");
-    int batch_size = ctx.Attr<int>("batch_size");
     int input_size = ctx.Attr<int>("input_size");
     int hidden_size = ctx.Attr<int>("hidden_size");
     int num_layers = ctx.Attr<int>("num_layers");
@@ -304,9 +303,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
       cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                             ->GetMutable<CudnnRNNCache>();
       std::random_device rnd;
-      int seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : rnd();
+      int seed = ctx.Attr<int>("seed");
+      if (seed == -1) {
+        seed = rnd();
+      }
 
       auto input_w_numel = w->numel();
+      auto batch_size = x->dims()[1];
       cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
                             hidden_size, num_layers, dropout_prob, is_bidirec,
                             seed, input_w_numel);
diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h
index fb4b37e46e..fc329cc239 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.h
+++ b/paddle/fluid/operators/cudnn_lstm_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -29,7 +29,10 @@ using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 class CudnnLSTMKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {}
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(
+        "CPU is not support for this kernel now. Will be add in the future");
+  }
 };
 
 template <typename DeviceContext, typename T>
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 30d108bbe8..fa2215f9f5 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -169,7 +169,7 @@ __all__ = [
     'log_loss',
     'add_position_encoding',
     'bilinear_tensor_product',
-    'cudnn_lstm',
+    'lstm',
 ]
 
 
@@ -467,39 +467,53 @@ def dynamic_lstm(input,
     return hidden, cell
 
 
-def cudnn_lstm(input,
-               init_h,
-               init_c,
-               batch_size,
-               max_len,
-               dropout_prob,
-               input_size,
-               hidden_size,
-               num_layers,
-               is_bidirec=False,
-               dtype='float32',
-               is_test=False,
-               name=None,
-               default_initializer=None,
-               fix_seed=False,
-               seed=0):
+def lstm(input,
+         init_h,
+         init_c,
+         max_len,
+         dropout_prob,
+         input_size,
+         hidden_size,
+         num_layers,
+         is_bidirec=False,
+         dtype='float32',
+         is_test=False,
+         name=None,
+         default_initializer=None,
+         seed=-1):
     """
-    CUDNN LSTM implementation
+    If the device is a GPU, this op uses the cuDNN LSTM implementation.
 
     A four-gate Long Short-Term Memory network with no peephole connections.
     In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
-    it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi)
-    ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf)
-    ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo)
-    c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc)
-    ct = ft * ct-1 + it * c't
-    ht = ot * tanh(ct)
+    $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
+
+    $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+
+    $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+
+    $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+
+    $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+
+    $$ h_t = o_t \\odot tanh(c_t) $$
+
+    - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+      of weights from the input gate to the input)
+    - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors).
+    - sigmoid is the logistic sigmoid function.
+    - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+      and cell activation vectors, respectively, all of which have the same size as
+      the cell output activation vector $h$.
+    - $\odot$ denotes the element-wise product of the vectors.
+    - `tanh` is the activation function.
+    - $\tilde{c_t}$ is also called the candidate hidden state,
+      which is computed based on the current input and the previous hidden state.
 
     Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
     X represents a matrix multiplication.
-    and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively.
 
 
     Args:
@@ -510,7 +524,6 @@ def cudnn_lstm(input,
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        batch_size (int): total distance numer of the batch
         max_len (int): max length of LSTM. The first dim of the input tensor CAN NOT be greater than max_len.
         dropout_prob(float): dropout prob. Dropout ONLY works between rnn layers, NOT between time steps.
                              There is NO dropout on the rnn output of the last RNN layer.
@@ -524,9 +537,7 @@ def cudnn_lstm(input,
                          will be named automatically.
         default_initializer(Initialize|None): Which initializer to use to initialize the weight.
                          If set to None, the default initializer will be used.
-
-        fix_seed(bool): If it's True, fix seed will used for dropout in LSTM
-        seed(int): If fix_seed is True, dropout seed in LSTM will use this seed 
+        seed(int): Seed for dropout in LSTM. If it is -1, dropout will use a random seed.
 
 
     Returns:
@@ -553,7 +564,7 @@ def cudnn_lstm(input,
             init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
             init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
 
-            rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \
+            rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \
                     max_len, dropout_prob, input_size, hidden_size, \
                     num_layers)
     """
@@ -610,12 +621,10 @@ def cudnn_lstm(input,
             'max_len': max_len,
             'is_bidirec': is_bidirec,
             'input_size': input_size,
-            'batch_size': batch_size,
             'hidden_size': hidden_size,
             'num_layers': num_layers,
             'is_test': is_test,
             'dropout_prob': dropout_prob,
-            'fix_seed': fix_seed,
             'seed': seed,
         })
     return out, last_h, last_c

From 4b9689379f690aebc6b0d953b998b3ab2dcd3f81 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Thu, 29 Nov 2018 17:19:47 +0800
Subject: [PATCH 53/90] fix cudnn lstm; test=develop

---
 paddle/fluid/API.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c27078a6e4..dd9ff7cb7b 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -187,7 +187,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.cudnn_lstm ArgSpec(args=['input', 'init_h', 'init_c', 'batch_size', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'fix_seed', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0))
+paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, -1))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)

From 589b863b986b4ab3bae4c572c89a201df4a3edc7 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 29 Nov 2018 17:20:20 +0800
Subject: [PATCH 54/90] Add EstimateFlops

test=develop
---
 paddle/fluid/framework/details/op_registry.h | 20 +++++++++++++++++---
 paddle/fluid/framework/op_info.h             |  7 +++++++
 paddle/fluid/framework/type_defs.h           |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index eea7e712f8..1ce18c3d6b 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -32,7 +32,9 @@ enum OpInfoFillType {
   kOpProtoAndCheckerMaker = 1,
   kGradOpDescMaker = 2,
   kVarTypeInference = 3,
-  kShapeInference = 4
+  kShapeInference = 4,
+  kEstimateFlops = 5,
+  kUnknown = -1
 };
 
 template <typename T>
@@ -48,8 +50,10 @@ struct OpInfoFillTypeID {
                                     ? kVarTypeInference
                                     : (std::is_base_of<InferShapeBase, T>::value
                                            ? kShapeInference
-                                           : static_cast<OpInfoFillType>(
-                                                 -1)))));
+                                           : (std::is_base_of<EstimateFlopsBase,
+                                                              T>::value
+                                                  ? kEstimateFlops
+                                                  : kUnknown)))));
   }
 };
 
@@ -139,6 +143,16 @@ struct OpInfoFiller<T, kShapeInference> {
   }
 };
 
+template <typename T>
+struct OpInfoFiller<T, kEstimateFlops> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->estimate_flops_ = [](InferShapeContext* ctx) {
+      T estimate_flops;
+      return estimate_flops(ctx);
+    };
+  }
+};
+
 }  // namespace details
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 19e5c2c73e..e0bf5ed999 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -31,6 +31,12 @@ class InferShapeBase {
   virtual void operator()(InferShapeContext*) const = 0;
 };
 
+class EstimateFlopsBase {
+ public:
+  virtual ~EstimateFlopsBase() = default;
+  virtual size_t operator()(InferShapeContext*) const = 0;
+};
+
 struct OpInfo {
   OpCreator creator_;
   GradOpMakerFN grad_op_maker_;
@@ -38,6 +44,7 @@ struct OpInfo {
   OpAttrChecker* checker_{nullptr};
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
+  EstimateFlopsFN estimate_flops_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 2de6233a9e..cdc5fa6862 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -54,5 +54,7 @@ using InferVarTypeFN =
 
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 
+using EstimateFlopsFN = std::function<size_t(InferShapeContext*)>;
+
 }  // namespace framework
 }  // namespace paddle

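As a usage illustration (not part of this patch), here is a hedged sketch of
how an operator might implement the new EstimateFlopsBase hook. The class name
MulOpEstimateFlops and the 2*M*N*K cost model are assumptions for the example;
only EstimateFlopsBase and InferShapeContext::GetInputDim come from the
framework:

    #include "paddle/fluid/framework/op_info.h"
    #include "paddle/fluid/framework/shape_inference.h"

    namespace paddle {
    namespace framework {

    // Hypothetical FLOPs estimator for a GEMM-like "mul" operator.
    class MulOpEstimateFlops : public EstimateFlopsBase {
     public:
      size_t operator()(InferShapeContext* ctx) const override {
        auto x_dims = ctx->GetInputDim("X");  // M x K
        auto y_dims = ctx->GetInputDim("Y");  // K x N
        // One multiply and one add per output element per K step.
        return 2 * static_cast<size_t>(x_dims[0]) * x_dims[1] * y_dims[1];
      }
    };

    }  // namespace framework
    }  // namespace paddle

Listing such a class in a REGISTER_OPERATOR invocation would let
OpInfoFillTypeID resolve it to kEstimateFlops, so that the OpInfoFiller above
installs it into OpInfo::estimate_flops_.
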
From 839193fd1fb9441e5928973458e0af92924663ce Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 29 Nov 2018 17:30:37 +0800
Subject: [PATCH 55/90] fix unit test test=develop

---
 .../paddle/fluid/tests/unittests/dist_ctr.py  |  6 ++---
 .../tests/unittests/test_dist_transpiler.py   | 24 +++++++------------
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py
index d7ddafdb17..6596982433 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
@@ -61,8 +61,7 @@ class TestDistCTR2x2(TestDistRunnerBase):
             param_attr=fluid.ParamAttr(
                 name="deep_embedding",
                 initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=IS_SPARSE,
-            remote_prefetch=True)
+            is_sparse=IS_SPARSE)
         dnn_pool = fluid.layers.sequence_pool(
             input=dnn_embedding, pool_type="sum")
         dnn_out = dnn_pool
@@ -84,8 +83,7 @@ class TestDistCTR2x2(TestDistRunnerBase):
             param_attr=fluid.ParamAttr(
                 name="wide_embedding",
                 initializer=fluid.initializer.Constant(value=0.01)),
-            is_sparse=IS_SPARSE,
-            remote_prefetch=True)
+            is_sparse=IS_SPARSE)
         lr_pool = fluid.layers.sequence_pool(input=lr_embbding, pool_type="sum")
 
         merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 26c596ba54..194387bc98 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -447,23 +447,19 @@ class TestEmptyPserverOptimizeBlocks(TranspilerTest):
 
 
 class TestDistLookupTableBase(TranspilerTest):
-    def network_with_table(self,
-                           is_sparse,
-                           is_distributed,
-                           remote_prefetch=False):
+    def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
         self.lookup_table_name = 'shared_w'
 
-        def emb_pool(ids, table_name, is_distributed, remote_prefetch):
+        def emb_pool(ids, table_name, is_distributed):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
                 param_attr=table_name,
                 is_sparse=is_sparse,
-                is_distributed=is_distributed,
-                remote_prefetch=remote_prefetch)
+                is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
             return pool
 
@@ -473,12 +469,9 @@ class TestDistLookupTableBase(TranspilerTest):
             name='brand_ids', shape=[1], dtype='int64', lod_level=1)
         profile_ids = fluid.layers.data(
             name='profile_ids', shape=[1], dtype='int64', lod_level=1)
-        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed,
-                             False)
-        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed,
-                             False)
-        profile_emb = emb_pool(profile_ids, "profile_emb", False,
-                               remote_prefetch)
+        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
+        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
+        profile_emb = emb_pool(profile_ids, "profile_emb", False)
         fc0 = fluid.layers.concat(
             input=[title_emb, brand_emb, profile_emb], axis=1)
         predict = fluid.layers.fc(input=fc0,
@@ -794,8 +787,7 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
     def net_conf(self):
         import os
         os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
-        self.network_with_table(
-            is_sparse=True, is_distributed=False, remote_prefetch=True)
+        self.network_with_table(is_sparse=True, is_distributed=False)
 
     def transpiler_test_impl(self):
         pserver1, startup1 = self.get_pserver(self.pserver1_ep)
@@ -826,7 +818,7 @@ class TestRemoteLookupTable(TestDistLookupTableBase):
             'split_selected_rows', 'send', 'sequence_pool_grad',
             'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
             'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
-            'recv', 'recv', 'fetch_barrier', 'concat'
+            'recv', 'fetch_barrier'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 

From 92f5be1d82b1d6994ad18ac09f3e926bd1b631c4 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Thu, 29 Nov 2018 19:04:38 +0800
Subject: [PATCH 56/90] remove inputvarname in operator; test=develop

---
 paddle/fluid/framework/operator.h          | 8 --------
 paddle/fluid/operators/cudnn_lstm_op.cu.cc | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 7289a451e5..5bd68f9ac2 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -174,14 +174,6 @@ class ExecutionContext {
     return op_.Inputs(name).size();
   }
 
-  const std::string InputVarName(const std::string& name) const {
-    return op_.Input(name);
-  }
-
-  const std::string OutputVarName(const std::string& name) const {
-    return op_.Output(name);
-  }
-
   size_t OutputSize(const std::string& name) const {
     return op_.Outputs(name).size();
   }
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index cadd3772af..811975a9f3 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -292,7 +292,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
       // multi-devices before the first running.
       // use parent scope to make cache persistable
       auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
-      auto cache_var_name = ctx.InputVarName("Cache");
+      auto cache_var_name = ctx.Inputs("Cache")[0];
       cache_var = scope->Var(cache_var_name);
     }
     CudnnRNNCache *cudnn_rnn_cache = nullptr;

From bd94ab0ef3e172fab40aa316914dbae8868c9f41 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Thu, 29 Nov 2018 19:21:11 +0800
Subject: [PATCH 57/90] rename op; test=develop

---
 paddle/fluid/operators/{cudnn_lstm_op.cc => lstm_cudnn_op.cc}   | 2 +-
 .../operators/{cudnn_lstm_op.cu.cc => lstm_cudnn_op.cu.cc}      | 2 +-
 paddle/fluid/operators/{cudnn_lstm_op.h => lstm_cudnn_op.h}     | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename paddle/fluid/operators/{cudnn_lstm_op.cc => lstm_cudnn_op.cc} (99%)
 rename paddle/fluid/operators/{cudnn_lstm_op.cu.cc => lstm_cudnn_op.cu.cc} (99%)
 rename paddle/fluid/operators/{cudnn_lstm_op.h => lstm_cudnn_op.h} (100%)

diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/lstm_cudnn_op.cc
similarity index 99%
rename from paddle/fluid/operators/cudnn_lstm_op.cc
rename to paddle/fluid/operators/lstm_cudnn_op.cc
index c73c64f4a8..ca60fb4b0b 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/lstm_cudnn_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/cudnn_lstm_op.h"
+#include "paddle/fluid/operators/lstm_cudnn_op.h"
 #include <string>
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/lstm_cudnn_op.cu.cc
similarity index 99%
rename from paddle/fluid/operators/cudnn_lstm_op.cu.cc
rename to paddle/fluid/operators/lstm_cudnn_op.cu.cc
index 811975a9f3..7a67bbe539 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/lstm_cudnn_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/cudnn_lstm_op.h"
+#include "paddle/fluid/operators/lstm_cudnn_op.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/lstm_cudnn_op.h
similarity index 100%
rename from paddle/fluid/operators/cudnn_lstm_op.h
rename to paddle/fluid/operators/lstm_cudnn_op.h

From 8f2e556e65ceaf4e530bcbe0055b33b236c73e17 Mon Sep 17 00:00:00 2001
From: ZhenWang <wangzhen31@baidu.com>
Date: Thu, 29 Nov 2018 19:33:47 +0800
Subject: [PATCH 58/90] support the small dam model. test=develop

---
 paddle/fluid/inference/tests/api/CMakeLists.txt         | 4 +++-
 paddle/fluid/inference/tests/api/analyzer_dam_tester.cc | 9 ++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 7dc88d9dd0..b54693d948 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -48,7 +48,9 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 
 # DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
-download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
+# For the normal DAM model
+# download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
+download_model_and_data(${DAM_INSTALL_DIR} "small_dam_model.tar.gz" "small_dam_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
 
 # chinese_ner
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index b369cba5c8..b5a68538ab 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -17,7 +17,7 @@
 namespace paddle {
 namespace inference {
 using contrib::AnalysisConfig;
-#define MAX_TURN_NUM 9
+#define MAX_TURN_NUM 1
 #define MAX_TURN_LEN 50
 static std::vector<float> result_data;
 
@@ -148,8 +148,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }
 
 void SetConfig(contrib::AnalysisConfig *cfg) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->model_dir = FLAGS_infer_model;
   cfg->use_gpu = false;
   cfg->device = 0;
   cfg->specify_input_name = true;
@@ -202,8 +201,8 @@ TEST(Analyzer_dam, fuse_statis) {
   auto fuse_statis = GetFuseStatis(
       static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
-  EXPECT_EQ(num_ops, 2020);
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 45);
+  EXPECT_EQ(num_ops, 292);
 }
 
 // Compare result of NativeConfig and AnalysisConfig

From c856ac8721d6ce41e4cb89c9d6fb369fb0f8d739 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Thu, 29 Nov 2018 20:30:44 +0800
Subject: [PATCH 59/90] add OpHasAttr in node.h, update is_test_pass and
 mkldnn_placement_pass

test=develop
---
 paddle/fluid/framework/ir/is_test_pass.cc     |  2 +-
 .../fluid/framework/ir/is_test_pass_tester.cc |  4 +--
 .../framework/ir/mkldnn_placement_pass.cc     |  2 +-
 paddle/fluid/framework/ir/node.cc             | 26 ++++++++++++++++++-
 paddle/fluid/framework/ir/node.h              |  2 ++
 5 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index 292f232ffc..a61bd5f291 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
       auto* op = n->Op();
-      if (op->HasAttr("is_test")) {
+      if (n->OpHasAttr("is_test")) {
         op->SetAttr("is_test", true);
       } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                  end(op_list)) {
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
index 9696441a21..a5fb0abb3c 100644
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
       auto* op = node->Op();
       auto op_name = boost::get<std::string>(op->GetAttr("name"));
       if (op_name == "conv3") {
-        ASSERT_FALSE(op->HasAttr("is_test"));
+        ASSERT_FALSE(node->OpHasAttr("is_test"));
       } else {
-        ASSERT_TRUE(op->HasAttr("is_test"));
+        ASSERT_TRUE(node->OpHasAttr("is_test"));
         EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
       }
     }
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
index 65be69b7f5..366057b01e 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   VLOG(3) << "Applies MKL-DNN placement strategy.";
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
+    if (n->IsOp() && n->OpHasAttr("use_mkldnn")) {
       n->Op()->SetAttr("use_mkldnn", true);
     }
   }
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 50d9113088..4c4da10b04 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/op_info.h"
 
 namespace paddle {
 namespace framework {
@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
 const char Node::kControlDepVarName[] = "__control_var";
 #endif
 
-std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
+std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
                                         Node::Type type) {
   return std::unique_ptr<Node>(new Node(name, type));
 }
+
+bool Node::OpHasAttr(const std::string &name) const {
+  if (Op()->HasAttr(name)) {
+    return true;
+  } else {
+    auto &op_info = OpInfoMap::Instance();
+    auto op_type = Op()->Type();
+    if (op_info.Has(op_type)) {
+      auto op_info_ptr = op_info.Get(op_type);
+      if (op_info_ptr.HasOpProtoAndChecker()) {
+        const proto::OpProto &proto = op_info_ptr.Proto();
+        for (int i = 0; i != proto.attrs_size(); ++i) {
+          const proto::OpProto::Attr &attr = proto.attrs(i);
+          if (attr.name() == name) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index d2a393b3f1..ac08006a49 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -108,6 +108,8 @@ class Node {
            Name().find(ir::Node::kControlDepVarName) != std::string::npos;
   }
 
+  bool OpHasAttr(const std::string& name) const;
+
   std::vector<Node*> inputs;
   std::vector<Node*> outputs;
 

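For illustration, a hedged sketch of how a custom IR pass could use the new
Node::OpHasAttr helper; the pass name and the "use_my_backend" attribute are
hypothetical, and the loop mirrors mkldnn_placement_pass above. The point of
the helper is that it consults the registered OpProto as a fallback, so it
also finds attributes the op maker declares but the OpDesc has not set yet:

    std::unique_ptr<ir::Graph> MyPlacementPass::ApplyImpl(
        std::unique_ptr<ir::Graph> graph) const {
      for (const Node* n : graph->Nodes()) {
        // OpHasAttr checks the OpDesc first, then falls back to the proto.
        if (n->IsOp() && n->OpHasAttr("use_my_backend")) {
          n->Op()->SetAttr("use_my_backend", true);
        }
      }
      return graph;
    }
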
From 4b7617740e9aba2158834ded9762c7d9b438c28c Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Thu, 29 Nov 2018 20:46:08 +0800
Subject: [PATCH 60/90] fix container not cleared (#14231)

---
 .../fluid/inference/analysis/analysis_pass.h  |  2 --
 .../fluid/inference/api/analysis_predictor.cc | 12 ++++++----
 paddle/fluid/inference/api/api_impl.cc        |  6 ++---
 .../api/details/reset_tensor_array.cc         | 23 +++++++++++++++++++
 .../api/details/reset_tensor_array.h          | 17 ++++++++++++++
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index 299f235a74..d5a972fab3 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -46,8 +46,6 @@ class AnalysisPass {
  protected:
   // User should implement these.
   virtual void RunImpl(Argument* argument) = 0;
-
-  Argument* argument_{nullptr};
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 1862f61f0f..391330a7c0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
-  tensor_array_batch_cleaner_.ResetTensorArray();
+  // All the containers in the scope are held across batches in inference, but
+  // the operators assume that each container is reset after each batch. Here
+  // is a bugfix: collect all the container variables and reset them to a
+  // bool; the next time, the operator will call MutableData and construct a
+  // new container again, so that the container is empty for each batch.
+  tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
   return true;
 }
 
@@ -417,7 +421,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 bool AnalysisPredictor::ZeroCopyRun() {
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
   tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 74369e8866..4c5b412a2c 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
-  tensor_array_batch_cleaner_.ResetTensorArray();
+  // For other vector-like containers that are not cleaned after each batch.
+  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
index 4ae6c6dc9f..569a487328 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
   }
 }
 
+void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) {
+  if (no_tensor_flag_) {
+    for (auto &var_name : scope->LocalVarNames()) {
+      auto *var = scope->FindVar(var_name);
+      if (!var->IsInitialized()) continue;
+      if (!valid_types_.count(var->Type())) {
+        no_tensor_vars_.insert(var);
+      }
+    }
+
+    for (auto *kid : scope->kids()) {
+      CollectNoTensorVars(kid);
+    }
+    no_tensor_flag_ = false;  // Only collect once.
+  }
+}
+
+void TensorArrayBatchCleaner::ResetNoTensorVars() {
+  for (auto *var : no_tensor_vars_) {
+    var->Clear();
+  }
+}
+
 }  // namespace details
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
index a39449ff0e..6a5ea64de6 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -14,9 +14,11 @@
 
 #pragma once
 
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace details {
@@ -24,13 +26,28 @@ namespace details {
 // Clean the TensorArray each batch to make the behavior the same with the
 // training phase.
 struct TensorArrayBatchCleaner {
+  TensorArrayBatchCleaner() {
+    valid_types_.insert(typeid(framework::Tensor));
+    valid_types_.insert(typeid(framework::LoDTensor));
+  }
+  // Collect the variables that are not Tensor or LoDTensor, and reset them to
+  // a bool (a trick), because some of them are containers, and some operators
+  // just keep inserting new items without clearing the containers first; so
+  // the memory grows larger and larger in inference services deployed online.
+  void CollectNoTensorVars(framework::Scope *scope);
+  void ResetNoTensorVars();
+
   // Fix the tensor array not clear in the inference scenarios.
   void CollectTensorArrays(framework::Scope *scope);
   void ResetTensorArray();
 
  private:
   bool flag_{true};
+  bool no_tensor_flag_{true};
   std::vector<framework::LoDTensorArray *> arrays_;
+
+  std::unordered_set<std::type_index> valid_types_;
+  std::unordered_set<framework::Variable *> no_tensor_vars_;
 };
 
 }  // namespace details

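For context, a hedged sketch of the per-batch discipline the cleaner enforces
in a serving loop; HasNextBatch and RunBatch are hypothetical stand-ins for
the request loop and the executor run, while in this patch the real call
sites sit at the end of AnalysisPredictor::Run and NativePaddlePredictor::Run:

    paddle::details::TensorArrayBatchCleaner cleaner;
    while (HasNextBatch()) {
      RunBatch(scope);  // operators may grow container variables here
      // The first call walks the scope once and remembers every variable
      // that is neither Tensor nor LoDTensor; later calls are no-ops.
      cleaner.CollectNoTensorVars(scope);
      // Clear the remembered containers so the next batch starts empty.
      cleaner.ResetNoTensorVars();
    }
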
From 33b49635055963def9ae057ef68259043934f48c Mon Sep 17 00:00:00 2001
From: ZhenWang <wangzhen31@baidu.com>
Date: Thu, 29 Nov 2018 20:57:19 +0800
Subject: [PATCH 61/90] unify the normal and small dam model.

---
 .../fluid/inference/tests/api/CMakeLists.txt  |  9 ++-
 .../tests/api/analyzer_dam_tester.cc          | 76 +++++++++++--------
 2 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index b54693d948..ab45249f28 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -48,10 +48,13 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 
 # DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
-# For the normal DAM model
+# For the normal DAM model: download DAM_model.tar.gz and DAM_data.txt.tar.gz.
 # download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-download_model_and_data(${DAM_INSTALL_DIR} "small_dam_model.tar.gz" "small_dam_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
+download_model_and_data(${DAM_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
+# For the normal DAM model: --max_turn_num=9.
+inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${DAM_INSTALL_DIR}/model --infer_data=${DAM_INSTALL_DIR}/data.txt --max_turn_num=1)
 
 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index b5a68538ab..925f19417d 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -14,38 +14,54 @@
 
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
+DEFINE_int32(max_turn_num, 1,
+             "The max turn number: 1 for the small and 9 for the normal.");
+
 namespace paddle {
 namespace inference {
 using contrib::AnalysisConfig;
-#define MAX_TURN_NUM 1
-#define MAX_TURN_LEN 50
+
+constexpr int32_t kMaxTurnLen = 50;
+
 static std::vector<float> result_data;
 
 struct DataRecord {
-  std::vector<std::vector<int64_t>>
-      turns[MAX_TURN_NUM];  // turns data : MAX_TURN_NUM
-  std::vector<std::vector<float>>
-      turns_mask[MAX_TURN_NUM];                // turns mask data : MAX_TURN_NUM
-  std::vector<std::vector<int64_t>> response;  // response data : 1
+  std::vector<std::vector<int64_t>> *turns;
+  std::vector<std::vector<float>> *turns_mask;
+  std::vector<std::vector<int64_t>> response;     // response data : 1
   std::vector<std::vector<float>> response_mask;  // response mask data : 1
   size_t batch_iter{0};
   size_t batch_size{1};
   size_t num_samples;  // total number of samples
-  DataRecord() = default;
+
+  DataRecord() {
+    turns = new std::vector<std::vector<
+        int64_t>>[FLAGS_max_turn_num];  // turns data : FLAGS_max_turn_num
+    turns_mask = new std::vector<std::vector<
+        float>>[FLAGS_max_turn_num];  // turns mask data : FLAGS_max_turn_num
+  }
+
   explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
+      : DataRecord() {
+    this->batch_size = batch_size;
     Load(path);
   }
+
+  ~DataRecord() {
+    delete[] turns;
+    delete[] turns_mask;
+  }
+
   DataRecord NextBatch() {
     DataRecord data;
     size_t batch_end = batch_iter + batch_size;
     // NOTE: skip the final batch if not enough data is provided.
     if (batch_end <= response.size()) {
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
         data.turns[i].assign(turns[i].begin() + batch_iter,
                              turns[i].begin() + batch_end);
       }
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
         data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter,
                                   turns_mask[i].begin() + batch_end);
       }
@@ -60,6 +76,7 @@ struct DataRecord {
     batch_iter += batch_size;
     return data;
   }
+
   void Load(const std::string &path) {
     std::ifstream file(path);
     std::string line;
@@ -69,30 +86,30 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, ',', &data);
-      CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
+      CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3));
       // load turn data
-      std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
+      std::vector<std::vector<int64_t>> turns_tmp(FLAGS_max_turn_num);
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
         split_to_int64(data[i], ' ', &turns_tmp[i]);
         turns[i].push_back(std::move(turns_tmp[i]));
       }
       // load turn_mask data
-      std::vector<float> turns_mask_tmp[MAX_TURN_NUM];
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
-        split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]);
+      std::vector<std::vector<float>> turns_mask_tmp(FLAGS_max_turn_num);
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
+        split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]);
         turns_mask[i].push_back(std::move(turns_mask_tmp[i]));
       }
       // load response data
       std::vector<int64_t> response_tmp;
-      split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp);
+      split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp);
       response.push_back(std::move(response_tmp));
       // load response_mask data
       std::vector<float> response_mask_tmp;
-      split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp);
+      split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp);
       response_mask.push_back(std::move(response_mask_tmp));
       // load result data
       float result_tmp;
-      result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]);
+      result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]);
       result_data.push_back(result_tmp);
     }
     num_samples = num_lines;
@@ -101,8 +118,8 @@ struct DataRecord {
 
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                    int batch_size) {
-  PaddleTensor turns_tensor[MAX_TURN_NUM];
-  PaddleTensor turns_mask_tensor[MAX_TURN_NUM];
+  std::vector<PaddleTensor> turns_tensor(FLAGS_max_turn_num);
+  std::vector<PaddleTensor> turns_mask_tensor(FLAGS_max_turn_num);
   PaddleTensor response_tensor;
   PaddleTensor response_mask_tensor;
   std::string turn_pre = "turn_";
@@ -110,16 +127,16 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 
   auto one_batch = data->NextBatch();
   int size = one_batch.response[0].size();
-  CHECK_EQ(size, MAX_TURN_LEN);
+  CHECK_EQ(size, kMaxTurnLen);
   // turn tensor assignment
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     turns_tensor[i].name = turn_pre + std::to_string(i);
     turns_tensor[i].shape.assign({batch_size, size, 1});
     turns_tensor[i].dtype = PaddleDType::INT64;
     TensorAssignData<int64_t>(&turns_tensor[i], one_batch.turns[i]);
   }
   // turn mask tensor assignment
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i);
     turns_mask_tensor[i].shape.assign({batch_size, size, 1});
     turns_mask_tensor[i].dtype = PaddleDType::FLOAT32;
@@ -137,10 +154,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   TensorAssignData<float>(&response_mask_tensor, one_batch.response_mask);
 
   // Set inputs.
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     input_slots->push_back(std::move(turns_tensor[i]));
   }
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     input_slots->push_back(std::move(turns_mask_tensor[i]));
   }
   input_slots->push_back(std::move(response_tensor));
@@ -148,7 +165,8 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 }
 
 void SetConfig(contrib::AnalysisConfig *cfg) {
-  cfg->model_dir = FLAGS_infer_model;
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->param_file = FLAGS_infer_model + "/param";
   cfg->use_gpu = false;
   cfg->device = 0;
   cfg->specify_input_name = true;
@@ -201,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) {
   auto fuse_statis = GetFuseStatis(
       static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 45);
-  EXPECT_EQ(num_ops, 292);
 }
 
 // Compare result of NativeConfig and AnalysisConfig

From d5947b0ed7b0b0d1365d40b12de7bc4fc099a2b4 Mon Sep 17 00:00:00 2001
From: ZhenWang <wangzhen31@baidu.com>
Date: Thu, 29 Nov 2018 21:12:29 +0800
Subject: [PATCH 62/90] test=develop

---
 paddle/fluid/inference/tests/api/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index ab45249f28..22c9a8735b 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -51,7 +51,7 @@ set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 # For the normal DAM model: download DAM_model.tar.gz and DAM_data.txt.tar.gz.
 # download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
 download_model_and_data(${DAM_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
-# For the normal DAM model: --max_turn_num=9.
+# For the normal DAM model: use --max_turn_num=9.
 inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
         ARGS --infer_model=${DAM_INSTALL_DIR}/model --infer_data=${DAM_INSTALL_DIR}/data.txt --max_turn_num=1)

From bcc90123f0bba8111a913254a805c8466f5fb688 Mon Sep 17 00:00:00 2001
From: luotao1 <luotao02@baidu.com>
Date: Thu, 29 Nov 2018 21:13:02 +0800
Subject: [PATCH 63/90] speedup box_coder_op for multi-threads

test=develop
---
 paddle/fluid/operators/detection/box_coder_op.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index 5ed8520acd..b2a2bcdce9 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     const T* prior_box_var_data = nullptr;
     if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         T prior_box_width = prior_box_data[j * len + 2] -
@@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     const T* prior_box_var_data = nullptr;
     if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;

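A standalone sketch of what collapse(2) buys here: OpenMP flattens the two
perfectly nested loops into a single row * col iteration space, so threads
get balanced work even when the outer extent is small. The toy sizes are
assumptions for illustration; compile with -fopenmp:

    #include <cstdint>
    #include <vector>

    int main() {
      const int64_t row = 4, col = 100000;
      std::vector<float> out(row * col);
    #pragma omp parallel for collapse(2)
      for (int64_t i = 0; i < row; ++i) {
        for (int64_t j = 0; j < col; ++j) {
          // The row * col iterations are divided among threads as one
          // flat range instead of only `row` outer-loop chunks.
          out[i * col + j] = static_cast<float>(i + j);
        }
      }
      return 0;
    }
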
From e1da6cd75407e5e72c3900ab2d098230b308e2e8 Mon Sep 17 00:00:00 2001
From: ZhenWang <wangzhen31@baidu.com>
Date: Thu, 29 Nov 2018 21:33:49 +0800
Subject: [PATCH 64/90] add the normal dam and the small dam

---
 paddle/fluid/inference/tests/api/CMakeLists.txt  | 16 +++++++++-------
 .../inference/tests/api/analyzer_dam_tester.cc   |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 22c9a8735b..78c8ed1534 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -46,15 +46,17 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
-# DAM
+# Normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
-# For the normal DAM model: download DAM_model.tar.gz and DAM_data.txt.tar.gz.
-# download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-download_model_and_data(${DAM_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
-# For the normal DAM model: use --max_turn_num=9.
-inference_analysis_test(test_analyzer_dam SRCS analyzer_dam_tester.cc
+download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
+
+# Small DAM
+set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
+download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
+inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-        ARGS --infer_model=${DAM_INSTALL_DIR}/model --infer_data=${DAM_INSTALL_DIR}/data.txt --max_turn_num=1)
+        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
 
 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 925f19417d..a3a6130db7 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
-DEFINE_int32(max_turn_num, 1,
+DEFINE_int32(max_turn_num, 9,
              "The max turn number: 1 for the small and 9 for the normal.");
 
 namespace paddle {

From 6e48e47406cb3191d4a8a9901730bbe4e8e50cb7 Mon Sep 17 00:00:00 2001
From: ZhenWang <wangzhen31@baidu.com>
Date: Thu, 29 Nov 2018 21:35:36 +0800
Subject: [PATCH 65/90] test=develop

---
 paddle/fluid/inference/tests/api/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 78c8ed1534..a07626a103 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -46,12 +46,12 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
-# Normal DAM
+# normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
 
-# Small DAM
+# small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
 download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc

From fc61bf1b168992bcedf5b645a841f6ea8a52d449 Mon Sep 17 00:00:00 2001
From: baojun-nervana <baojun.liu@intel.com>
Date: Thu, 29 Nov 2018 12:06:16 -0800
Subject: [PATCH 66/90] Renamed methods test=develop

---
 paddle/fluid/framework/ngraph_bridge.cc   | 2 +-
 paddle/fluid/framework/ngraph_bridge.h    | 2 +-
 paddle/fluid/framework/ngraph_operator.cc | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index 45ef0211ad..e22c290377 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -111,7 +111,7 @@ std::map<std::string,
     NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
                                  {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
 
-void NgraphBridge::BuildNgGraph(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
   auto& op_type = op->Type();
   NG_NODE_MAP[op_type](op, ngb_node_map_);
 }
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
index 3cf62b6daa..9ed6b95109 100644
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -43,7 +43,7 @@ class NgraphBridge {
           var_node_map)
       : ngb_node_map_(var_node_map) {}
 
-  void BuildNgGraph(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
 
  private:
   std::shared_ptr<
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index 1c770a2370..3fea753f06 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -122,7 +122,7 @@ class NgraphOperator {
   // get ngraph input and define ngraph input parameters
   void GetNgInputShape(std::shared_ptr<OperatorBase> op);
   // Call ngraph bridge to map ops
-  void BuildNgNode();
+  void BuildNgNodes();
   // get the ngraph input and output var list
   void BuildNgIO();
   // build ngraph function call
@@ -301,7 +301,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
   }
 }
 
-void NgraphOperator::BuildNgNode() {
+void NgraphOperator::BuildNgNodes() {
   for (auto& var_name : var_out_) {
     if (var_node_map_->find(var_name) == var_node_map_->end()) {
       auto* var = scope_.FindVar(var_name);
@@ -319,7 +319,7 @@ void NgraphOperator::BuildNgNode() {
 
   paddle::framework::NgraphBridge ngb(var_node_map_);
   for (auto& op : fused_ops_) {
-    ngb.BuildNgGraph(op);
+    ngb.BuildNgNode(op);
   }
 }
 
@@ -396,7 +396,7 @@ void NgraphOperator::BuildNgIO() {
 }
 
 void NgraphOperator::BuildNgFunction() {
-  BuildNgNode();
+  BuildNgNodes();
   ngraph_function_ = nullptr;
   ngraph::NodeVector func_outputs;
   ngraph::op::ParameterVector func_inputs;

From 5d334ff0f158955f6fb9069f37183ccf3d4e496f Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Fri, 30 Nov 2018 09:48:45 +0800
Subject: [PATCH 67/90] Add examples to some functions. (#14645)

---
 python/paddle/fluid/layers/nn.py | 50 ++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 56366142eb..0403765121 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -7606,6 +7606,11 @@ def uniform_random_batch_size_like(input,
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+            out = layers.uniform_random_batch_size_like(input, [-1, 11])
     """
 
     helper = LayerHelper('uniform_random_batch_size_like', **locals())
@@ -7643,6 +7648,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            out = layers.gaussian_random(shape=[20, 30])
     """
 
     helper = LayerHelper('gaussian_random', **locals())
@@ -7678,6 +7687,16 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            x = layers.data(
+                name="X",
+                shape=[13, 11],
+                dtype='float32',
+                append_batch_size=False)
+
+            out = layers.sampling_id(x)
     """
 
     helper = LayerHelper('sampling_id', **locals())
@@ -7717,6 +7736,14 @@ def gaussian_random_batch_size_like(input,
 
     Returns:
         out (Variable): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+
+            out = layers.gaussian_random_batch_size_like(
+                input, shape=[-1, 11], mean=1.0, std=2.0)
     """
 
     helper = LayerHelper('gaussian_random_batch_size_like', **locals())
@@ -7749,6 +7776,12 @@ def sum(x):
 
     Returns:
         out (Variable): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+            out = layers.sum(input)
     """
 
     helper = LayerHelper('sum', **locals())
@@ -7777,6 +7810,17 @@ def slice(input, axes, starts, ends):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            starts = [1, 0, 2]
+            ends = [3, 3, 4]
+            axes = [0, 1, 2]
+
+            input = layers.data(
+                name="input", shape=[3, 4, 5, 6], dtype='float32')
+
+            out = layers.slice(input, axes=axes, starts=starts, ends=ends)
     """
 
     helper = LayerHelper('slice', **locals())
@@ -7804,6 +7848,12 @@ def shape(input):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.shape(input)
     """
 
     helper = LayerHelper('shape', **locals())

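The snippets added above assume ``layers`` is ``paddle.fluid.layers``. A
minimal sketch of wiring one of them into a runnable program (the feed data
and shapes here are illustrative, not part of the patch):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        input = layers.data(name="input", shape=[13, 11], dtype='float32')
        out = layers.sum(input)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    # layers.data prepends a batch dimension, hence the leading 1 here
    feed = {"input": np.random.random((1, 13, 11)).astype('float32')}
    result, = exe.run(main_prog, feed=feed, fetch_list=[out])
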
From 86ae32fbd81abf91646f976379a3fd6a89e448d9 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Fri, 30 Nov 2018 10:26:15 +0800
Subject: [PATCH 68/90] Stabilize decorator test

test=develop
---
 python/paddle/reader/tests/decorator_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
index b9af8348e1..a9dddbbcc8 100644
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase):
         for idx, i in enumerate(b()):
             elapsed_time = time.time() - last_time
             if i == 0:
-                time.sleep(0.3)
+                time.sleep(1)
             else:
                 # read time should be short, meaning already buffered.
-                self.assertLess(elapsed_time, 0.05)
+                self.assertLess(elapsed_time, 0.08)
             last_time = time.time()
 
 

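The widened thresholds make the intent of the test explicit:
``paddle.reader.buffered`` prefetches items on a background thread while the
consumer is busy, so only the first read should be slow. A minimal sketch of
the behaviour under test (the reader contents and delays are illustrative):

.. code-block:: python

    import time
    import paddle.reader

    def slow_reader():
        for i in range(10):
            time.sleep(0.1)  # simulate slow per-item I/O
            yield i

    # a background thread prefetches up to 10 items into a buffer
    buffered = paddle.reader.buffered(lambda: slow_reader(), size=10)

    for idx, item in enumerate(buffered()):
        if idx == 0:
            time.sleep(1)  # the buffer fills while we sleep
        # subsequent items now arrive from the buffer almost instantly
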
From 1b9753d10949784cd2522c49821feb02ee0be167 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Fri, 30 Nov 2018 13:04:45 +0800
Subject: [PATCH 69/90] Make pad2d support for variable paddings. (#14667)

* Make pad2d support for variable paddings.
test=develop

* Rename get_paddings and add inline modifier.
test=develop

* Fix comments.
---
 paddle/fluid/operators/pad2d_op.cc            | 88 ++++++++++++++++---
 paddle/fluid/operators/pad2d_op.cu            | 41 +++++++--
 python/paddle/fluid/layers/nn.py              | 21 ++---
 .../fluid/tests/unittests/test_layers.py      |  8 ++
 .../fluid/tests/unittests/test_pad2d_op.py    | 27 +++++-
 5 files changed, 156 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
index a706d05fd7..a9da21f479 100644
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
   }
 }
 
+static inline void GetPaddings(int* paddings,
+                               const framework::ExecutionContext& context) {
+  auto* paddings_t = context.Input<Tensor>("Paddings");
+  if (paddings_t) {
+    auto paddings_data = paddings_t->data<int>();
+    paddings[0] = paddings_data[0];
+    paddings[1] = paddings_data[1];
+    paddings[2] = paddings_data[2];
+    paddings[3] = paddings_data[3];
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings);
+  }
+}
+
 template <typename T>
 class Pad2dCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     T value = context.Attr<T>("pad_value");
+
     auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
     auto in_dims = x->dims();
-    auto out_dims = out->dims();
     const T* in_data = x->data<T>();
+
+    auto* out = context.Output<Tensor>("Out");
+    if (data_format == "NCHW") {
+      out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[0] + pads[1],
+                   in_dims[3] + pads[2] + pads[3]});
+    } else {
+      out->Resize({in_dims[0], in_dims[1] + pads[0] + pads[1],
+                   in_dims[2] + pads[2] + pads[3], in_dims[3]});
+    }
+    auto out_dims = out->dims();
     T* out_data = out->mutable_data<T>(context.GetPlace());
+
     const int pad_top = pads[0];
     const int pad_left = pads[2];
     const int num = in_dims[0];
@@ -376,7 +402,8 @@ template <typename T>
 class Pad2dGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
@@ -442,21 +469,35 @@ class Pad2dOp : public framework::OperatorWithKernel {
                    "Output(Out) of Pad2dOp should not be null.");
 
     auto x_dim = ctx->GetInputDim("X");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
     PADDLE_ENFORCE_EQ(x_dim.size(), 4,
-                      "Size of paddings should be equal to 4.");
-    std::vector<int64_t> out_dims(x_dim.size());
+                      "The size of input(X)'s dimension should be equal to 4.");
 
+    std::vector<int64_t> out_dims(x_dim.size());
     auto data_format = ctx->Attrs().Get<std::string>("data_format");
     out_dims[0] = x_dim[0];
-    if (data_format == "NCHW") {
+    if (ctx->HasInput("Paddings")) {
+      auto paddings_dim = ctx->GetInputDim("Paddings");
+      PADDLE_ENFORCE_EQ(
+          paddings_dim.size(), 1,
+          "Size of Input(Paddings)'s dimension should be equal to 1.");
+      PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
+                        "Shape of Input(Paddings) should be equal to [4].");
       out_dims[1] = x_dim[1];
-      out_dims[2] = x_dim[2] + paddings[0] + paddings[1];  // height
-      out_dims[3] = x_dim[3] + paddings[2] + paddings[3];  // width
-    } else {                                               // NHWC
+      out_dims[2] = x_dim[2];
       out_dims[3] = x_dim[3];
-      out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
-      out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
+    } else {
+      auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+      PADDLE_ENFORCE_EQ(paddings.size(), 4,
+                        "Size of paddings should be equal to 4.");
+      if (data_format == "NCHW") {
+        out_dims[1] = x_dim[1];
+        out_dims[2] = x_dim[2] + paddings[0] + paddings[1];  // height
+        out_dims[3] = x_dim[3] + paddings[2] + paddings[3];  // width
+      } else {                                               // NHWC
+        out_dims[3] = x_dim[3];
+        out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
+        out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
+      }
     }
 
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
@@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel {
       ctx->ShareLoD("X", /*->*/ "Out");
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "The output of pad2d op. "
               "A tensor with the same shape as X.");
+    AddInput("Paddings",
+             "A 1-D tensor to describe the padding rules."
+             "paddings=[0, 1, 2, 3] means "
+             "padding 0 row to top, 1 row to bottom, 2 columns to left "
+             "and 3 columns to right. Size of paddings must be 4.")
+        .AsDispensable();
     AddAttr<std::vector<int>>(
         "paddings",
         "(vector<int>) "
@@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
@@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto* bind = new framework::OpDesc();
     bind->SetInput("X", Input("X"));
+    bind->SetInput("Paddings", Input("Paddings"));
     bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     bind->SetAttrMap(Attrs());
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
index 9ba0ddbd84..72eca08b06 100644
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
   }
 }
 
+static inline void GetPaddings(int* paddings,
+                               const framework::ExecutionContext& context) {
+  auto* paddings_t = context.Input<Tensor>("Paddings");
+  if (paddings_t) {
+    Tensor pads;
+    framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads);
+    auto pads_data = pads.data<int>();
+    paddings[0] = pads_data[0];
+    paddings[1] = pads_data[1];
+    paddings[2] = pads_data[2];
+    paddings[3] = pads_data[3];
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings);
+  }
+}
+
 template <typename T>
 class Pad2dCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     T value = context.Attr<T>("pad_value");
+
     auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
     auto in_dims = x->dims();
-    auto out_dims = out->dims();
     const T* in_data = x->data<T>();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto* out = context.Output<Tensor>("Out");
+    auto out_dims = out->dims();
+    if (data_format == "NCHW") {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1];
+      out_dims[2] = in_dims[2] + pads[0] + pads[1];
+      out_dims[3] = in_dims[3] + pads[2] + pads[3];
+    } else {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1] + pads[0] + pads[1];
+      out_dims[2] = in_dims[2] + pads[2] + pads[3];
+      out_dims[3] = in_dims[3];
+    }
+    T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
     const int pad_top = pads[0];
     const int pad_left = pads[2];
     const int num = in_dims[0];
@@ -356,7 +386,8 @@ template <typename T>
 class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0403765121..472daff1ea 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6924,7 +6924,7 @@ def pad2d(input,
 
     Args:
         input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
-        paddings (tuple|list): The padding size. If padding is a tuple, it must
+        paddings (tuple|list|Variable): The padding size. If padding is a tuple, it must
             contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
             Default: padding = [0, 0, 0, 0].
         mode (str): Three modes: constant(default), reflect, edge. Default: constant
@@ -6949,16 +6949,17 @@ def pad2d(input,
     helper = LayerHelper('pad2d', **locals())
     dtype = helper.input_dtype(input_param_name='input')
     out = helper.create_variable_for_type_inference(dtype)
+    inputs = {'X': input}
+    attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format}
+
+    if isinstance(paddings, Variable):
+        inputs['Paddings'] = paddings
+        attrs['paddings'] = []
+    else:
+        attrs['paddings'] = paddings
+
     helper.append_op(
-        type='pad2d',
-        inputs={'X': input},
-        outputs={"Out": out},
-        attrs={
-            'paddings': paddings,
-            'mode': mode,
-            'pad_value': pad_value,
-            'data_frmat': data_format
-        })
+        type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
 
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 2004c91793..7f477031dd 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -636,13 +636,21 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             input = layers.data(
                 name="input", shape=[3, 100, 100], dtype="float32")
+            paddings = layers.fill_constant(shape=[4], dtype='int32', value=1)
             out = layers.pad2d(
                 input,
                 paddings=[1, 2, 3, 4],
                 mode='reflect',
                 data_format='NCHW',
                 name="shape")
+            out_1 = layers.pad2d(
+                input,
+                paddings=paddings,
+                mode='reflect',
+                data_format='NCHW',
+                name="shape")
             self.assertIsNotNone(out)
+            self.assertIsNotNone(out_1)
         print(str(program))
 
     def test_prelu(self):
diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
index 728b8c181a..5c4a6ca59e 100644
--- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
@@ -20,11 +20,17 @@ from op_test import OpTest
 class TestPad2dOp(OpTest):
     def setUp(self):
         self.pad_value = 0.0
+        self.variable_paddings = False
         self.initTestCase()
         self.op_type = "pad2d"
         self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
         self.attrs = {}
-        self.attrs['paddings'] = np.array(self.paddings).flatten()
+        if self.variable_paddings:
+            self.attrs['paddings'] = []
+            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
+                "int32")
+        else:
+            self.attrs['paddings'] = np.array(self.paddings).flatten()
         self.attrs['pad_value'] = self.pad_value
         self.attrs['mode'] = self.mode
         self.attrs['data_format'] = self.data_format
@@ -98,5 +104,24 @@ class TestCase5(TestPad2dOp):
         self.data_format = "NHWC"
 
 
+class TestCase6(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 4, 2)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "constant"
+        self.pad_value = 1.2
+        self.data_format = "NHWC"
+        self.variable_paddings = True
+
+
+class TestCase7(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 4)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NCHW"
+        self.variable_paddings = True
+
+
 if __name__ == '__main__':
     unittest.main()

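With this change ``paddings`` can be supplied either as a Python list (baked
into the op attribute) or as a 1-D int32 tensor of four elements that the
kernel reads at runtime, which is what the new ``TestCase6``/``TestCase7``
and the layer test exercise. A minimal usage sketch (names and shapes are
illustrative):

.. code-block:: python

    import paddle.fluid.layers as layers

    input = layers.data(name="input", shape=[3, 100, 100], dtype="float32")

    # static paddings: a plain list stored in the 'paddings' attribute
    out_static = layers.pad2d(
        input, paddings=[1, 2, 3, 4], mode='reflect', data_format='NCHW')

    # variable paddings: a 1-D int32 tensor wired to the 'Paddings' input
    paddings = layers.fill_constant(shape=[4], dtype='int32', value=1)
    out_dynamic = layers.pad2d(
        input, paddings=paddings, mode='reflect', data_format='NCHW')
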
From 78738d6c86da21fc05d8e8f62ec07a29751257a8 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Fri, 30 Nov 2018 13:51:00 +0800
Subject: [PATCH 70/90] Fix comments of ctc_greedy_decoder. (#14679)

test=develop
---
 python/paddle/fluid/layers/nn.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 472daff1ea..3c2975729c 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4250,8 +4250,15 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
+      
+        Computation:
 
-        Then:
+        step1: Apply argmax to the first input sequence, which is input.data[0:4]. Then we get:
+               [[0], [2], [1], [0]]
+        step2: Merge repeated tokens and remove the blank (0). Then we get the first output sequence:
+               [[2], [1]]
+
+        Finally:
 
         output.data = [[2],
                        [1],
@@ -4259,6 +4266,7 @@ def ctc_greedy_decoder(input, blank, name=None):
 
         output.lod = [[2, 1]]
 
+
     Args:
 
         input(Variable): (LoDTensor<float>), the probabilities of
@@ -4273,8 +4281,10 @@ def ctc_greedy_decoder(input, blank, name=None):
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: CTC greedy decode result. If all the sequences in result were
-        empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
+        Variable: CTC greedy decode result, which is a 2-D tensor with shape [Lp, 1].
+                  'Lp' is the sum of all output sequences' lengths. If all the sequences
+                  in the result were empty, the result LoDTensor will be [-1] with
+                  LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python

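The two steps documented above can be mirrored in plain NumPy; a minimal
sketch of the same computation, with probabilities chosen so the argmax
reproduces the docstring's first sequence:

.. code-block:: python

    import numpy as np

    probs = np.array([[0.6, 0.1, 0.3, 0.1],
                      [0.3, 0.2, 0.4, 0.1],
                      [0.1, 0.5, 0.3, 0.1],
                      [0.5, 0.1, 0.3, 0.1]])

    # step 1: argmax over classes at each time step -> [0, 2, 1, 0]
    tokens = probs.argmax(axis=1)

    # step 2: merge repeated tokens, then drop the blank (index 0)
    merged = [t for i, t in enumerate(tokens) if i == 0 or t != tokens[i - 1]]
    decoded = [t for t in merged if t != 0]  # -> [2, 1]
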
From 41e19eb4319695a02b01649409ebd34cbb6350d7 Mon Sep 17 00:00:00 2001
From: Wang Guibao <wang_guibao@163.com>
Date: Fri, 30 Nov 2018 15:29:24 +0800
Subject: [PATCH 71/90] AsyncExecutor (#14627)

* AsyncExecutor: C++ side

* Google naming conventions

* Rename MultiExecutor to AsyncExecutor

* pybind with async_executor

* Naming convention

* remove some flags and unused code

* add refactored file of async_executor and data_feed

* clear async executor interface and add data feed factory

* split async executor into executor_thread_worker and async_executor, refactor pybind, add datafeed and corresponding proto

* Fix async_executor interfaces: 1) Remove all protobufs; 2) Stop after each epoch

* refine async_executor_refactor.cc

* add some files about datafeed

* Revert "add some files about datafeed"

This reverts commit 8ee8133ab841196925a2812b76f18d2812a6701d.

* Interface rework

* add MultiSlotDataFeed

* Create DataFeedDesc from a .proto file, then manipulate it (add/del fields etc.) from the Python side

* update data_feed for add MultiSlotDataFeed

* update datafeed and async_executor to run bow_net demo

* fix bug where finish_set_filelist failed under multithreading

* delete finish_binding_memory_(flag), because it can not be marked under the current interface

* Fix bug

* update async_executor.py for support set_use_slots

* update async_executor.py for support set_use_slots and set set_dense_slots

* fix bug where fetching returned NaN when the number of files was less than the number of threads

* remove redundant code, and make the executor exit when an illegal queue size is set

* add batch_size check

* add MultiSlotDesc

* Revert "add MultiSlotDesc"

This reverts commit 2e72ebfad364ed6b5dcc75f38ffb2a1fdec83d8e.

* add some checkpoint in DataFeedDesc

* add CheckFile function in MultiSlotDataFeed

* update some error messages

* fix deadlock bug

* Fix fetch variable

* Merge error

* fix code style in async_executor

* use a one-lock blocking queue to replace the two-lock blocking queue because of some bugs

* update code style

* add utest for data_feed

* Fix fetch var

* update utest for data_feed for multithread

* update SetFileList info

* fix bug in utest of data_feed

* Add comments for python

* Add comments for python code

* Fix pybind.cc with new pybind11 version

* add note for DataFeedDesc's set_use_slots function

* Add save_model

* update data_feed_test for multi-type

* add comment for executor_thread_worker

* Remove unused code

* update data_feed_test for generate test data file

* removed unnecessary interfaces and add comments

* c++ style check

* update data_feed.cc

* commit for code style

* commit for code style

* commit for code style

* commit for code style

* Comment away __init__ in async_executor.py

* clang-format fix test=develop

* use PADDLE_THROW instead of exit(-1); use unique_ptr to manage scope var in data_feed_test.cc

* commit for update code style

* commit for update code style

* Add async_executor demo; Remove some methods
test=develop

* commit for update code style

* commit for update code style

* commit for update code style

* update API.spec

* AsyncExecutor
test=develop

* AsyncExecutor
test=develop

* AsyncExecutor
test=develop

* AsyncExecutor
test=develop

* Fix API.spec
test=develop

* Fix API.spec
test=develop

* Fix windows build error
test=develop

* Fix windows build error
test=develop

* Fix windows build error
test=develop

* Fix windows build error
test=develop

* Fix Windows Build
test=develop

* Fix Windows Build
test=develop

* Fix Windows Build
test=develop

* Fix code style
test=develop

* Fix code style
test=develop

* update datafeed

* Fix code style
test=develop

* update data_feed_test for test Tensor test=develop

* Fix code style
test=develop

* Fix windows build failure
test=develop

* Fix code style and windows build failure
test=develop

* Fix PYTHON3.5 build failure
test=develop

* AsyncExecutor API
test=develop
---
 paddle/fluid/API.spec                         |   7 +
 paddle/fluid/framework/CMakeLists.txt         |  17 +-
 paddle/fluid/framework/async_executor.cc      | 138 +++++++
 paddle/fluid/framework/async_executor.h       |  58 +++
 paddle/fluid/framework/data_feed.cc           | 375 ++++++++++++++++++
 paddle/fluid/framework/data_feed.h            | 269 +++++++++++++
 paddle/fluid/framework/data_feed.proto        |  30 ++
 paddle/fluid/framework/data_feed_factory.cc   |  64 +++
 paddle/fluid/framework/data_feed_factory.h    |  29 ++
 paddle/fluid/framework/data_feed_test.cc      | 337 ++++++++++++++++
 .../scope_buffered_ssa_graph_executor.cc      |   2 +-
 paddle/fluid/framework/executor.cc            |  31 +-
 paddle/fluid/framework/executor.h             |   1 -
 .../fluid/framework/executor_thread_worker.cc | 223 +++++++++++
 .../fluid/framework/executor_thread_worker.h  |  88 ++++
 paddle/fluid/framework/naive_executor.cc      |  33 +-
 paddle/fluid/framework/variable_helper.cc     |  60 +++
 paddle/fluid/framework/variable_helper.h      |  22 +
 .../distributed/request_handler_impl.cc       |   3 +-
 paddle/fluid/pybind/CMakeLists.txt            |   4 +-
 paddle/fluid/pybind/async_executor_py.cc      |  53 +++
 paddle/fluid/pybind/async_executor_py.h       |  28 ++
 paddle/fluid/pybind/pybind.cc                 |   2 +
 python/paddle/fluid/__init__.py               |  10 +-
 python/paddle/fluid/async_executor.py         | 151 +++++++
 python/paddle/fluid/data_feed_desc.py         | 152 +++++++
 python/paddle/fluid/executor.py               |   1 +
 .../paddle/fluid/tests/demo/async_executor.py | 100 +++++
 .../tests/unittests/test_async_executor.py    | 142 +++++++
 29 files changed, 2356 insertions(+), 74 deletions(-)
 create mode 100644 paddle/fluid/framework/async_executor.cc
 create mode 100644 paddle/fluid/framework/async_executor.h
 create mode 100644 paddle/fluid/framework/data_feed.cc
 create mode 100644 paddle/fluid/framework/data_feed.h
 create mode 100644 paddle/fluid/framework/data_feed.proto
 create mode 100644 paddle/fluid/framework/data_feed_factory.cc
 create mode 100644 paddle/fluid/framework/data_feed_factory.h
 create mode 100644 paddle/fluid/framework/data_feed_test.cc
 create mode 100644 paddle/fluid/framework/executor_thread_worker.cc
 create mode 100644 paddle/fluid/framework/executor_thread_worker.h
 create mode 100644 paddle/fluid/framework/variable_helper.cc
 create mode 100644 paddle/fluid/framework/variable_helper.h
 create mode 100644 paddle/fluid/pybind/async_executor_py.cc
 create mode 100644 paddle/fluid/pybind/async_executor_py.h
 create mode 100644 python/paddle/fluid/async_executor.py
 create mode 100644 python/paddle/fluid/data_feed_desc.py
 create mode 100644 python/paddle/fluid/tests/demo/async_executor.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_async_executor.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6b5ed10244..26113ee7e9 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
 paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
 paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
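
The new entries above describe a thread-parallel, file-fed training loop. A
minimal usage sketch matching these ArgSpecs (the proto file, slot names,
filelist, and fetch target are illustrative):

.. code-block:: python

    import paddle.fluid as fluid

    # describe how the raw text files map onto feed slots
    data_feed = fluid.DataFeedDesc('data_feed.proto')
    data_feed.set_batch_size(32)
    data_feed.set_use_slots(['words', 'label'])

    async_exe = fluid.AsyncExecutor(fluid.CPUPlace())
    async_exe.run(fluid.default_main_program(),
                  data_feed,
                  filelist=['train_0.txt', 'train_1.txt'],
                  thread_num=2,
                  fetch=['mean_cost'])
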
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 52946c7f11..9f5631b87c 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -34,6 +34,7 @@ add_subdirectory(ir)
 add_subdirectory(details)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
+proto_library(async_executor_proto SRCS data_feed.proto)
 
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
@@ -135,7 +136,7 @@ endif(NOT WIN32)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
-py_proto_compile(framework_py_proto SRCS framework.proto)
+py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
@@ -157,18 +158,19 @@ endif(NOT WIN32)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 
-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
   else(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
   endif(NOT WIN32)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
@@ -176,8 +178,11 @@ endif()
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
         graph build_strategy
-        fast_threaded_ssa_graph_executor)
+        fast_threaded_ssa_graph_executor variable_helper)
 
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
+
+cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
new file mode 100644
index 0000000000..afb2dd2f06
--- /dev/null
+++ b/paddle/fluid/framework/async_executor.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/async_executor.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace framework {
+AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
+    : root_scope_(scope), place_(place) {}
+
+void AsyncExecutor::CreateThreads(
+    ExecutorThreadWorker* worker, const ProgramDesc& main_program,
+    const std::shared_ptr<DataFeed>& reader,
+    const std::vector<std::string>& fetch_var_names, Scope* root_scope,
+    const int thread_index, const bool debug) {
+  worker->SetThreadId(thread_index);
+  worker->SetDebug(debug);
+  worker->SetRootScope(root_scope);
+  worker->CreateThreadResource(main_program, place_);
+  worker->SetDataFeed(reader);
+  worker->SetFetchVarNames(fetch_var_names);
+  worker->BindingDataFeedMemory();
+}
+
+void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
+                    const int thread_num, const DataFeedDesc& data_feed_desc,
+                    const std::vector<std::string>& filelist) {
+  readers.resize(thread_num);
+  for (size_t i = 0; i < readers.size(); ++i) {
+    readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
+    readers[i]->Init(data_feed_desc);  // set batch_size and queue_size here
+  }
+  readers[0]->SetFileList(filelist);
+}
+
+void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
+                                const std::string& data_feed_desc_str,
+                                const std::vector<std::string>& filelist,
+                                const int thread_num,
+                                const std::vector<std::string>& fetch_var_names,
+                                const bool debug) {
+  std::vector<std::thread> threads;
+
+  auto& block = main_program.Block(0);
+  for (auto var_name : fetch_var_names) {
+    auto var_desc = block.FindVar(var_name);
+    auto shapes = var_desc->GetShape();
+    PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
+                   "var %s: Fetched var has wrong shape, "
+                   "only variables with the last dimension size 1 supported",
+                   var_name);
+  }
+
+  DataFeedDesc data_feed_desc;
+  google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
+                                                &data_feed_desc);
+
+  int actual_thread_num = thread_num;
+  int file_cnt = filelist.size();
+  PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
+
+  if (actual_thread_num > file_cnt) {
+    VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
+            << ". Changing thread_num = " << file_cnt;
+    actual_thread_num = file_cnt;
+  }
+
+  /*
+    readerDesc: protobuf description for reader initialization
+    argument: class_name, batch_size, use_slot, queue_size, buffer_size,
+    padding_index
+
+    reader:
+    1) each thread has a reader, reader will read input data and
+    put it into input queue
+    2) each reader has a Next() interface that can fetch an instance
+    from the input queue
+   */
+  // todo: should be factory method for creating datafeed
+  std::vector<std::shared_ptr<DataFeed>> readers;
+  PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
+
+  std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
+  workers.resize(actual_thread_num);
+  for (auto& worker : workers) {
+    worker.reset(new ExecutorThreadWorker);
+  }
+
+  // prepare thread resource here
+  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
+    CreateThreads(workers[thidx].get(), main_program, readers[thidx],
+                  fetch_var_names, root_scope_, thidx, debug);
+  }
+
+  // start executing ops in multiple threads
+  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
+    threads.push_back(
+        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  root_scope_->DropKids();
+
+  return;
+}
+
+}  // end namespace framework
+}  // end namespace paddle
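
Note that the per-thread readers share one static filelist guarded by a mutex
(``DataFeed::PickOneFile`` in data_feed.cc below), so threads claim files
dynamically instead of being assigned fixed shards. A minimal sketch of that
claiming scheme (names and file counts are illustrative):

.. code-block:: python

    import threading

    filelist = ['part-0', 'part-1', 'part-2']
    file_idx = 0
    lock = threading.Lock()

    def pick_one_file():
        # mirrors DataFeed::PickOneFile: hand out the next unclaimed file
        global file_idx
        with lock:
            if file_idx == len(filelist):
                return None
            filename = filelist[file_idx]
            file_idx += 1
            return filename

    def train_files():
        while True:
            f = pick_one_file()
            if f is None:
                break
            # ... read f and push parsed instances into the blocking queue ...

    threads = [threading.Thread(target=train_files) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
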
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
new file mode 100644
index 0000000000..f4d2a79ac5
--- /dev/null
+++ b/paddle/fluid/framework/async_executor.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <typeinfo>
+#include <vector>
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+class AsyncExecutor {
+ public:
+  AsyncExecutor(Scope* scope, const platform::Place& place);
+  virtual ~AsyncExecutor() {}
+  void RunFromFile(const ProgramDesc& main_program,
+                   const std::string& data_feed_desc_str,
+                   const std::vector<std::string>& filelist,
+                   const int thread_num,
+                   const std::vector<std::string>& fetch_names,
+                   const bool debug = false);
+
+ private:
+  void CreateThreads(ExecutorThreadWorker* worker,
+                     const ProgramDesc& main_program,
+                     const std::shared_ptr<DataFeed>& reader,
+                     const std::vector<std::string>& fetch_var_names,
+                     Scope* root_scope, const int thread_index,
+                     const bool debug);
+
+ public:
+  Scope* root_scope_;
+  platform::Place place_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
new file mode 100644
index 0000000000..851c7eda89
--- /dev/null
+++ b/paddle/fluid/framework/data_feed.cc
@@ -0,0 +1,375 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<std::string> DataFeed::filelist_;
+size_t DataFeed::file_idx_;
+std::mutex DataFeed::mutex_for_pick_file_;
+bool DataFeed::finish_set_filelist_;
+
+void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
+  CheckInit();
+  for (size_t i = 0; i < use_slots_.size(); ++i) {
+    if (name == use_slots_[i]) {
+      if (use_slots_is_dense_[i]) {
+        feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
+      } else {
+        feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
+      }
+    }
+  }
+}
+
+bool DataFeed::SetFileList(const std::vector<std::string>& files) {
+  std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
+  CheckInit();
+  if (finish_set_filelist_) {
+    VLOG(3) << "info: you have set the filelist.";
+    return false;
+  }
+  PADDLE_ENFORCE(files.size(), "You have set an empty filelist.");
+  filelist_.assign(files.begin(), files.end());
+  file_idx_ = 0;
+
+  finish_set_filelist_ = true;
+  return true;
+}
+
+void DataFeed::SetBatchSize(int batch_size) {
+  PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
+  default_batch_size_ = batch_size;
+}
+
+bool DataFeed::PickOneFile(std::string* filename) {
+  std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
+  if (file_idx_ == filelist_.size()) {
+    return false;
+  }
+  *filename = filelist_[file_idx_++];
+  return true;
+}
+
+void DataFeed::CheckInit() {
+  PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
+}
+
+void DataFeed::CheckSetFileList() {
+  PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed.");
+}
+
+void DataFeed::CheckStart() {
+  PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
+}
+
+template <typename T>
+void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
+  PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
+  queue_size_ = queue_size;
+  queue_ = std::unique_ptr<paddle::operators::reader::BlockingQueue<T>>(
+      new paddle::operators::reader::BlockingQueue<T>(queue_size_));
+}
+
+template <typename T>
+bool PrivateQueueDataFeed<T>::Start() {
+  CheckSetFileList();
+  read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
+  read_thread_.detach();
+
+  finish_start_ = true;
+  return true;
+}
+
+template <typename T>
+void PrivateQueueDataFeed<T>::ReadThread() {
+  std::string filename;
+  while (PickOneFile(&filename)) {
+    file_.open(filename.c_str());  // is_text_feed
+    PADDLE_ENFORCE(file_.good(), "Failed to open file<%s>.", filename.c_str());
+    T instance;
+    while (ParseOneInstance(&instance)) {
+      queue_->Send(instance);
+    }
+    file_.close();
+  }
+  queue_->Close();
+}
+
+template <typename T>
+int PrivateQueueDataFeed<T>::Next() {
+  CheckStart();
+  int index = 0;
+  T instance;
+  T ins_vec;
+  while (index < default_batch_size_) {
+    if (!queue_->Receive(&instance)) {
+      break;
+    }
+    AddInstanceToInsVec(&ins_vec, instance, index++);
+  }
+  batch_size_ = index;
+  if (batch_size_ != 0) {
+    PutToFeedVec(ins_vec);
+  }
+  return batch_size_;
+}
+
+#ifdef _WIN32
+template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
+#endif
+
+void MultiSlotDataFeed::Init(
+    const paddle::framework::DataFeedDesc& data_feed_desc) {
+  finish_init_ = false;
+  finish_set_filelist_ = false;
+  finish_start_ = false;
+
+  PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
+                 "Multi_slot_desc has not been set.");
+  paddle::framework::MultiSlotDesc multi_slot_desc =
+      data_feed_desc.multi_slot_desc();
+  SetBatchSize(data_feed_desc.batch_size());
+  SetQueueSize(data_feed_desc.batch_size());
+  size_t all_slot_num = multi_slot_desc.slots_size();
+  all_slots_.resize(all_slot_num);
+  all_slots_type_.resize(all_slot_num);
+  use_slots_index_.resize(all_slot_num);
+  use_slots_.clear();
+  use_slots_is_dense_.clear();
+  for (size_t i = 0; i < all_slot_num; ++i) {
+    const auto& slot = multi_slot_desc.slots(i);
+    all_slots_[i] = slot.name();
+    all_slots_type_[i] = slot.type();
+    use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
+    if (slot.is_used()) {
+      use_slots_.push_back(all_slots_[i]);
+      use_slots_is_dense_.push_back(slot.is_dense());
+    }
+  }
+  feed_vec_.resize(use_slots_.size());
+  finish_init_ = true;
+}
+
+bool MultiSlotDataFeed::CheckFile(const char* filename) {
+  CheckInit();  // get info of slots
+  std::ifstream fin(filename);
+  if (!fin.good()) {
+    VLOG(1) << "error: open file<" << filename << "> fail";
+    return false;
+  }
+  std::string line;
+  int instance_count = 0;
+  std::string all_slots_alias = "";
+  for (const auto& alias : all_slots_) {
+    all_slots_alias += alias + " ";
+  }
+  std::string use_slots_alias = "";
+  for (const auto& alias : use_slots_) {
+    use_slots_alias += alias + " ";
+  }
+  VLOG(3) << "total slots num: " << all_slots_.size();
+  VLOG(3) << "total slots alias: " << all_slots_alias;
+  VLOG(3) << "used slots num: " << use_slots_.size();
+  VLOG(3) << "used slots alias: " << use_slots_alias;
+  while (getline(fin, line)) {
+    ++instance_count;
+    const char* str = line.c_str();
+    char* endptr = const_cast<char*>(str);
+    int len = line.length();
+    for (size_t i = 0; i < all_slots_.size(); ++i) {
+      long num = strtol(endptr, &endptr, 10);
+      if (num < 0) {
+        VLOG(1) << "error: the number of ids is a negative number: " << num;
+        VLOG(1) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      } else if (num == 0) {
+        VLOG(1)
+            << "error: the number of ids can not be zero, you need "
+               "padding it in data generator; or if there is something wrong"
+               " with the data, please check if the data contains unresolvable "
+               "characters.";
+        VLOG(1) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      } else if (errno == ERANGE || num > INT_MAX) {
+        VLOG(1) << "error: the number of ids greater than INT_MAX";
+        VLOG(1) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      }
+      if (all_slots_type_[i] == "float") {
+        for (int j = 0; j < num; ++j) {
+          strtof(endptr, &endptr);
+          if (errno == ERANGE) {
+            VLOG(1) << "error: the value is out of the range of "
+                       "representable values for float";
+            VLOG(1) << "please check line<" << instance_cout << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+          if (j + 1 != num && endptr - str == len) {
+            VLOG(1) << "error: there is something wrong with the number of ids.";
+            VLOG(1) << "please check line<" << instance_count << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+        }
+      } else if (all_slots_type_[i] == "uint64") {
+        for (int j = 0; j < num; ++j) {
+          strtoull(endptr, &endptr, 10);
+          if (errno == ERANGE) {
+            VLOG(1) << "error: the value is out of the range of "
+                       "representable values for uint64_t";
+            VLOG(1) << "please check line<" << instance_cout << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+          if (j + 1 != num && endptr - str == len) {
+            VLOG(1) << "error: there is something wrong with the number of ids.";
+            VLOG(1) << "please check line<" << instance_count << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+        }
+      } else {
+        VLOG(1) << "error: this type<" << all_slots_type_[i]
+                << "> is not supported";
+        return false;
+      }
+    }
+    if (endptr - str != len) {
+      VLOG(1) << "error: there is some data at the end of the line.";
+      VLOG(1) << "please check line<" << instance_cout << "> in file<"
+              << filename << ">";
+      return false;
+    }
+  }
+  VLOG(3) << "instances cout: " << instance_cout;
+  VLOG(3) << "The file format is correct";
+  return true;
+}
+
+bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
+  std::string line;
+  if (getline(file_, line)) {
+    int use_slots_num = use_slots_.size();
+    instance->resize(use_slots_num);
+    // parse line
+    const char* str = line.c_str();
+    char* endptr = const_cast<char*>(str);
+    int pos = 0;
+    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
+      int idx = use_slots_index_[i];
+      int num = strtol(&str[pos], &endptr, 10);
+      PADDLE_ENFORCE(
+          num,
+          "The number of ids can not be zero, you need padding "
+          "it in data generator; or if there is something wrong with "
+          "the data, please check if the data contains unresolvable "
+          "characters.\nplease check this error line: %s",
+          str);
+      if (idx != -1) {
+        (*instance)[idx].Init(all_slots_type_[i]);
+        if ((*instance)[idx].GetType()[0] == 'f') {  // float
+          for (int j = 0; j < num; ++j) {
+            float feasign = strtof(endptr, &endptr);
+            (*instance)[idx].AddValue(feasign);
+          }
+        } else if ((*instance)[idx].GetType()[0] == 'u') {  // uint64
+          for (int j = 0; j < num; ++j) {
+            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
+            (*instance)[idx].AddValue(feasign);
+          }
+        }
+        pos = endptr - str;
+      } else {
+        for (int j = 0; j <= num; ++j) {
+          pos = line.find_first_of(' ', pos + 1);
+        }
+      }
+    }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+void MultiSlotDataFeed::AddInstanceToInsVec(
+    std::vector<MultiSlotType>* ins_vec,
+    const std::vector<MultiSlotType>& instance, int index) {
+  if (index == 0) {
+    ins_vec->resize(instance.size());
+    for (size_t i = 0; i < instance.size(); ++i) {
+      (*ins_vec)[i].Init(instance[i].GetType());
+      (*ins_vec)[i].InitOffset();
+    }
+  }
+  for (size_t i = 0; i < instance.size(); ++i) {
+    (*ins_vec)[i].AddIns(instance[i]);
+  }
+}
+
+void MultiSlotDataFeed::PutToFeedVec(
+    const std::vector<MultiSlotType>& ins_vec) {
+  for (size_t i = 0; i < use_slots_.size(); ++i) {
+    const auto& type = ins_vec[i].GetType();
+    const auto& offset = ins_vec[i].GetOffset();
+    int total_instance = static_cast<int>(offset.back());
+    if (type[0] == 'f') {  // float
+      const auto& feasign = ins_vec[i].GetFloatData();
+      if (feed_vec_[i].IsDense()) {
+        int size_in_each_batch = total_instance / batch_size_;
+        float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
+            {batch_size_, size_in_each_batch}, platform::CPUPlace());
+        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
+      } else {
+        float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
+            {total_instance, 1}, platform::CPUPlace());
+        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
+        LoD data_lod{offset};
+        feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
+      }
+    } else if (type[0] == 'u') {  // uint64
+      // there is no uint64_t type in PaddlePaddle, so int64_t is used
+      const auto& feasign = ins_vec[i].GetUint64Data();
+      if (feed_vec_[i].IsDense()) {
+        int size_in_each_batch = total_instance / batch_size_;
+        int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
+            {batch_size_, size_in_each_batch}, platform::CPUPlace());
+        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
+      } else {
+        int64_t* tensor_ptr =
+            feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
+                {total_instance, 1}, platform::CPUPlace());
+        memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
+        LoD data_lod{offset};
+        feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
+      }
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
new file mode 100644
index 0000000000..a7f8d1d317
--- /dev/null
+++ b/paddle/fluid/framework/data_feed.h
@@ -0,0 +1,269 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+
+namespace paddle {
+namespace framework {
+
+// MixTensor wraps either a Tensor or a LoDTensor, so that dense and
+// sparse variables can be handled through a single interface.
+class MixTensor {
+ public:
+  MixTensor() {}
+  explicit MixTensor(LoDTensor* lodtensor) {
+    is_dense_ = false;
+    lodtensor_ = lodtensor;
+  }
+  explicit MixTensor(Tensor* tensor) {
+    is_dense_ = true;
+    tensor_ = tensor;
+  }
+  bool IsDense() { return is_dense_; }
+  LoDTensor* GetLoDTensor() {
+    PADDLE_ENFORCE(!is_dense_,
+                   "GetLoDTensor() called on a dense var; use GetTensor().");
+    return lodtensor_;
+  }
+  Tensor* GetTensor() {
+    PADDLE_ENFORCE(is_dense_,
+                   "GetTensor() called on a sparse var; use GetLoDTensor().");
+    return tensor_;
+  }
+
+ private:
+  bool is_dense_;
+  LoDTensor* lodtensor_;
+  Tensor* tensor_;
+};
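+
+// A minimal usage sketch of MixTensor (illustrative only): wrap an
+// existing tensor and query it through the MixTensor interface.
+//
+//   LoDTensor lod_tensor;
+//   MixTensor mix(&lod_tensor);
+//   if (!mix.IsDense()) {
+//     LoDTensor* t = mix.GetLoDTensor();  // valid for a sparse var
+//   }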
+
+// DataFeed is the abstract base class for all other DataFeeds.
+// It is used to read files and parse the data for the subsequent trainer.
+// Example:
+//   std::shared_ptr<DataFeed> reader =
+//       paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
+//   reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
+//   reader->SetFileList(filelist);
+//   const std::vector<std::string> & use_slot_alias =
+//   reader->GetUseSlotAlias();
+//   for (auto name : use_slot_alias) {  // for binding memory
+//     reader->AddFeedVar(scope->Var(name), name);
+//   }
+//   reader->Start();
+//   while (reader->Next()) {
+//      // trainer do something
+//   }
+class DataFeed {
+ public:
+  DataFeed() {}
+  virtual ~DataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
+  virtual bool CheckFile(const char* filename) {
+    PADDLE_THROW("This function(CheckFile) is not implemented.");
+  }
+  // Set the filelist for DataFeed.
+  // Note that all readers must be initialized before this function is
+  // called; otherwise Init() will reset the finish_set_filelist_ flag.
+  virtual bool SetFileList(const std::vector<std::string>& files);
+  virtual bool Start() = 0;
+  // The trainer calls Next(), and the DataFeed loads a new batch into
+  // feed_vec. The return value is the size of the current batch.
+  virtual int Next() = 0;
+  // Get all slots' aliases as defined in the proto file
+  virtual const std::vector<std::string>& GetAllSlotAlias() {
+    return all_slots_;
+  }
+  // Get used slots' aliases as defined in the proto file
+  virtual const std::vector<std::string>& GetUseSlotAlias() {
+    return use_slots_;
+  }
+  // This function is used for binding feed_vec memory
+  virtual void AddFeedVar(Variable* var, const std::string& name);
+
+ protected:
+  // The following three functions check that the calls are executed in
+  // this order:
+  //   Init() -> SetFileList() -> Start() -> Next()
+  virtual void CheckInit();
+  virtual void CheckSetFileList();
+  virtual void CheckStart();
+  virtual void SetBatchSize(
+      int batch);  // batch size will be set in Init() function
+  // Pick one file from the global filelist (thread safe).
+  virtual bool PickOneFile(std::string* filename);
+
+  static std::vector<std::string> filelist_;
+  static size_t file_idx_;
+  static std::mutex mutex_for_pick_file_;
+
+  // the aliases of used slots; their order is determined by
+  // data_feed_desc (proto object)
+  std::vector<std::string> use_slots_;
+  std::vector<bool> use_slots_is_dense_;
+
+  // the aliases of all slots; their order is determined by
+  // data_feed_desc (proto object)
+  std::vector<std::string> all_slots_;
+  std::vector<std::string> all_slots_type_;
+  std::vector<int>
+      use_slots_index_;  // -1: not used; >=0: the index of use_slots_
+
+  // The data read by DataFeed will be stored here
+  std::vector<MixTensor> feed_vec_;
+
+  // the batch size defined by user
+  int default_batch_size_;
+  // current batch size
+  int batch_size_;
+
+  bool finish_init_;
+  static bool finish_set_filelist_;
+  bool finish_start_;
+};
+
+// PrivateQueueDataFeed is the base virtual class for other DataFeeds.
+// It uses a read thread to read files and parse data into a private
+// (thread-level) queue, and takes data from this queue when the trainer
+// calls Next().
+template <typename T>
+class PrivateQueueDataFeed : public DataFeed {
+ public:
+  PrivateQueueDataFeed() {}
+  virtual ~PrivateQueueDataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
+  virtual bool Start();
+  virtual int Next();
+
+ protected:
+  // The thread function that reads files and parses them.
+  virtual void ReadThread();
+  // Set the private-queue size; it is most efficient when the queue
+  // size is close to the batch size.
+  virtual void SetQueueSize(int queue_size);
+  // The reading and parsing method called in the ReadThread.
+  virtual bool ParseOneInstance(T* instance) = 0;
+  // This function is used to put instance to vec_ins
+  virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
+                                   int index) = 0;
+  // This function is used to put ins_vec to feed_vec
+  virtual void PutToFeedVec(const T& ins_vec) = 0;
+
+  // The thread for reading files
+  std::thread read_thread_;
+  // Reading and parsing line by line with ifstream is faster than
+  // reading a whole buffer with fread and parsing it.
+  //   For a 601MB real dataset:
+  //     ifstream, line-by-line parse: 6034 ms
+  //     fread, whole-buffer parse: 7097 ms
+  std::ifstream file_;
+  size_t queue_size_;
+  // The queue that stores parsed data
+  std::unique_ptr<paddle::operators::reader::BlockingQueue<T>> queue_;
+};
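+
+// A sketch of the producer/consumer flow (summarized from the comments
+// above): Start() launches ReadThread(), which repeatedly calls
+// ParseOneInstance() and pushes parsed instances into queue_; Next() pops
+// instances from queue_, assembles a batch with AddInstanceToInsVec() and
+// publishes it via PutToFeedVec().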
+
+// This class defines the data type of an instance (ins_vec) in MultiSlotDataFeed.
+class MultiSlotType {
+ public:
+  MultiSlotType() {}
+  ~MultiSlotType() {}
+  void Init(const std::string& type) {
+    CheckType(type);
+    if (type_[0] == 'f') {
+      float_feasign_.clear();
+    } else if (type_[0] == 'u') {
+      uint64_feasign_.clear();
+    }
+    type_ = type;
+  }
+  void InitOffset() {
+    offset_.resize(1);
+    // A LoDTensor's lod starts from 0, so the size of the lod is
+    // one larger than the number of instances.
+    offset_[0] = 0;
+  }
+  const std::vector<size_t>& GetOffset() const { return offset_; }
+  void AddValue(const float v) {
+    CheckFloat();
+    float_feasign_.push_back(v);
+  }
+  void AddValue(const uint64_t v) {
+    CheckUint64();
+    uint64_feasign_.push_back(v);
+  }
+  void AddIns(const MultiSlotType& ins) {
+    if (ins.GetType()[0] == 'f') {  // float
+      CheckFloat();
+      auto& vec = ins.GetFloatData();
+      offset_.push_back(offset_.back() + vec.size());
+      float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
+    } else if (ins.GetType()[0] == 'u') {  // uint64
+      CheckUint64();
+      auto& vec = ins.GetUint64Data();
+      offset_.push_back(offset_.back() + vec.size());
+      uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
+    }
+  }
+  const std::vector<float>& GetFloatData() const { return float_feasign_; }
+  const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
+  const std::string& GetType() const { return type_; }
+
+ private:
+  void CheckType(const std::string& type) const {
+    PADDLE_ENFORCE((type == "uint64") || (type == "float"),
+                   "There is no this type<%s>.", type);
+  }
+  void CheckFloat() const {
+    PADDLE_ENFORCE(type_[0] == 'f',
+                   "Cannot add a float value to a %s slot.", type_);
+  }
+  void CheckUint64() const {
+    PADDLE_ENFORCE(type_[0] == 'u',
+                   "Cannot add a uint64 value to a %s slot.", type_);
+  }
+  std::vector<float> float_feasign_;
+  std::vector<uint64_t> uint64_feasign_;
+  std::string type_;
+  std::vector<size_t> offset_;
+};
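+
+// An illustrative note on offsets (the example values are assumed): after
+// InitOffset(), offset_ is {0}; adding two instances with 2 and 3 feasigns
+// via AddIns() yields offset_ = {0, 2, 5}, which is exactly the lod that
+// PutToFeedVec() attaches to the LoDTensor.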
+
+// This DataFeed is used to feed multi-slot type data.
+// The format of multi-slot type data:
+//   [n feasign_1 feasign_2 ... feasign_n]*
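+//   e.g. the line "3 1 2 3 2 0.5 0.6" (an illustrative instance) encodes
+//   a slot with three uint64 feasigns {1, 2, 3} followed by a slot with
+//   two float feasigns {0.5, 0.6}.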
+class MultiSlotDataFeed
+    : public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
+ public:
+  MultiSlotDataFeed() {}
+  virtual ~MultiSlotDataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc);
+  virtual bool CheckFile(const char* filename);
+
+ protected:
+  virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
+                                   const std::vector<MultiSlotType>& instance,
+                                   int index);
+  virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
+  virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto
new file mode 100644
index 0000000000..489fec08d8
--- /dev/null
+++ b/paddle/fluid/framework/data_feed.proto
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle.framework;
+
+message Slot {
+  required string name = 1;
+  required string type = 2;
+  optional bool is_dense = 3 [ default = false ];
+  optional bool is_used = 4 [ default = false ];
+}
+
+message MultiSlotDesc { repeated Slot slots = 1; }
+
+message DataFeedDesc {
+  optional string name = 1;
+  optional int32 batch_size = 2 [ default = 32 ];
+  optional MultiSlotDesc multi_slot_desc = 3;
+}
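+
+// An example DataFeedDesc in text format (an illustrative sketch mirroring
+// the unit test in data_feed_test.cc; the slot name is made up):
+//
+//   name: "MultiSlotDataFeed"
+//   batch_size: 2
+//   multi_slot_desc {
+//     slots {
+//       name: "words"
+//       type: "uint64"
+//       is_dense: false
+//       is_used: true
+//     }
+//   }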
diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc
new file mode 100644
index 0000000000..72148b9f7d
--- /dev/null
+++ b/paddle/fluid/framework/data_feed_factory.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/data_feed.h"
+
+namespace paddle {
+namespace framework {
+typedef std::shared_ptr<DataFeed> (*CreateDataFeedFunction)();
+typedef std::unordered_map<std::string, CreateDataFeedFunction> DataFeedMap;
+DataFeedMap g_data_feed_map;
+
+#define REGISTER_DATAFEED_CLASS(data_feed_class)                      \
+  namespace {                                                         \
+  std::shared_ptr<DataFeed> Creator_##data_feed_class() {             \
+    return std::shared_ptr<DataFeed>(new data_feed_class);            \
+  }                                                                   \
+  class __Registerer_##data_feed_class {                              \
+   public:                                                            \
+    __Registerer_##data_feed_class() {                                \
+      g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
+    }                                                                 \
+  };                                                                  \
+  __Registerer_##data_feed_class g_registerer_##data_feed_class;      \
+  }  // namespace
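+
+// A class registered this way can then be created by name (a sketch;
+// MyDataFeed is an illustrative name, as done for MultiSlotDataFeed below):
+//
+//   REGISTER_DATAFEED_CLASS(MyDataFeed);
+//   auto feed = DataFeedFactory::CreateDataFeed("MyDataFeed");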
+
+std::string DataFeedFactory::DataFeedTypeList() {
+  std::string data_feed_types;
+  for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end();
+       ++iter) {
+    if (iter != g_data_feed_map.begin()) {
+      data_feed_types += ", ";
+    }
+    data_feed_types += iter->first;
+  }
+  return data_feed_types;
+}
+
+std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
+    std::string data_feed_class) {
+  if (g_data_feed_map.count(data_feed_class) < 1) {
+    // The requested DataFeed class is not registered; fail fast.
+    exit(-1);
+  }
+  return g_data_feed_map[data_feed_class]();
+}
+
+REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h
new file mode 100644
index 0000000000..13678edb0b
--- /dev/null
+++ b/paddle/fluid/framework/data_feed_factory.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/data_feed.h"
+
+namespace paddle {
+namespace framework {
+class DataFeedFactory {
+ public:
+  static std::string DataFeedTypeList();
+  static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc
new file mode 100644
index 0000000000..3974f8dbad
--- /dev/null
+++ b/paddle/fluid/framework/data_feed_test.cc
@@ -0,0 +1,337 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/data_feed.h"
+#include <fcntl.h>
+#include <chrono>  // NOLINT
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <mutex>  // NOLINT
+#include <set>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+paddle::framework::DataFeedDesc load_datafeed_param_from_file(
+    const char* filename) {
+  paddle::framework::DataFeedDesc data_feed_desc;
+  int file_descriptor = open(filename, O_RDONLY);
+  PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
+  google::protobuf::io::FileInputStream fileInput(file_descriptor);
+  google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
+  close(file_descriptor);
+  return data_feed_desc;
+}
+
+const std::vector<std::string> load_filelist_from_file(const char* filename) {
+  std::vector<std::string> filelist;
+  std::ifstream fin(filename);
+  PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename);
+  std::string line;
+  while (getline(fin, line)) {
+    filelist.push_back(line);
+  }
+  fin.close();
+  return filelist;
+}
+
+void GenerateFileForTest(const char* protofile, const char* filelist) {
+  std::ofstream w_protofile(protofile);
+  w_protofile << "name: \"MultiSlotDataFeed\"\n"
+                 "batch_size: 2\n"
+                 "multi_slot_desc {\n"
+                 "    slots {\n"
+                 "        name: \"uint64_sparse_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"float_sparse_slot\"\n"
+                 "        type: \"float\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"uint64_dense_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: true\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"float_dense_slot\"\n"
+                 "        type: \"float\"\n"
+                 "        is_dense: true\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"not_used_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: false\n"
+                 "    }\n"
+                 "}";
+  w_protofile.close();
+  std::ofstream w_filelist(filelist);
+  int total_file = 4;
+  for (int i = 0; i < total_file; ++i) {
+    std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
+    w_filelist << filename;
+    if (i + 1 != total_file) {
+      w_filelist << std::endl;
+    }
+    std::ofstream w_datafile(filename.c_str());
+    w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
+                  "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
+                  "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
+    w_datafile.close();
+  }
+  w_filelist.close();
+}
+
+class MultiTypeSet {
+ public:
+  MultiTypeSet() {
+    uint64_set_.clear();
+    float_set_.clear();
+  }
+  ~MultiTypeSet() {}
+  void AddValue(uint64_t v) { uint64_set_.insert(v); }
+  void AddValue(float v) { float_set_.insert(v); }
+  const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
+  const std::set<float>& GetFloatSet() const { return float_set_; }
+
+ private:
+  std::set<uint64_t> uint64_set_;
+  std::set<float> float_set_;
+};
+
+void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
+                          const paddle::framework::DataFeedDesc& data_feed_desc,
+                          const std::vector<std::string>& filelist,
+                          const int thread_num) {
+  int used_slot_num = 0;
+  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
+      ++used_slot_num;
+    }
+  }
+  reader_elem_set->resize(used_slot_num);
+  std::vector<std::thread> threads;
+  std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
+  readers.resize(thread_num);
+  for (int i = 0; i < thread_num; ++i) {
+    readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
+        data_feed_desc.name());
+    readers[i]->Init(data_feed_desc);
+  }
+  readers[0]->SetFileList(filelist);
+  std::mutex mu;
+  for (int idx = 0; idx < thread_num; ++idx) {
+    threads.emplace_back(std::thread([&, idx] {
+      std::unique_ptr<paddle::framework::Scope> scope(
+          new paddle::framework::Scope());
+      const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
+      std::map<std::string, const paddle::framework::LoDTensor*>
+          lodtensor_targets;
+      std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
+      for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
+        const auto& slot = multi_slot_desc.slots(i);
+        if (slot.is_used()) {
+          const auto& name = slot.name();
+          readers[idx]->AddFeedVar(scope->Var(name), name);
+          if (slot.is_dense()) {
+            tensor_targets[name] =
+                &scope->FindVar(name)->Get<paddle::framework::Tensor>();
+          } else {
+            lodtensor_targets[name] =
+                &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
+          }
+        }
+      }
+      readers[idx]->Start();
+      while (readers[idx]->Next()) {
+        int index = 0;
+        for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
+          const auto& slot = multi_slot_desc.slots(k);
+          if (!slot.is_used()) {
+            continue;
+          }
+          if (slot.is_dense()) {  // dense branch
+            const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
+            if (slot.type() == "uint64") {
+              const int64_t* data = tens->data<int64_t>();
+              int batch_size = tens->dims()[0];
+              int dim = tens->dims()[1];
+              for (int i = 0; i < batch_size; ++i) {
+                for (int j = 0; j < dim; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(
+                      (uint64_t)data[i * dim + j]);
+                }
+              }
+            } else if (slot.type() == "float") {
+              const float* data = tens->data<float>();
+              int batch_size = tens->dims()[0];
+              int dim = tens->dims()[1];
+              for (int i = 0; i < batch_size; ++i) {
+                for (int j = 0; j < dim; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(data[i * dim + j]);
+                }
+              }
+            } else {
+              PADDLE_THROW("Error type in proto file.");
+            }
+          } else {  // sparse branch
+            const paddle::framework::LoDTensor* tens =
+                lodtensor_targets[slot.name()];
+            if (slot.type() == "uint64") {
+              const int64_t* data = tens->data<int64_t>();
+              for (size_t i = 0; i < tens->NumElements(); ++i) {
+                std::pair<size_t, size_t> element = tens->lod_element(0, i);
+                for (size_t j = element.first; j < element.second; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue((uint64_t)data[j]);
+                }
+              }
+            } else if (slot.type() == "float") {
+              const float* data = tens->data<float>();
+              for (size_t i = 0; i < tens->NumElements(); ++i) {
+                std::pair<size_t, size_t> element = tens->lod_element(0, i);
+                for (size_t j = element.first; j < element.second; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(data[j]);
+                }
+              }
+            } else {
+              PADDLE_THROW("Error type in proto file.");
+            }
+          }  // end sparse branch
+          ++index;
+        }  // end slots loop
+      }    // end while Next()
+    }));   // end anonymous function
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
+                          const std::vector<MultiTypeSet>& s2) {
+  EXPECT_EQ(s1.size(), s2.size());
+  for (size_t i = 0; i < s1.size(); ++i) {
+    // check for uint64
+    const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
+    const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
+    EXPECT_EQ(uint64_s1.size(), uint64_s2.size());
+    auto uint64_it1 = uint64_s1.begin();
+    auto uint64_it2 = uint64_s2.begin();
+    while (uint64_it1 != uint64_s1.end()) {
+      EXPECT_EQ(*uint64_it1, *uint64_it2);
+      ++uint64_it1;
+      ++uint64_it2;
+    }
+    // check for float
+    const std::set<float>& float_s1 = s1[i].GetFloatSet();
+    const std::set<float>& float_s2 = s2[i].GetFloatSet();
+    EXPECT_EQ(float_s1.size(), float_s2.size());
+    auto float_it1 = float_s1.begin();
+    auto float_it2 = float_s2.begin();
+    while (float_it1 != float_s1.end()) {
+      EXPECT_EQ(*float_it1, *float_it2);
+      ++float_it1;
+      ++float_it2;
+    }
+  }
+}
+
+void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
+                        const paddle::framework::DataFeedDesc& data_feed_desc,
+                        const std::vector<std::string>& filelist) {
+  int used_slot_num = 0;
+  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
+      ++used_slot_num;
+    }
+  }
+  file_elem_set->resize(used_slot_num);
+  for (const auto& file : filelist) {
+    std::ifstream fin(file.c_str());
+    PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str());
+    while (1) {
+      bool end_flag = false;
+      int index = 0;
+      for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+        int num;
+        if (fin >> num) {
+          auto slot = data_feed_desc.multi_slot_desc().slots(i);
+          auto type = slot.type();
+          if (type == "uint64") {
+            while (num--) {
+              uint64_t feasign;
+              fin >> feasign;
+              if (slot.is_used()) {
+                (*file_elem_set)[index].AddValue(feasign);
+              }
+            }
+          } else if (type == "float") {
+            while (num--) {
+              float feasign;
+              fin >> feasign;
+              if (slot.is_used()) {
+                (*file_elem_set)[index].AddValue(feasign);
+              }
+            }
+          } else {
+            PADDLE_THROW("Error type in proto file.");
+          }
+          if (slot.is_used()) {
+            ++index;
+          }
+        } else {
+          end_flag = true;
+          break;
+        }
+      }
+      if (end_flag) {
+        break;
+      }
+    }
+    fin.close();
+  }
+}
+
+TEST(DataFeed, MultiSlotUnitTest) {
+  const char* protofile = "data_feed_desc.prototxt";
+  const char* filelist_name = "filelist.txt";
+  GenerateFileForTest(protofile, filelist_name);
+  const std::vector<std::string> filelist =
+      load_filelist_from_file(filelist_name);
+  paddle::framework::DataFeedDesc data_feed_desc =
+      load_datafeed_param_from_file(protofile);
+  std::vector<MultiTypeSet> reader_elem_set;
+  std::vector<MultiTypeSet> file_elem_set;
+  GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4);
+  GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist);
+  CheckIsUnorderedSame(reader_elem_set, file_elem_set);
+}
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index e5b1eaa731..499246a985 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -16,7 +16,7 @@
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/details/reference_count_op_handle.h"
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 96132a2c18..73cec21e20 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -114,36 +115,6 @@ void Executor::Close() {
 #endif
 }
 
-void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope*>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
-  }
-}
-
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
                                int block_id) {
   auto& global_block = pdesc.Block(block_id);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 36b36d49c2..2d47903ffb 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -26,7 +26,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
 
 template <typename T>
 std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
new file mode 100644
index 0000000000..4e4001e979
--- /dev/null
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+namespace paddle {
+namespace framework {
+
+void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
+  auto& block = program.Block(0);
+  op_names_.clear();
+  for (auto& op_desc : block.AllOps()) {
+    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
+    op_names_.push_back(op_desc->Type());
+    OperatorBase* local_op_ptr = local_op.release();
+    ops_.push_back(local_op_ptr);
+  }
+}
+
+void ExecutorThreadWorker::CreateThreadResource(
+    const framework::ProgramDesc& program,
+    const paddle::platform::Place& place) {
+  CreateThreadScope(program);
+  CreateThreadOperators(program);
+  SetMainProgram(program);
+  SetPlace(place);
+}
+
+void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
+  auto& block = program.Block(0);
+
+  PADDLE_ENFORCE_NOT_NULL(
+      root_scope_, "root_scope should be set before creating thread scope");
+
+  thread_scope_ = &root_scope_->NewScope();
+  for (auto& var : block.AllVars()) {
+    if (var->Persistable()) {
+      auto* ptr = root_scope_->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+    } else {
+      auto* ptr = thread_scope_->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+    }
+  }
+}
+
+void ExecutorThreadWorker::SetDataFeed(
+    const std::shared_ptr<DataFeed>& datafeed) {
+  thread_reader_ = datafeed;
+}
+
+void ExecutorThreadWorker::BindingDataFeedMemory() {
+  const std::vector<std::string>& input_feed =
+      thread_reader_->GetUseSlotAlias();
+  for (auto name : input_feed) {
+    thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
+  }
+}
+
+void ExecutorThreadWorker::SetFetchVarNames(
+    const std::vector<std::string>& fetch_var_names) {
+  fetch_var_names_.clear();
+  fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(),
+                          fetch_var_names.end());
+}
+
+void ExecutorThreadWorker::SetDevice() {
+#if defined _WIN32 || defined __APPLE__
+  return;
+#else
+  static unsigned concurrency_cap = std::thread::hardware_concurrency();
+  int thread_id = this->thread_id_;
+
+  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
+    unsigned proc = thread_id;
+
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(proc, &mask);
+
+    if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
+      VLOG(1) << "WARNING: Failed to set thread affinity for thread "
+              << thread_id;
+    } else {
+      CPU_ZERO(&mask);
+      if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
+          (CPU_ISSET(proc, &mask) == 0)) {
+        VLOG(3) << "WARNING: Failed to set thread affinity for thread "
+                << thread_id;
+      }
+    }
+  } else {
+    VLOG(1) << "WARNING: Failed to set thread affinity for thread "
+            << thread_id;
+  }
+#endif
+}
+
+template <typename T>
+void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
+  auto inspect = lod_tensor.data<T>();
+  auto element_num = lod_tensor.numel();
+
+  std::ostringstream sstream;
+  sstream << var_name << " (element num " << element_num << "): [";
+  if (element_num > 0) {
+    sstream << inspect[0];
+    for (int j = 1; j < element_num; ++j) {
+      sstream << " " << inspect[j];
+    }
+  }
+  sstream << "]";
+
+  std::cout << sstream.str() << std::endl;
+}
+
+void print_fetch_var(Scope* scope, std::string var_name) {
+  const LoDTensor& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
+
+  if (std::type_index(tensor.type()) ==
+      std::type_index(typeid(platform::float16))) {
+    print_lod_tensor<platform::float16>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) {
+    print_lod_tensor<float>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(double))) {
+    print_lod_tensor<double>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) {
+    print_lod_tensor<int>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(int64_t))) {
+    print_lod_tensor<int64_t>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) {
+    print_lod_tensor<bool>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(uint8_t))) {
+    print_lod_tensor<uint8_t>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(int16_t))) {
+    print_lod_tensor<int16_t>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(int8_t))) {
+    print_lod_tensor<int8_t>(var_name, tensor);
+  } else {
+    VLOG(1) << "print_fetch_var: unrecognized data type:"
+            << tensor.type().name();
+  }
+}
+
+void ExecutorThreadWorker::TrainFiles() {
+  // todo: configurable
+  SetDevice();
+
+  int fetch_var_num = fetch_var_names_.size();
+  fetch_values_.clear();
+  fetch_values_.resize(fetch_var_num);
+
+  thread_reader_->Start();
+
+  int cur_batch;
+  int batch_cnt = 0;
+  while ((cur_batch = thread_reader_->Next()) > 0) {
+    // executor run here
+    for (auto& op : ops_) {
+      op->Run(*thread_scope_, place_);
+    }
+
+    ++batch_cnt;
+    thread_scope_->DropKids();
+
+    if (debug_ == false || thread_id_ != 0) {
+      continue;
+    }
+
+    for (int i = 0; i < fetch_var_num; ++i) {
+      print_fetch_var(thread_scope_, fetch_var_names_[i]);
+    }  // end for (int i = 0...)
+  }    // end while ()
+}
+
+void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
+
+void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
+  place_ = place;
+}
+
+void ExecutorThreadWorker::SetMainProgram(
+    const ProgramDesc& main_program_desc) {
+  main_program_.reset(new ProgramDesc(main_program_desc));
+}
+
+void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
+  root_scope_ = g_scope;
+}
+
+}  // end namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
new file mode 100644
index 0000000000..13ec2442c4
--- /dev/null
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+void CreateTensor(Variable* var, proto::VarType::Type var_type);
+
+class ExecutorThreadWorker {
+ public:
+  ExecutorThreadWorker()
+      : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
+  ~ExecutorThreadWorker() {}
+
+  void CreateThreadResource(const framework::ProgramDesc& program,
+                            const paddle::platform::Place& place);
+  void SetThreadId(int tid);
+  void SetDebug(const bool debug) { debug_ = debug; }
+  void SetRootScope(Scope* g_scope);
+  // set cpu device in this function
+  // cpu binding is used by default
+  void SetDevice();
+  // Since data is read into memory that the program cannot access
+  // directly, the data memory must be bound to the corresponding
+  // variables in the program. Call this after the data feed is set.
+  void BindingDataFeedMemory();
+  // set data feed declared in executor
+  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
+  // A multi-thread training function
+  void TrainFiles();
+  // set fetch variable names from python interface assigned by users
+  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
+
+ private:
+  void CreateThreadScope(const framework::ProgramDesc& program);
+  void CreateThreadOperators(const framework::ProgramDesc& program);
+  void SetMainProgram(const ProgramDesc& main_program_desc);
+  void SetPlace(const paddle::platform::Place& place);
+
+ protected:
+  std::shared_ptr<DataFeed> thread_reader_;  // shared queue, thread buffer
+  // thread index
+  int thread_id_;
+  // operator name
+  std::vector<std::string> op_names_;
+  // thread level, local operators for forward and backward
+  std::vector<OperatorBase*> ops_;
+  // main program for training
+  std::unique_ptr<framework::ProgramDesc> main_program_;
+  // execution place
+  platform::Place place_;
+  // root scope for model parameters
+  Scope* root_scope_;
+  // a thread scope; its parent is the shared global scope
+  Scope* thread_scope_;
+
+ private:
+  std::vector<std::string> fetch_var_names_;
+  std::vector<std::vector<float>> fetch_values_;
+  bool debug_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index e829563952..f1642bc0d2 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -21,42 +21,11 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace framework {
-
-// These code can be shared with Executor.
-static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope *>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
-        var_type);
-  }
-}
-
 void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
                             int block_id, bool with_feed_fetch_ops) {
   if (!scope) {
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
new file mode 100644
index 0000000000..fc4525549c
--- /dev/null
+++ b/paddle/fluid/framework/variable_helper.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/variable_helper.h"
+
+#include <vector>
+
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
+  if (var_type == proto::VarType::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == proto::VarType::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::STEP_SCOPES) {
+    var->GetMutable<std::vector<framework::Scope*>>();
+  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
+  } else if (var_type == proto::VarType::PLACE_LIST) {
+    var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarType::READER) {
+    var->GetMutable<ReaderHolder>();
+  } else if (var_type == proto::VarType::RAW) {
+    // GetMutable will be called in operator
+  } else {
+    PADDLE_THROW(
+        "Variable type %d is not in "
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
+        var_type);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h
new file mode 100644
index 0000000000..0e0c72c362
--- /dev/null
+++ b/paddle/fluid/framework/variable_helper.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/variable.h"
+namespace paddle {
+namespace framework {
+void InitializeVariable(Variable *var, proto::VarType::Type var_type);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 0258f8f238..9722f8c96e 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include <iostream>
 #include <string>
 #include <vector>
@@ -20,7 +21,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/string/printf.h"
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 25d241d976..d602613fc8 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,6 +1,6 @@
 
-set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc
new file mode 100644
index 0000000000..470e8b0508
--- /dev/null
+++ b/paddle/fluid/pybind/async_executor_py.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3)
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+#include <string>
+#include <vector>
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "paddle/fluid/framework/async_executor.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/variant.h"
+#include "paddle/fluid/pybind/async_executor_py.h"
+
+namespace py = pybind11;
+namespace pd = paddle::framework;
+
+namespace paddle {
+namespace pybind {
+using set_name_func = void (pd::DataFeedDesc::*)(const std::string&);
+void BindAsyncExecutor(py::module* m) {
+  py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
+      .def(py::init([](framework::Scope* scope, const platform::Place& place) {
+        return std::unique_ptr<framework::AsyncExecutor>(
+            new framework::AsyncExecutor(scope, place));
+      }))
+      .def("run_from_files", &framework::AsyncExecutor::RunFromFile);
+}  // end BindAsyncExecutor
+}  // end namespace pybind
+}  // end namespace paddle
diff --git a/paddle/fluid/pybind/async_executor_py.h b/paddle/fluid/pybind/async_executor_py.h
new file mode 100644
index 0000000000..a99d6e0421
--- /dev/null
+++ b/paddle/fluid/pybind/async_executor_py.h
@@ -0,0 +1,28 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+void BindAsyncExecutor(py::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 1835c06405..fc7991d297 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -42,6 +42,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/pybind/async_executor_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/protobuf.h"
@@ -932,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle.
       });
 
   BindRecordIOWriter(&m);
+  BindAsyncExecutor(&m);
 }
 }  // namespace pybind
 }  // namespace paddle
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index f7fefb3e5b..a1ffbf4262 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -20,6 +20,13 @@ from .framework import *
 # import all class inside executor into fluid module
 from . import executor
 from .executor import *
+
+from . import data_feed_desc
+from .data_feed_desc import *
+
+from . import async_executor
+from .async_executor import *
+
 from . import trainer
 from . import inferencer
 
@@ -54,7 +61,8 @@ Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + \
     trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-    parallel_executor.__all__ + lod_tensor.__all__ + [
+    parallel_executor.__all__ + lod_tensor.__all__ + \
+    data_feed_desc.__all__ + async_executor.__all__ + [
         'io',
         'initializer',
         'layers',
diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
new file mode 100644
index 0000000000..2664a7301d
--- /dev/null
+++ b/python/paddle/fluid/async_executor.py
@@ -0,0 +1,151 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import contextlib
+import six
+from .framework import Program, default_main_program, Variable
+from . import core
+from .executor import global_scope, Executor
+from paddle.fluid.proto import data_feed_pb2
+from google.protobuf import text_format
+from . import io
+from .data_feed_desc import DataFeedDesc
+
+__all__ = ['AsyncExecutor']
+
+
+class AsyncExecutor(object):
+    """
+    An asynchronous Executor in Python. By exploiting the power of multi-core
+    processors and data queueing, AsyncExecutor decouples data reading and
+    consuming, running each in multiple threads in parallel.
+
+    Instead of reading data on the Python side, AsyncExecutor accepts a list
+    of training files, which is retrieved in C++; training inputs are then
+    read, parsed, and fed to the training network within C++ code.
+
+    AsyncExecutor is in active development and the API might change in the near
+    future.
+
+    Example:
+        >>> data_feed = fluid.DataFeedDesc('data.proto')
+        >>> startup_program = fluid.default_startup_program()
+        >>> main_program = fluid.default_main_program()
+        >>> filelist = ["train_data/part-%d" % i for i in range(100)]
+        >>> thread_num = len(filelist) // 4
+        >>>
+        >>> place = fluid.CPUPlace()
+        >>> async_executor = fluid.AsyncExecutor(place)
+        >>>
+        >>> async_executor.run_startup_program(startup_program)
+        >>>
+        >>> epoch = 10
+        >>> for i in range(epoch):
+        >>>     async_executor.run(main_program,
+        >>>                        data_feed,
+        >>>                        filelist,
+        >>>                        thread_num,
+        >>>                        [acc],
+        >>>                        debug=False)
+
+    Args:
+        place(fluid.CPUPlace|None): the device on which the executor runs.
+                                    Only CPUPlace is supported.
+
+    Note:
+        For debugging a complicated network, you can also test it on the
+        regular (synchronous) executor; the two take exactly the same
+        arguments and are expected to produce the same results.
+
+    Note: Only running on CPUPlace is supported.
+    """
+
+    def __init__(self, place=None):
+        if place is None:
+            place = core.CPUPlace()
+        if not isinstance(place, core.CPUPlace):
+            raise ValueError("AsyncExecutor only supports CPU device")
+
+        p = core.Place()
+        p.set_place(place)
+
+        scope = global_scope()
+        self.executor = core.AsyncExecutor(scope, p)
+
+    def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
+        """
+        Run the program with this AsyncExecutor. The training dataset is
+        given by filelist. Users can also inspect certain variables by naming
+        them in the :code:`fetch` parameter, as in fluid.Executor. Unlike
+        fluid.Executor, however, AsyncExecutor does not return fetched
+        variables; instead, it dumps the value of each fetched variable to
+        standard output.
+
+        The dataset is consumed by multiple threads; within each thread a
+        thread-local scope is created, and all OPs are created in that scope.
+        Parameters are updated by all the OPs simultaneously.
+
+        Args:
+            program(Program): the program to run; if not provided,
+                              default_main_program will be used.
+            data_feed(DataFeedDesc): A DataFeedDesc object
+            filelist(str|list): a training data file, or a list of such files
+            thread_num(int): number of concurrent training threads. See
+                             :code:`Note` for how to set this properly
+            fetch(Variable|list): the variable or a list of variables to inspect
+            debug(bool): When set to True, fetch vars will be printed to
+                         standard output after each minibatch
+
+        Note:
+            The executor runs all operators in the program, not only the
+            operators that the fetch list depends on.
+
+        Note:
+            AsyncExecutor runs on multiple threads, each bound to a CPU core.
+            To achieve the best performance, it is suggested to set the thread
+            number equal to or slightly less than the number of CPU cores.
+        """
+        if program is None:
+            program = default_main_program()
+        program_desc = program.desc
+
+        if data_feed is None:
+            raise ValueError('data_feed should be provided')
+
+        if filelist is None:
+            raise ValueError('filelist should be provided')
+
+        if isinstance(filelist, str):
+            filelist = [filelist]
+
+        if not isinstance(thread_num, int):
+            raise TypeError('thread_num should be a positive integer')
+
+        if fetch is not None:
+            if isinstance(fetch, Variable):
+                fetch = [fetch]
+            fetch_var_names = [var.name for var in fetch]
+            for fetch_var in fetch:
+                shape = fetch_var.shape
+                if shape[-1] != 1:
+                    raise AssertionError(
+                        "%s: fetch variable has wrong shape. Only variables "
+                        "whose last dimension is of size 1 are supported." %
+                        fetch_var.name)
+
+        self.executor.run_from_files(program_desc,
+                                     data_feed.desc(), filelist, thread_num,
+                                     fetch_var_names, debug)
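A minimal sketch of the argument contract that run() enforces above, written as
plain Python so it can be exercised without Paddle (names are illustrative; the
real method forwards to core.AsyncExecutor.run_from_files):

    # Mirrors run()'s validation: filelist promotion, thread_num typing,
    # and the fetch-shape constraint (last dimension must be 1).
    def check_run_args(data_feed, filelist, thread_num, fetch_shapes):
        if data_feed is None:
            raise ValueError('data_feed should be provided')
        if filelist is None:
            raise ValueError('filelist should be provided')
        if isinstance(filelist, str):
            filelist = [filelist]  # a single path is promoted to a list
        if not isinstance(thread_num, int):
            raise TypeError('thread_num should be a positive integer')
        for shape in fetch_shapes:
            if shape[-1] != 1:
                raise AssertionError('fetch variables must end in a size-1 dim')
        return filelist

    print(check_run_args(object(), 'train_data/part-0', 4, [(128, 1)]))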
diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py
new file mode 100644
index 0000000000..d2ec74d6cf
--- /dev/null
+++ b/python/paddle/fluid/data_feed_desc.py
@@ -0,0 +1,152 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.proto import data_feed_pb2
+from google.protobuf import text_format
+
+__all__ = ['DataFeedDesc']
+
+
+class DataFeedDesc(object):
+    """
+    Data feed descriptor, describing the format of the input training data.
+    This class is currently only used by AsyncExecutor (see the comments on
+    class AsyncExecutor for a brief introduction).
+
+    DataFeedDesc shall be initialized from a valid protobuf message from disk:
+    >>> data_feed = fluid.DataFeedDesc('data.proto')
+
+    See :code:`paddle/fluid/framework/data_feed.proto` for message definition.
+    A typical message might look like:
+
+    >>> name: "MultiSlotDataFeed"
+    >>> batch_size: 2
+    >>> multi_slot_desc {
+    >>>     slots {
+    >>>         name: "words"
+    >>>         type: "uint64"
+    >>>         is_dense: false
+    >>>         is_used: true
+    >>>     }
+    >>>     slots {
+    >>>         name: "label"
+    >>>         type: "uint64"
+    >>>         is_dense: false
+    >>>         is_used: true
+    >>>     }
+    >>> }
+
+    However, users usually shouldn't care about the message format; instead,
+    they are encouraged to use :code:`Data Generator` as a tool to generate a
+    valid data description, in the process of converting their raw log files to
+    training files acceptable to AsyncExecutor.
+
+    DataFeedDesc can also be changed at runtime. Once you are familiar with
+    what each field means, you can modify it to better suit your needs. E.g.:
+    >>> data_feed.set_batch_size(128)
+    >>> data_feed.set_dense_slots('wd')  # The slot named 'wd' will be dense
+    >>> data_feed.set_use_slots('wd')    # The slot named 'wd' will be used
+
+    Finally, the content can be dumped out for debugging purposes:
+    >>> print(data_feed.desc())
+
+    Args:
+        proto_file(string): Disk file containing a data feed description.
+
+    """
+
+    def __init__(self, proto_file):
+        self.proto_desc = data_feed_pb2.DataFeedDesc()
+        with open(proto_file, 'r') as f:
+            text_format.Parse(f.read(), self.proto_desc)
+        if self.proto_desc.name == "MultiSlotDataFeed":
+            self.__name_to_index = {
+                slot.name: i
+                for i, slot in enumerate(self.proto_desc.multi_slot_desc.slots)
+            }
+
+    def set_batch_size(self, batch_size):
+        """
+        Set the batch size. It takes effect during training.
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> data_feed.set_batch_size(128)
+
+        Args:
+            batch_size: batch size
+
+        """
+        self.proto_desc.batch_size = batch_size
+
+    def set_dense_slots(self, dense_slots_name):
+        """
+        Set whether specific slots will be dense. It takes effect during
+        training. Features for a dense slot are fed into a Tensor, while those
+        for a sparse slot are fed into a LoDTensor.
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> data_feed.set_dense_slots(['words'])
+
+        Args:
+            dense_slots_name: a list of slot names which will be set dense
+
+        Note:
+            All slots are sparse by default.
+        """
+        if self.proto_desc.name != "MultiSlotDataFeed":
+            raise ValueError(
+                "Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto"
+            )
+        for name in dense_slots_name:
+            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
+                name]].is_dense = True
+
+    def set_use_slots(self, use_slots_name):
+        """
+        Set whether specific slots will be used for training. A dataset may
+        contain many features; through this function one can select which
+        ones to use for a specific model.
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> data_feed.set_use_slots(['words'])
+
+        Args:
+            use_slots_name: a list of slot names which will be used in training
+
+        Note:
+            No slot is used by default.
+        """
+        if self.proto_desc.name != "MultiSlotDataFeed":
+            raise ValueError(
+                "Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto"
+            )
+        for name in use_slots_name:
+            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
+                name]].is_used = True
+
+    def desc(self):
+        """
+        Returns a protobuf message for this DataFeedDesc
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> print(data_feed.desc())
+
+        Returns:
+            The protobuf message serialized as a text string
+        """
+        return text_format.MessageToString(self.proto_desc)
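A self-contained round trip through the API above (assuming a Paddle build that
includes this patch): write the sample message to disk, load it, adjust fields,
and dump the result.

    import paddle.fluid as fluid

    proto = '''name: "MultiSlotDataFeed"
    batch_size: 2
    multi_slot_desc {
        slots { name: "words" type: "uint64" is_dense: false is_used: true }
        slots { name: "label" type: "uint64" is_dense: false is_used: true }
    }'''

    with open('data.proto', 'w') as f:
        f.write(proto)

    data_feed = fluid.DataFeedDesc('data.proto')
    data_feed.set_batch_size(128)         # effective at training time
    data_feed.set_dense_slots(['words'])  # slots are sparse unless listed here
    data_feed.set_use_slots(['words'])    # slots are unused unless listed here
    print(data_feed.desc())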
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 288951cd7c..42c2484b28 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -278,6 +278,7 @@ class Executor(object):
         p = core.Place()
         p.set_place(place)
         self.executor = core.Executor(p)
+
         self.program_caches = dict()
         self._closed = False
 
diff --git a/python/paddle/fluid/tests/demo/async_executor.py b/python/paddle/fluid/tests/demo/async_executor.py
new file mode 100644
index 0000000000..fe8da0aab7
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/async_executor.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tarfile
+import paddle.fluid as fluid
+import paddle
+from paddle.fluid import core
+
+URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
+MD5 = '2a405a31508969b3ab823f42c0f522ca'
+
+
+def bow_net(data,
+            label,
+            dict_dim=89528,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    models/fluid/PaddleNLP/text_classification/nets.py
+    """
+    # embedding
+    emb = fluid.layers.embedding(
+        input=data, size=[dict_dim, emb_dim], is_sparse=True)
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bowh = fluid.layers.tanh(bow)
+    # fully connected layers
+    fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    # probability of each class
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    # cross entropy loss
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    # mean loss
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def train():
+    # Download data
+    with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
+        tarf.extractall(path='./')
+
+    # Initialize dataset description
+    dataset = fluid.DataFeedDesc('train_data/data.prototxt')
+    dataset.set_batch_size(128)  # See API doc for how to change other fields
+    print(dataset.desc())  # Debug purpose: see what we get
+
+    # define network
+    # input text data
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    # label data
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    avg_cost, acc, prediction = bow_net(data, label)
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
+    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
+
+    # Run startup program
+    startup_program = fluid.default_startup_program()
+    place = fluid.CPUPlace()
+    executor = fluid.Executor(place)
+    executor.run(startup_program)
+
+    async_executor = fluid.AsyncExecutor(place)
+    main_program = fluid.default_main_program()
+    epochs = 10
+    filelist = ["train_data/part-%d" % i for i in range(12)]
+    for i in range(epochs):
+        thread_num = 4
+        async_executor.run(
+            main_program,  # This can be changed during iteration
+            dataset,  # This can be changed during iteration
+            filelist,  # This can be changed during iteration
+            thread_num,  # This can be changed during iteration
+            [data, acc],  # Multiple fetch targets can be specified
+            debug=False)
+        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
+                                      [data.name, label.name], [acc], executor)
+
+
+if __name__ == "__main__":
+    train()
diff --git a/python/paddle/fluid/tests/unittests/test_async_executor.py b/python/paddle/fluid/tests/unittests/test_async_executor.py
new file mode 100644
index 0000000000..43855b95f9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_async_executor.py
@@ -0,0 +1,142 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle
+import unittest
+import tarfile
+import os
+import shutil
+
+proto_str = ('name: "MultiSlotDataFeed"\n'
+             'batch_size: 2\n'
+             'multi_slot_desc {\n'
+             '   slots {\n'
+             '       name: "words"\n'
+             '       type: "uint64"\n'
+             '       is_dense: false\n'
+             '       is_used: true\n'
+             '   }\n'
+             '   slots {\n'
+             '       name: "label"\n'
+             '       type: "uint64"\n'
+             '       is_dense: false\n'
+             '       is_used: true\n'
+             '   }\n'
+             '}')
+
+URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
+MD5 = '2a405a31508969b3ab823f42c0f522ca'
+
+
+def bow_net(data,
+            label,
+            dict_dim=89528,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    models/fluid/PaddleNLP/text_classification/nets.py
+    """
+    # embedding
+    emb = fluid.layers.embedding(
+        input=data, size=[dict_dim, emb_dim], is_sparse=True)
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bowh = fluid.layers.tanh(bow)
+    # fully connected layers
+    fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    # probability of each class
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    # cross entropy loss
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    # mean loss
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+class TestAsyncExecutor(unittest.TestCase):
+    def setUp(self):
+        with open('./data.prototxt', 'w+') as f:
+            f.write(proto_str)
+
+        with tarfile.open(paddle.dataset.common.download(URL, "imdb",
+                                                         MD5)) as tarf:
+            tarf.extractall(path='./')
+
+    def test_data_feed_desc(self):
+        data_feed = fluid.DataFeedDesc('./data.prototxt')
+        # assertEqueal(data_feed.proto_desc.batch, 2)
+        # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2)
+        self.assertEqual(" ".join(data_feed.desc().split()),
+                         " ".join(proto_str.split()))
+
+    def test_run(self):
+        # Initialize dataset description
+        data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
+        data_feed.set_batch_size(
+            128)  # See API doc for how to change other fields
+
+        # define network
+        # input text data
+        data = fluid.layers.data(
+            name="words", shape=[1], dtype="int64", lod_level=1)
+        # label data
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+        avg_cost, acc, prediction = bow_net(data, label)
+        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
+        opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
+
+        # Run startup program
+        startup_program = fluid.default_startup_program()
+        place = fluid.CPUPlace()
+        executor = fluid.Executor(place)
+        executor.run(startup_program)
+
+        main_program = fluid.default_main_program()
+        async_executor = fluid.AsyncExecutor(place)
+
+        self.assertRaises(TypeError, async_executor.run)
+        self.assertRaises(TypeError, async_executor.run, main_program)
+        self.assertRaises(TypeError, async_executor.run, main_program,
+                          data_feed)
+
+        filelist = ['train_data/part-%d' % i for i in range(10)]
+        self.assertRaises(TypeError, async_executor.run, main_program,
+                          data_feed, filelist)
+
+        thread_num = 4
+        self.assertRaises(TypeError, async_executor.run, main_program,
+                          data_feed, filelist, thread_num)
+
+        async_executor.run(main_program, data_feed, filelist, thread_num, [acc])
+        fluid.io.save_inference_model("imdb.model", [data.name, label.name],
+                                      [acc], executor)
+        statinfo = os.stat('imdb.model/__model__')
+        self.assertGreater(statinfo.st_size, 0)
+
+        os.remove('./data.prototxt')
+        shutil.rmtree('./train_data')
+        shutil.rmtree('./imdb.model')
+
+
+if __name__ == '__main__':
+    unittest.main()

From 3df05389407cd0af09e1a2fc9cfde633aec81070 Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrwgz@126.com>
Date: Fri, 30 Nov 2018 08:04:38 +0000
Subject: [PATCH 72/90] replace -100 to kIgnoreIndex

---
 .../operators/sigmoid_cross_entropy_with_logits_op.cc | 11 ++++++-----
 python/paddle/fluid/layers/nn.py                      | 11 ++++++++---
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 368988d60d..14746fa951 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 
 using framework::Tensor;
+const int kIgnoreIndex = -100;
 
 class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
  public:
@@ -100,11 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddOutput("Out",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
               " of elementwise logistic losses.");
-    AddAttr<int>(
-        "ignore_index",
-        "(int, default -100), Specifies a target value that is ignored and"
-        "does not contribute to the input gradient.")
-        .SetDefault(-100);
+    AddAttr<int>("ignore_index",
+                 "(int, default kIgnoreIndex), Specifies a target value that "
+                 "is ignored and"
+                 "does not contribute to the input gradient.")
+        .SetDefault(kIgnoreIndex);
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 38da9173cc..f746e4424d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -170,6 +170,8 @@ __all__ = [
     'bilinear_tensor_product',
 ]
 
+kIgnoreIndex = -100
+
 
 def fc(input,
        size,
@@ -1103,7 +1105,7 @@ def dropout(x,
     return out
 
 
-def cross_entropy(input, label, soft_label=False, ignore_index=-100):
+def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     """
     **Cross Entropy Layer**
 
@@ -4796,7 +4798,7 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
-                               ignore_index=-100,
+                               ignore_index=kIgnoreIndex,
                                numeric_stable_mode=False,
                                return_softmax=False):
     """
@@ -7892,7 +7894,10 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
 
 
 @templatedoc()
-def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-100, name=None):
+def sigmoid_cross_entropy_with_logits(x,
+                                      label,
+                                      ignore_index=kIgnoreIndex,
+                                      name=None):
     """
     ${comment}
 

From 126e18c1e893d6129150a6c0af16552c1ed66b54 Mon Sep 17 00:00:00 2001
From: jerrywgz <jerrwgz@126.com>
Date: Fri, 30 Nov 2018 09:12:44 +0000
Subject: [PATCH 73/90] test=develop

---
 python/paddle/fluid/layers/nn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f746e4424d..29b05d2eea 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1152,7 +1152,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
                                            labels. Default: `False`.
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
-                            if soft_label is set to False. Default: -100
+                            if soft_label is set to False. Default: kIgnoreIndex
 
     Returns:
          A 2-D tensor with shape [N x 1], the cross entropy loss.
@@ -4856,7 +4856,7 @@ def softmax_with_cross_entropy(logits,
             labels as soft labels. By default, `soft_label` is set to False.
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
-                            if soft_label is set to False. Default: -100
+                            if soft_label is set to False. Default: kIgnoreIndex
         numeric_stable_mode (bool): A flag to indicate whether to use a more
                                     numerically stable algorithm. Only valid
                                     when soft_label is False and GPU is used.
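To make the kIgnoreIndex semantics concrete, here is a numpy sketch of the
masking described in these docstrings, using the standard numerically stable
sigmoid cross entropy formulation (this illustrates the semantics only, not the
operator implementation itself):

    import numpy as np

    kIgnoreIndex = -100

    def sigmoid_cross_entropy_with_logits(x, label, ignore_index=kIgnoreIndex):
        # Stable elementwise loss; positions whose label equals ignore_index
        # contribute neither loss nor gradient, so they are masked to zero.
        loss = np.maximum(x, 0) - x * label + np.log1p(np.exp(-np.abs(x)))
        return np.where(label == ignore_index, 0.0, loss)

    x = np.array([1.5, -0.3, 2.0])
    label = np.array([1.0, kIgnoreIndex, 0.0])
    print(sigmoid_cross_entropy_with_logits(x, label))  # middle entry is 0.0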

From 679d8fc6fe0349c7438cbe45f4d4a1deba52837e Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Fri, 30 Nov 2018 18:52:50 +0800
Subject: [PATCH 74/90] rename op name test=develop

---
 paddle/fluid/operators/lstm_cudnn_op.cc    | 8 ++++----
 paddle/fluid/operators/lstm_cudnn_op.cu.cc | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/lstm_cudnn_op.cc b/paddle/fluid/operators/lstm_cudnn_op.cc
index ca60fb4b0b..c9a4a31738 100644
--- a/paddle/fluid/operators/lstm_cudnn_op.cc
+++ b/paddle/fluid/operators/lstm_cudnn_op.cc
@@ -205,14 +205,14 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
+REGISTER_OPERATOR(lstm_cudnn, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
+REGISTER_OPERATOR(lstm_cudnn_grad, ops::CudnnLSTMGradOp);
 
 REGISTER_OP_CPU_KERNEL(
-    cudnn_lstm,
+    lstm_cudnn,
     ops::CudnnLSTMKernel<paddle::platform::CPUDeviceContext, float>);
 
 REGISTER_OP_CPU_KERNEL(
-    cudnn_lstm_grad,
+    lstm_cudnn_grad,
     ops::CudnnLSTMGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/lstm_cudnn_op.cu.cc b/paddle/fluid/operators/lstm_cudnn_op.cu.cc
index 7a67bbe539..353ab17597 100644
--- a/paddle/fluid/operators/lstm_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/lstm_cudnn_op.cu.cc
@@ -487,8 +487,8 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    cudnn_lstm,
+    lstm_cudnn,
     ops::CudnnLSTMGPUKernel<paddle::platform::CUDADeviceContext, float>);
 REGISTER_OP_CUDA_KERNEL(
-    cudnn_lstm_grad,
+    lstm_cudnn_grad,
     ops::CudnnLSTMGPUGradKernel<paddle::platform::CUDADeviceContext, float>);

From af8c2cec136ce0893d4a35155f6760f05109856a Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Fri, 30 Nov 2018 20:09:34 +0800
Subject: [PATCH 75/90] fix operator.cmake test=develop

---
 .../fluid/operators/{lstm_cudnn_op.cc => cudnn_lstm_op.cc}  | 6 +++---
 .../operators/{lstm_cudnn_op.cu.cc => cudnn_lstm_op.cu.cc}  | 6 +++---
 paddle/fluid/operators/{lstm_cudnn_op.h => cudnn_lstm_op.h} | 0
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename paddle/fluid/operators/{lstm_cudnn_op.cc => cudnn_lstm_op.cc} (98%)
 rename paddle/fluid/operators/{lstm_cudnn_op.cu.cc => cudnn_lstm_op.cu.cc} (99%)
 rename paddle/fluid/operators/{lstm_cudnn_op.h => cudnn_lstm_op.h} (100%)

diff --git a/paddle/fluid/operators/lstm_cudnn_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
similarity index 98%
rename from paddle/fluid/operators/lstm_cudnn_op.cc
rename to paddle/fluid/operators/cudnn_lstm_op.cc
index c9a4a31738..86632fc9fb 100644
--- a/paddle/fluid/operators/lstm_cudnn_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/lstm_cudnn_op.h"
+#include "paddle/fluid/operators/cudnn_lstm_op.h"
 #include <string>
 
 #ifdef PADDLE_WITH_CUDA
@@ -205,12 +205,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(lstm_cudnn, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
+REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(lstm_cudnn_grad, ops::CudnnLSTMGradOp);
 
 REGISTER_OP_CPU_KERNEL(
-    lstm_cudnn,
+    cudnn_lstm,
     ops::CudnnLSTMKernel<paddle::platform::CPUDeviceContext, float>);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/lstm_cudnn_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
similarity index 99%
rename from paddle/fluid/operators/lstm_cudnn_op.cu.cc
rename to paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 353ab17597..811975a9f3 100644
--- a/paddle/fluid/operators/lstm_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/lstm_cudnn_op.h"
+#include "paddle/fluid/operators/cudnn_lstm_op.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
@@ -487,8 +487,8 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    lstm_cudnn,
+    cudnn_lstm,
     ops::CudnnLSTMGPUKernel<paddle::platform::CUDADeviceContext, float>);
 REGISTER_OP_CUDA_KERNEL(
-    lstm_cudnn_grad,
+    cudnn_lstm_grad,
     ops::CudnnLSTMGPUGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/lstm_cudnn_op.h b/paddle/fluid/operators/cudnn_lstm_op.h
similarity index 100%
rename from paddle/fluid/operators/lstm_cudnn_op.h
rename to paddle/fluid/operators/cudnn_lstm_op.h

From 3f4aca618f4ef0b87881c4a7ae62af2490e35780 Mon Sep 17 00:00:00 2001
From: chengduozh <zhaochengduo@baidu.com>
Date: Fri, 30 Nov 2018 21:03:28 +0800
Subject: [PATCH 76/90] code refine test=develop

---
 paddle/fluid/operators/cudnn_lstm_op.cc    | 26 ++++++-------
 paddle/fluid/operators/cudnn_lstm_op.cu.cc | 17 ++++----
 paddle/fluid/operators/cudnn_lstm_op.h     | 45 ----------------------
 3 files changed, 20 insertions(+), 68 deletions(-)
 delete mode 100644 paddle/fluid/operators/cudnn_lstm_op.h

diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index 86632fc9fb..e63d57be57 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -12,12 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/cudnn_lstm_op.h"
 #include <string>
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/cudnn_helper.h"
-#endif
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -201,18 +197,22 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
   }
 };
 
+template <typename T>
+class NotImpleKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(
+        "CPU is not support for this kernel now. Will be add in the future");
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(lstm_cudnn_grad, ops::CudnnLSTMGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    cudnn_lstm,
-    ops::CudnnLSTMKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
 
-REGISTER_OP_CPU_KERNEL(
-    lstm_cudnn_grad,
-    ops::CudnnLSTMGradKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
+REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel<float>);
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 811975a9f3..cad62de754 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/cudnn_lstm_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
@@ -246,7 +247,7 @@ struct CudnnRNNCache {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
@@ -343,7 +344,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
@@ -380,7 +381,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
     auto init_c_dims = init_c->dims();
     in_grad->mutable_data<T>(ctx.GetPlace());
     weight_grad->mutable_data<T>(ctx.GetPlace());
-    math::SetConstant<DeviceContext, T> zero;
+    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
     zero(dev_ctx, in_grad, static_cast<T>(0.0));
     zero(dev_ctx, weight_grad, static_cast<T>(0.0));
 
@@ -486,9 +487,5 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    cudnn_lstm,
-    ops::CudnnLSTMGPUKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    cudnn_lstm_grad,
-    ops::CudnnLSTMGPUGradKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h
deleted file mode 100644
index fc329cc239..0000000000
--- a/paddle/fluid/operators/cudnn_lstm_op.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/sequence2batch.h"
-
-namespace paddle {
-namespace operators {
-
-using LoDTensor = framework::LoDTensor;
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext, typename T>
-class CudnnLSTMKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(
-        "CPU is not support for this kernel now. Will be add in the future");
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CudnnLSTMGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle

From a6aa782ee5b0bfdb88c3b00fccbb72a6ae904195 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Fri, 30 Nov 2018 21:49:14 +0800
Subject: [PATCH 77/90] add unitest

---
 .../tests/unittests/test_lstm_cudnn_op.py     | 154 ++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py

diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
new file mode 100644
index 0000000000..2741bf167b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -0,0 +1,154 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+def lstm_naive(
+        input,
+        w, ):
+    seq_len, batch_size, hidden_size = input.shape
+
+    offset = 0
+    wi = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    wf = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    wc = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    wo = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    ri = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    rf = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    rc = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    ro = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+
+    bi_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bf_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bc_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bo_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+
+    bi_2 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bf_2 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bc_2 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bo_2 = w[offset:offset + hidden_size]
+
+    def sigmoid(x):
+        return 1.0 / (1.0 + np.exp(-x))
+
+    def tanh(x):
+        return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
+
+    output = []
+    pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
+    pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype)
+
+    for i in range(seq_len):
+        emb_1 = input[i]
+
+        input_gate = sigmoid(
+            np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2)
+        forget_gate = sigmoid(
+            np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2)
+        output_gate = sigmoid(
+            np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2)
+        c_t_temp = tanh(
+            np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2)
+        new_c = input_gate * c_t_temp + forget_gate * pre_c
+        new_h = output_gate * tanh(new_c)
+
+        pre_h = new_h
+        pre_c = new_c
+
+        output.append(new_h)
+
+    output = np.concatenate(output, -1)
+    output = output.reshape((batch_size, -1, hidden_size))
+
+    output = output.transpose((1, 0, 2))
+
+    return output
+
+
+class TestCUDNNLstmOp(OpTest):
+    def setUp(self):
+        self.op_type = "cudnn_lstm"
+        self.dtype = np.float32
+
+        num_steps = 50
+        batch_size = 20
+        hidden_size = 200
+
+        input_weight_size = (hidden_size * hidden_size) * 4
+        hidden_weight_size = (hidden_size * hidden_size) * 4
+        weight_size = input_weight_size + hidden_weight_size
+        weight_size += hidden_size * 8
+
+        input = np.random.random(
+            (num_steps, batch_size, hidden_size)).astype(self.dtype)
+        flat_w = np.random.random((weight_size)).astype(self.dtype)
+
+        output = lstm_naive(input, flat_w)
+
+        init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
+        init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
+            'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
+            'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
+        }
+        self.attrs = {
+            'max_len': num_steps,
+            'dropout_prob': 0.0,
+            'is_bidirec': False,
+            'input_size': hidden_size,
+            'hidden_size': hidden_size,
+            'num_layers': 1,
+        }
+        self.outputs = {'Out': output}
+
+    def test_grad_with_place(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, atol=1e-5)
+
+    def test_output_with_place(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(
+            place, atol=1e-5, no_check_set=['last_h', 'last_c'])
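The reference lstm_naive above slices a flat weight vector in cuDNN's order for
a single unidirectional layer: four input-projection matrices (i, f, c, o),
four recurrent matrices, then two groups of four bias vectors. Note it also
assumes input_size == hidden_size, which is why the test uses one value for
both. A sketch of the resulting flat size:

    # Flat weight size for one unidirectional LSTM layer, matching the
    # slicing order in lstm_naive (hidden_size value is illustrative).
    hidden_size = 200

    input_weights = 4 * hidden_size * hidden_size      # wi, wf, wc, wo
    recurrent_weights = 4 * hidden_size * hidden_size  # ri, rf, rc, ro
    biases = 8 * hidden_size                           # bi/bf/bc/bo, two groups

    weight_size = input_weights + recurrent_weights + biases
    print(weight_size)  # 321600 for hidden_size == 200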

From bc7db6cec944a8ab5099dfd8d0e03ebd5fa1eb60 Mon Sep 17 00:00:00 2001
From: Krzysztof Binias <krzysztof.binias@intel.com>
Date: Fri, 30 Nov 2018 15:25:44 +0100
Subject: [PATCH 78/90] Fix for accuracy problem for inplace operators when
 MKL-DNN mode is enabled

test=develop
---
 paddle/fluid/framework/operator.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2260353af7..fc5036886a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -749,7 +749,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
 
-  if (!transfered_inplace_vars.empty()) {
+  if (run_by_executor_ && !transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
     TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
   }
@@ -771,6 +771,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
   }
 }
+
 void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {

From 618f7620e2f90c52b94ee243bf1827914a04fd82 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Sat, 1 Dec 2018 12:39:36 +0800
Subject: [PATCH 79/90] add enforce for auc (#14687)

* add enforce for AUC,  test=develop
---
 paddle/fluid/operators/metrics/auc_op.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h
index fb370842d1..4ab5cfe53c 100644
--- a/paddle/fluid/operators/metrics/auc_op.h
+++ b/paddle/fluid/operators/metrics/auc_op.h
@@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel<T> {
     const auto *label_data = label->data<int64_t>();
 
     for (size_t i = 0; i < batch_size; i++) {
-      uint32_t binIdx = static_cast<uint32_t>(
-          inference_data[i * inference_width + 1] * num_thresholds);
+      auto predict_data = inference_data[i * inference_width + 1];
+      PADDLE_ENFORCE_LE(predict_data, 1,
+                        "The prediction must be less than or equal to 1.");
+      PADDLE_ENFORCE_GE(predict_data, 0,
+                        "The prediction must be greater than or equal to 0.");
+
+      uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
       if (label_data[i]) {
         (*stat_pos)[binIdx] += 1.0;
       } else {
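The two enforces guard the bucketing step above: a prediction outside [0, 1]
would compute a bin index past the end of the statistics arrays. A numpy sketch
of the same histogram update (the array sizing is an assumption of this sketch;
the kernel's storage is not shown in this hunk):

    import numpy as np

    def bucket_predictions(preds, labels, num_thresholds=200):
        # Each prediction in [0, 1] maps to one bin; positives and negatives
        # are counted separately, as in the kernel's stat_pos / stat_neg.
        if np.any((preds < 0) | (preds > 1)):
            raise ValueError('predictions must lie in [0, 1]')
        stat_pos = np.zeros(num_thresholds + 1)  # +1 so that pred == 1.0 fits
        stat_neg = np.zeros(num_thresholds + 1)
        bins = (preds * num_thresholds).astype(np.uint32)
        for b, y in zip(bins, labels):
            if y:
                stat_pos[b] += 1.0
            else:
                stat_neg[b] += 1.0
        return stat_pos, stat_neg

    pos, neg = bucket_predictions(np.array([0.1, 0.9, 0.5]), np.array([0, 1, 1]))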

From b65722d3cf0ac525eaf39fd026013f1aaf718531 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 1 Dec 2018 16:03:42 +0800
Subject: [PATCH 80/90] fix uni test; test=develop

---
 paddle/fluid/operators/cudnn_lstm_op.cu.cc    |  6 ---
 python/paddle/fluid/layers/nn.py              | 13 +++--
 .../paddle/fluid/tests/unittests/op_test.py   |  9 ++++
 .../tests/unittests/test_lstm_cudnn_op.py     | 53 ++++++++++++++-----
 4 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index cad62de754..e01070c7b8 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -279,12 +279,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
     int num_layers = ctx.Attr<int>("num_layers");
     bool is_test = ctx.Attr<bool>("is_test");
 
-    /*
-    if (is_test) {
-      TensorCopy(*x, ctx.GetPlace(), out);
-      return;
-    }*/
-
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     auto *cache_var = ctx.InputVar("Cache");
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index f9e3da68d7..dbc39afccb 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -477,12 +477,10 @@ def lstm(input,
          init_h,
          init_c,
          max_len,
-         dropout_prob,
-         input_size,
          hidden_size,
          num_layers,
+         dropout_prob=0.0,
          is_bidirec=False,
-         dtype='float32',
          is_test=False,
          name=None,
          default_initializer=None,
@@ -531,13 +529,11 @@ def lstm(input,
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len 
-        dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
-                             There is NO dropout work on rnn output of the last RNN layers
-        input_size (int): hidden size of the input tensor
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
+        dropout_prob(float|0.0): dropout probability; dropout ONLY works between RNN layers, NOT between time steps.
+                             There is NO dropout applied to the output of the last RNN layer.
         is_bidirec (bool): If it is bidirectional
-        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
         is_test (bool): If it is in test phrase
         name (str|None): A name for this layer(optional). If set None, the layer
                          will be named automatically.
@@ -577,6 +573,9 @@ def lstm(input,
 
     helper = LayerHelper('cudnn_lstm', **locals())
 
+    dtype = input.dtype
+    input_shape = list(input.shape)
+    input_size = input_shape[-1]
     weight_size = 0
     for i in range(num_layers):
         if i == 0:
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 271b9c740f..0200d74136 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -216,6 +216,15 @@ class OpTest(unittest.TestCase):
                                      self.dtype)
         outputs = append_input_output(block, op_proto, self.outputs, False,
                                       self.dtype)
+
+        if hasattr(self, "cache_name_list"):
+            for name in self.cache_name_list:
+                inputs[name] = block.create_var(
+                    name=name,
+                    persistable=True,
+                    type=core.VarDesc.VarType.RAW,
+                    stop_gradient=True)
+
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index 2741bf167b..8d313970cc 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -19,6 +19,11 @@ import numpy as np
 
 import paddle.fluid.core as core
 from op_test import OpTest
+import paddle.fluid as fluid
+
+SIGMOID_THRESHOLD_MIN = -40.0
+SIGMOID_THRESHOLD_MAX = 13.0
+EXP_MAX_INPUT = 40.0
 
 
 def lstm_naive(
@@ -70,10 +75,15 @@ def lstm_naive(
     bo_2 = w[offset:offset + hidden_size]
 
     def sigmoid(x):
-        return 1.0 / (1.0 + np.exp(-x))
+        y = np.copy(x)
+        y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
+        y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
+        return 1. / (1. + np.exp(-y))
 
     def tanh(x):
-        return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
+        y = -2. * x
+        y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
+        return (2. / (1. + np.exp(y))) - 1.
 
     output = []
     pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
@@ -103,7 +113,7 @@ def lstm_naive(
 
     output = output.transpose((1, 0, 2))
 
-    return output
+    return output, pre_h, pre_c
 
 
 class TestCUDNNLstmOp(OpTest):
@@ -120,20 +130,32 @@ class TestCUDNNLstmOp(OpTest):
         weight_size = input_weight_size + hidden_weight_size
         weight_size += hidden_size * 8
 
-        input = np.random.random(
-            (num_steps, batch_size, hidden_size)).astype(self.dtype)
-        flat_w = np.random.random((weight_size)).astype(self.dtype)
+        input = np.random.uniform(
+            low=-0.1, high=0.1, size=(num_steps, batch_size,
+                                      hidden_size)).astype(self.dtype)
+        flat_w = np.random.uniform(
+            low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype)
 
-        output = lstm_naive(input, flat_w)
+        output, last_hidden, last_cell = lstm_naive(input, flat_w)
 
         init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
         init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
+        scope = core.Scope()
+        program = fluid.Program()
+        block = program.global_block()
+
+        cache_temp = block.create_var(
+            name="Cache",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW,
+            stop_gradient=True)
         self.inputs = {
             'Input': OpTest.np_dtype_to_fluid_dtype(input),
             'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
             'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
             'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
         }
+        self.cache_name_list = ['Cache']
         self.attrs = {
             'max_len': num_steps,
             'dropout_prob': 0.0,
@@ -142,13 +164,16 @@ class TestCUDNNLstmOp(OpTest):
             'hidden_size': hidden_size,
             'num_layers': 1,
         }
-        self.outputs = {'Out': output}
-
-    def test_grad_with_place(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, atol=1e-5)
+        self.outputs = {
+            'Out': output,
+            "last_h": last_hidden,
+            'last_c': last_cell
+        }
 
     def test_output_with_place(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(
-            place, atol=1e-5, no_check_set=['last_h', 'last_c'])
+        self.check_output_with_place(place, atol=1e-5)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 25df78eaf31b4b4fe1e208f3c70a9319e9fd4830 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 1 Dec 2018 16:09:26 +0800
Subject: [PATCH 81/90] fix api spec; test=develop

---
 paddle/fluid/API.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 10ac9fe070..8f6797429c 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -194,7 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
-paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'dropout_prob', 'input_size', 'hidden_size', 'num_layers', 'is_bidirec', 'dtype', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(False, 'float32', False, None, None, False, 0))
+paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
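Under the new ArgSpec, dropout_prob, is_bidirec, and is_test become keyword
arguments, and input_size and dtype are inferred from the input variable, as
the nn.py change in the previous patch shows. A hedged call under the new
signature (shapes are illustrative, and the three return values follow the
Out / last_h / last_c outputs used in the unit test):

    import paddle.fluid as fluid

    emb_size, hidden_size, num_layers = 64, 128, 1
    max_len, batch_size = 100, 16

    data = fluid.layers.data(name='x', shape=[max_len, emb_size], dtype='float32')
    init_h = fluid.layers.fill_constant(
        [num_layers, batch_size, hidden_size], 'float32', 0.0)
    init_c = fluid.layers.fill_constant(
        [num_layers, batch_size, hidden_size], 'float32', 0.0)

    # input_size is read from data.shape[-1]; dtype from data.dtype.
    out, last_h, last_c = fluid.layers.lstm(
        data, init_h, init_c, max_len, hidden_size, num_layers,
        dropout_prob=0.0, is_bidirec=False)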

From 0b8a377d9cbd02c8c0d3824ad9b37d9a52ba8822 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 1 Dec 2018 18:22:05 +0800
Subject: [PATCH 82/90] fix cpu test; test=develop

---
 python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index 8d313970cc..e3de9f992b 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -171,9 +171,14 @@ class TestCUDNNLstmOp(OpTest):
         }
 
     def test_output_with_place(self):
+        if not self.testcuda():
+            pass
         place = core.CUDAPlace(0)
         self.check_output_with_place(place, atol=1e-5)
 
+    def testcuda(self):
+        return core.is_compiled_with_cuda()
+
 
 if __name__ == '__main__':
     unittest.main()

From 24fa1f4b8c377dc446857a59d88bf72b1c3a5a67 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 1 Dec 2018 18:39:13 +0800
Subject: [PATCH 83/90] fix test unit; test=develop

---
 python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index e3de9f992b..9c8e7beae2 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -171,10 +171,9 @@ class TestCUDNNLstmOp(OpTest):
         }
 
     def test_output_with_place(self):
-        if not self.testcuda():
-            pass
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-5)
+        if self.testcuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
 
     def testcuda(self):
         return core.is_compiled_with_cuda()
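
The corrected guard wraps the check in an if-statement; an equivalent and
arguably more idiomatic alternative (a sketch, not what this patch does)
is to let unittest report the CPU-only case as skipped:

    import unittest
    import paddle.fluid.core as core
    from op_test import OpTest

    class TestCUDNNLstmOp(OpTest):
        @unittest.skipIf(not core.is_compiled_with_cuda(),
                         "core is not compiled with CUDA")
        def test_output_with_place(self):
            place = core.CUDAPlace(0)
            self.check_output_with_place(place, atol=1e-5)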

From 61ae88b7604a9ed4edb77700efe932f79c0f49ef Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Sat, 1 Dec 2018 19:24:36 +0800
Subject: [PATCH 84/90] Revert "Fix for accuracy problem for inplace operators
 when MKL-DNN mode is enabled"

---
 paddle/fluid/framework/operator.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 50a748ba02..8bfdf38912 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -754,7 +754,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
 
-  if (run_by_executor_ && !transfered_inplace_vars.empty()) {
+  if (!transfered_inplace_vars.empty()) {
     // some inplace variables have been transferred; copy them back.
     TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
   }
@@ -776,7 +776,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
   }
 }
-
 void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {

From 1d63b06bd738e9b754a006eb23a1a2e064f9504e Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 1 Dec 2018 22:36:37 +0800
Subject: [PATCH 85/90] add grad test unit; test=develop

---
 python/paddle/fluid/tests/unittests/op_test.py    | 13 +++++++++++--
 .../fluid/tests/unittests/test_lstm_cudnn_op.py   | 15 ++++++++++++---
 python/paddle/fluid/tests/unittests/testsuite.py  |  7 ++++++-
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 0200d74136..76a707efdc 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -437,8 +437,17 @@ class OpTest(unittest.TestCase):
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
         op_outputs = self.outputs if hasattr(self, "outputs") else dict()
         op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
-                            op_attrs)
+
+        cache_list = None
+        if hasattr(self, "cache_name_list"):
+            cache_list = self.cache_name_list
+        self.op = create_op(
+            self.scope,
+            self.op_type,
+            op_inputs,
+            op_outputs,
+            op_attrs,
+            cache_list=cache_list)
 
         if no_grad_set is None:
             no_grad_set = set()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
index 9c8e7beae2..0e9e2e8429 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -121,9 +121,9 @@ class TestCUDNNLstmOp(OpTest):
         self.op_type = "cudnn_lstm"
         self.dtype = np.float32
 
-        num_steps = 50
-        batch_size = 20
-        hidden_size = 200
+        num_steps = 20
+        batch_size = 5
+        hidden_size = 20
 
         input_weight_size = (hidden_size * hidden_size) * 4
         hidden_weight_size = (hidden_size * hidden_size) * 4
@@ -175,6 +175,15 @@ class TestCUDNNLstmOp(OpTest):
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
 
+    def test_grad_with_place(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['Input', 'W', 'InitH', 'InitC']),
+                ['Out', 'last_h', 'last_c'],
+                max_relative_error=0.02)
+
     def testcuda(self):
         return core.is_compiled_with_cuda()
 
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index 34fbb1b549..dc3b2cb8bc 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -20,7 +20,7 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 
 
-def create_op(scope, op_type, inputs, outputs, attrs):
+def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None):
     kwargs = dict()
 
     op_maker = core.op_proto_and_checker_maker
@@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
                     __create_var__(in_name, sub_in_name)
             else:
                 __create_var__(in_name, in_name)
+    if cache_list is not None and isinstance(cache_list, list):
+        for name in cache_list:
+            kwargs[name] = []
+            scope.var(name)
+            kwargs[name].append(name)
 
     for out_name, out_dup in Operator.get_op_outputs(op_type):
         if out_name in outputs:
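
Taken together, the three files wire an optional cache-variable list from
a test case into operator construction. A condensed sketch of the flow
(the names mirror the diff; the helper below is an illustrative rewrite
of the new create_op branch, not the full function):

    class TestCUDNNLstmOp(OpTest):
        def setUp(self):
            # ... inputs/outputs/attrs as above ...
            self.cache_name_list = ['Cache']  # forwarded as cache_list

    def register_cache_vars(scope, kwargs, cache_list=None):
        # Each cached name gets a backing variable in the scope and is
        # handed to the operator as a single-entry input list.
        if cache_list is not None and isinstance(cache_list, list):
            for name in cache_list:
                scope.var(name)
                kwargs[name] = [name]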

From d89108766c315c2387e60db208ad842c76fa3313 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Sun, 2 Dec 2018 14:22:59 +0800
Subject: [PATCH 86/90] update CheckFile function in data_feed to ignore
 whitespace at the end of each line of data (for example, a '\t' character
 may be appended to the end of the reduce-task output when data is
 processed by Hadoop, which does not affect the correctness of the data).
 test=develop

---
 paddle/fluid/framework/data_feed.cc | 43 +++++++++++++++--------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 851c7eda89..5fb141f3c1 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -200,22 +200,22 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
     for (size_t i = 0; i < all_slots_.size(); ++i) {
       int num = strtol(endptr, &endptr, 10);
       if (num < 0) {
-        VLOG(1) << "error: the number of ids is a negative number: " << num;
-        VLOG(1) << "please check line<" << instance_cout << "> in file<"
+        VLOG(0) << "error: the number of ids is a negative number: " << num;
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
         return false;
       } else if (num == 0) {
-        VLOG(1)
+        VLOG(0)
             << "error: the number of ids can not be zero, you need "
                "padding it in data generator; or if there is something wrong"
                " with the data, please check if the data contains unresolvable "
                "characters.";
-        VLOG(1) << "please check line<" << instance_cout << "> in file<"
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
         return false;
       } else if (errno == ERANGE || num > INT_MAX) {
-        VLOG(1) << "error: the number of ids greater than INT_MAX";
-        VLOG(1) << "please check line<" << instance_cout << "> in file<"
+        VLOG(0) << "error: the number of ids greater than INT_MAX";
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
         return false;
       }
@@ -223,15 +223,15 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         for (int i = 0; i < num; ++i) {
           strtof(endptr, &endptr);
           if (errno == ERANGE) {
-            VLOG(1) << "error: the value is out of the range of "
+            VLOG(0) << "error: the value is out of the range of "
                        "representable values for float";
-            VLOG(1) << "please check line<" << instance_cout << "> in file<"
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
           if (i + 1 != num && endptr - str == len) {
-            VLOG(1) << "error: there is a wrong with the number of ids.";
-            VLOG(1) << "please check line<" << instance_cout << "> in file<"
+            VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
@@ -240,30 +240,33 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         for (int i = 0; i < num; ++i) {
           strtoull(endptr, &endptr, 10);
           if (errno == ERANGE) {
-            VLOG(1) << "error: the value is out of the range of "
+            VLOG(0) << "error: the value is out of the range of "
                        "representable values for uint64_t";
-            VLOG(1) << "please check line<" << instance_cout << "> in file<"
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
           if (i + 1 != num && endptr - str == len) {
-            VLOG(1) << "error: there is a wrong with the number of ids.";
-            VLOG(1) << "please check line<" << instance_cout << "> in file<"
+            VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
         }
       } else {
-        VLOG(1) << "error: this type<" << all_slots_type_[i]
+        VLOG(0) << "error: this type<" << all_slots_type_[i]
                 << "> is not supported";
         return false;
       }
     }
-    if (endptr - str != len) {
-      VLOG(1) << "error: there is some data at the end of the line.";
-      VLOG(1) << "please check line<" << instance_cout << "> in file<"
-              << filename << ">";
-      return false;
+    while (endptr - str != len) {
+      if (!isspace(*(endptr++))) {
+        VLOG(0)
+            << "error: there is some extra characters at the end of the line.";
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      }
     }
   }
   VLOG(3) << "instances cout: " << instance_cout;

From d62a3dd72d847d7d37031ae6560caca6a020c15c Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Sun, 2 Dec 2018 21:28:33 +0800
Subject: [PATCH 87/90] add the comment for CheckFile function. test=develop

---
 paddle/fluid/framework/data_feed.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 5fb141f3c1..ae52b5dfca 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -259,6 +259,13 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         return false;
       }
     }
+    // A '\t' character may be appended to the end of the output of the
+    // reduce task when data is processed by Hadoop (when the output of
+    // the Hadoop reduce task has only one field, a '\t' is added at the
+    // end of the line by default), which does not affect the
+    // correctness of the data. Therefore, the data should only be
+    // judged abnormal when the end of a line of data contains
+    // characters that are not whitespace.
     while (endptr - str != len) {
       if (!isspace(*(endptr++))) {
         VLOG(0)

From 08233beed7804f4a5e6ef17d84f919439f95a933 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Sun, 2 Dec 2018 21:38:22 +0800
Subject: [PATCH 88/90] add the comment for CheckFile function. test=develop

---
 paddle/fluid/framework/data_feed.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index ae52b5dfca..291d8ffc3c 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -262,10 +262,11 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
     // A '\t' character may be appended to the end of the output of the
     // reduce task when data is processed by Hadoop (when the output of
     // the Hadoop reduce task has only one field, a '\t' is added at the
-    // end of the line by default), which does not affect the
-    // correctness of the data. Therefore, the data should only be
-    // judged abnormal when the end of a line of data contains
-    // characters that are not whitespace.
+    // end of the line by default, and you can use this option to avoid
+    // it: `-D mapred.textoutputformat.ignoreseparator=true`), which
+    // does not affect the correctness of the data. Therefore, the data
+    // should only be judged abnormal when the end of a line of data
+    // contains characters that are not whitespace.
     while (endptr - str != len) {
       if (!isspace(*(endptr++))) {
         VLOG(0)

From c7382df80f2e320adbd0f76c3d0daa5fb4958868 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Mon, 3 Dec 2018 10:59:36 +0800
Subject: [PATCH 89/90] Print assert failure id in lookup_table_op (#14698)

---
 paddle/fluid/operators/lookup_table_op.cu | 10 +++++-----
 paddle/fluid/platform/assert.h            | 10 ++++++++++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 36156a1f61..6a0d6bad51 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids,
 
   while (idy < K) {
     int64_t id = ids[idy];
-    PADDLE_ASSERT(id >= 0);
-    PADDLE_ASSERT(id < N);
+    PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id);
+    PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id);
     T *out = output + idy * D;
     const T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
@@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
   int idy = blockIdx.x + threadIdx.y * GridDimX;
 
   while (idy < K) {
-    int id = ids[idy];
-    PADDLE_ASSERT(id >= 0);
-    PADDLE_ASSERT(id < N);
+    int64_t id = ids[idy];
+    PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id);
+    PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id);
     const T *out = output + idy * D;
     T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 2ce9b31bb8..2e8fa7c1b8 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -36,6 +36,15 @@ limitations under the License. */
       asm("trap;");                                                     \
     }                                                                   \
   } while (0)
+
+#define PADDLE_ASSERT_MSG_CODE(e, m, c)                                    \
+  do {                                                                     \
+    if (!(e)) {                                                            \
+      printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \
+             TOSTRING(e), m, c);                                           \
+      asm("trap;");                                                        \
+    }                                                                      \
+  } while (0)
 #else
 #include <assert.h>
 // For cuda, the assertions can affect performance and it is therefore
@@ -43,4 +52,5 @@ limitations under the License. */
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
 #define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
+#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1))
 #endif
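
For intuition, a Python analogue of the new assert-with-value pattern (a
hypothetical helper, not part of Paddle; the real macro also prints
file/line and traps on the device):

    def assert_msg_code(cond, msg, code):
        # On failure, surface both the message and the offending value
        # instead of a bare assertion, as PADDLE_ASSERT_MSG_CODE does.
        if not cond:
            raise AssertionError("%s %d" % (msg, code))

    try:
        assert_msg_code(-7 >= 0, "received id:", -7)
    except AssertionError as e:
        print(e)  # -> received id: -7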

From 8d6984eb9b901e99e6204abe74d10edd63bee935 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 3 Dec 2018 13:40:53 +0800
Subject: [PATCH 90/90] change OpHasAttr to RuntimeHasAttr, add some comments

test=develop
---
 paddle/fluid/framework/ir/is_test_pass.cc          |  2 +-
 paddle/fluid/framework/ir/is_test_pass_tester.cc   |  4 ++--
 paddle/fluid/framework/ir/mkldnn_placement_pass.cc |  2 +-
 paddle/fluid/framework/ir/node.cc                  |  2 +-
 paddle/fluid/framework/ir/node.h                   | 12 +++++++++++-
 5 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index a61bd5f291..6d8f020918 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
       auto* op = n->Op();
-      if (n->OpHasAttr("is_test")) {
+      if (n->RuntimeHasAttr("is_test")) {
         op->SetAttr("is_test", true);
       } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                  end(op_list)) {
diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc
index a5fb0abb3c..d9a68c7f1d 100644
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
       auto* op = node->Op();
       auto op_name = boost::get<std::string>(op->GetAttr("name"));
       if (op_name == "conv3") {
-        ASSERT_FALSE(node->OpHasAttr("is_test"));
+        ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
       } else {
-        ASSERT_TRUE(node->OpHasAttr("is_test"));
+        ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
         EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
       }
     }
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
index 366057b01e..1cf1315d3d 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   VLOG(3) << "Aplies MKL-DNN placement strategy.";
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->OpHasAttr("use_mkldnn")) {
+    if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
       n->Op()->SetAttr("use_mkldnn", true);
     }
   }
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 4c4da10b04..7a88cb2b68 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -30,7 +30,7 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
   return std::unique_ptr<Node>(new Node(name, type));
 }
 
-bool Node::OpHasAttr(const std::string &name) const {
+bool Node::RuntimeHasAttr(const std::string &name) const {
   if (Op()->HasAttr(name)) {
     return true;
   } else {
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index ac08006a49..1044a96430 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -108,7 +108,17 @@ class Node {
            Name().find(ir::Node::kControlDepVarName) != std::string::npos;
   }
 
-  bool OpHasAttr(const std::string& name) const;
+  // RuntimeHasAttr is different from HasAttr.
+  // 1. Op()->HasAttr() checks whether the stored program_desc_ has the
+  // attr; thus, if the stored program_desc_ is old and lacks the attr, a
+  // newer library that has already added the attr will fail on this
+  // function. Details:
+  // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
+  // 2. Op()->RuntimeHasAttr checks the attr at runtime, which avoids the
+  // above problem.
+  // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
+  // RuntimeHasAttr.
+  bool RuntimeHasAttr(const std::string& name) const;
 
   std::vector<Node*> inputs;
   std::vector<Node*> outputs;