From a7f94ec7944ffc9332f9ce0ccfcadb1b7bff6f82 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Thu, 25 Oct 2018 18:54:26 +0800
Subject: [PATCH 01/25] add similarity_focus op

---
 paddle/fluid/operators/similarity_focus_op.cc |  83 +++++++++
 paddle/fluid/operators/similarity_focus_op.h  | 168 ++++++++++++++++++
 python/paddle/fluid/layers/nn.py              |  56 ++++++
 .../unittests/test_similarity_focus_op.py     | 168 ++++++++++++++++++
 4 files changed, 475 insertions(+)
 create mode 100644 paddle/fluid/operators/similarity_focus_op.cc
 create mode 100644 paddle/fluid/operators/similarity_focus_op.h
 create mode 100755 python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
new file mode 100644
index 0000000000..0750fc737a
--- /dev/null
+++ b/paddle/fluid/operators/similarity_focus_op.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/similarity_focus_op.h"
+
+namespace paddle {
+namespace operators {
+class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 4-D tensor with shape,"
+             " [BatchSize, X, Y, Z]");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), the similarity focus mask"
+              " with the same shape of input X.");
+    AddAttr<int>("axis",
+                 "(int32), indicating the dimension to be select. It can"
+                 " only be 1, 2, or 3.");
+    AddAttr<std::vector<int>>("indexes",
+                              "(std::vector<int32>), indicating the indexes"
+                              " of the selected dimension.");
+    AddComment(R"DOC(
+SimilarityFocus Operator.
+
+Generate a similarity focus mask with the same shape of input using the following method:
+1. Extract the 3-D matrix(here the first dimension is BatchSize) corresponding
+   to the axis according to the indexes. For example, if axis=1 and indexes=[a],
+   it will get the matrix T=X[:, a, :, :]. In this casr, if the shape of input X
+   is (BatchSize, A, B, C), the shape of matrix T is (BatchSize, B, C).
+2. For each index, find the largest numbers in the matrix T, so that the same
+   row and same column has at most one number(obviously there will be min(B, C)
+   numbers), and mark the corresponding position of the 3-D similarity focus mask
+   as 1, otherwise as 0. Do elementwise-or for each index.
+3. Broadcast the 3-D similarity focus mask to the same shape of input X.
+
+Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
+)DOC");
+  }
+};
+
+class SimilarityFocusOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires X and Out to exist and X to be rank 4; Out gets X's dims and LoD.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 4, "Input(X)'s rank should be 4.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());  // only a CPU kernel is registered for this op
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(similarity_focus, ops::SimilarityFocusOp,
+                  ops::SimilarityFocusOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(similarity_focus, ops::SimilarityFocusKernel<float>,
+                       ops::SimilarityFocusKernel<double>);
diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h
new file mode 100644
index 0000000000..bf3fed2aaf
--- /dev/null
+++ b/paddle/fluid/operators/similarity_focus_op.h
@@ -0,0 +1,168 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename T>
+class SimilarityFocusKernel : public framework::OpKernel<T> {
+ public:
+  // Fills Out with the similarity-focus mask of X. For each batch item and each
+  // selected index along axis, the largest entries of that 2-D slice are picked
+  // greedily so that no two share a row or column; every pick writes 1 along the
+  // whole axis dimension. Calls PADDLE_THROW on an invalid axis or indexes attr.
+  void Compute(const framework::ExecutionContext& context) const override {
+    Tensor* out = context.Output<Tensor>("Out");
+    const Tensor* x = context.Input<Tensor>("X");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int axis = context.Attr<int>("axis");
+    std::vector<int> indexes = context.Attr<std::vector<int>>("indexes");
+
+    // Check axis before it indexes dim[], which only has entries 1..3 filled in.
+    if (axis < 1 || axis > 3) {
+      PADDLE_THROW("Axis must be 1 or 2 or 3");
+    }
+    if (indexes.empty()) {
+      PADDLE_THROW("Indexes' size can not be 0.");
+    }
+
+    int64_t batch_size = x->dims()[0];
+    int64_t dim[4];
+    for (int i = 1; i <= 3; ++i) {
+      dim[i] = x->dims()[i];
+    }
+    for (auto index : indexes) {
+      // Valid positions are [0, dim[axis]); index == dim[axis] is out of range.
+      if (index < 0 || index >= dim[axis]) {
+        PADDLE_THROW("Index exceeds tensor shape limit.");
+      }
+    }
+
+    int64_t array_size = 1;
+    for (int i = 1; i <= 3; ++i) {
+      if (i != axis) {
+        array_size *= dim[i];
+      }
+    }
+    std::vector<std::pair<T, int64_t>> array(array_size);
+
+    // Descending by value. Used with std::stable_sort so that equal values keep
+    // ascending flat-position order and the mask is deterministic on ties.
+    bool (*cmp)(std::pair<T, int64_t>, std::pair<T, int64_t>) = [](
+        std::pair<T, int64_t> x, std::pair<T, int64_t> y) {
+      return x.first > y.first;
+    };
+    int64_t (*compute_index)(int64_t*, int, int, int, int) = [](
+        int64_t* dim, int d1, int d2, int d3, int d4) {
+      return d1 * dim[1] * dim[2] * dim[3] + d2 * dim[2] * dim[3] +
+             d3 * dim[3] + d4;
+    };
+
+    memset(out_data, 0, sizeof(T) * batch_size * dim[1] * dim[2] * dim[3]);
+    for (int i = 0; i < batch_size; ++i) {
+      for (auto index : indexes) {
+        if (axis == 1) {
+          for (int j = 0; j < dim[2]; ++j) {
+            for (int k = 0; k < dim[3]; ++k) {
+              array[j * dim[3] + k] = std::make_pair(
+                  x_data[compute_index(dim, i, index, j, k)], j * dim[3] + k);
+            }
+          }
+
+          std::stable_sort(array.begin(), array.end(), cmp);
+          int tag_num = 0;
+          std::vector<bool> tag2(dim[2]), tag3(dim[3]);
+          for (const auto& entry : array) {
+            int idx2 = entry.second / dim[3];
+            int idx3 = entry.second % dim[3];
+            if (tag2[idx2] || tag3[idx3]) continue;  // row/column already used
+            tag_num++;
+            tag2[idx2] = true;
+            tag3[idx3] = true;
+            for (int j = 0; j < dim[1]; ++j) {
+              out_data[compute_index(dim, i, j, idx2, idx3)] = 1;
+            }
+            if (tag_num == std::min(dim[2], dim[3])) {
+              break;
+            }
+          }
+        } else if (axis == 2) {
+          for (int j = 0; j < dim[1]; ++j) {
+            for (int k = 0; k < dim[3]; ++k) {
+              array[j * dim[3] + k] = std::make_pair(
+                  x_data[compute_index(dim, i, j, index, k)], j * dim[3] + k);
+            }
+          }
+
+          std::stable_sort(array.begin(), array.end(), cmp);
+          int tag_num = 0;
+          std::vector<bool> tag1(dim[1]), tag3(dim[3]);
+          for (const auto& entry : array) {
+            int idx1 = entry.second / dim[3];
+            int idx3 = entry.second % dim[3];
+            if (tag1[idx1] || tag3[idx3]) continue;  // row/column already used
+            tag_num++;
+            tag1[idx1] = true;
+            tag3[idx3] = true;
+            for (int j = 0; j < dim[2]; ++j) {
+              out_data[compute_index(dim, i, idx1, j, idx3)] = 1;
+            }
+            if (tag_num == std::min(dim[1], dim[3])) {
+              break;
+            }
+          }
+        } else if (axis == 3) {
+          for (int j = 0; j < dim[1]; ++j) {
+            for (int k = 0; k < dim[2]; ++k) {
+              array[j * dim[2] + k] = std::make_pair(
+                  x_data[compute_index(dim, i, j, k, index)], j * dim[2] + k);
+            }
+          }
+
+          std::stable_sort(array.begin(), array.end(), cmp);
+          int tag_num = 0;
+          std::vector<bool> tag1(dim[1]), tag2(dim[2]);
+          for (const auto& entry : array) {
+            int idx1 = entry.second / dim[2];
+            int idx2 = entry.second % dim[2];
+            if (tag1[idx1] || tag2[idx2]) continue;  // row/column already used
+            tag_num++;
+            tag1[idx1] = true;
+            tag2[idx2] = true;
+            for (int j = 0; j < dim[3]; ++j) {
+              out_data[compute_index(dim, i, idx1, idx2, j)] = 1;
+            }
+            if (tag_num == std::min(dim[1], dim[2])) {
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index cca618b9ad..463200fb72 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -155,6 +155,7 @@ __all__ = [
     'sigmoid_cross_entropy_with_logits',
     'maxout',
     'affine_channel',
+    'similarity_focus',
 ]
 
 
@@ -7494,3 +7495,58 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
         attrs={"data_layout": data_layout},
         outputs={"Out": out})
     return out
+
+
+def similarity_focus(input, axis, indexes, name=None):
+    """
+    **SimilarityFocus Operator**
+
+    Generate a similarity focus mask with the same shape of input using the following method:
+    1. Extract the 3-D matrix(here the first dimension is BatchSize) corresponding
+       to the axis according to the indexes. For example, if axis=1 and indexes=[a],
+       it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
+       is (BatchSize, A, B, C), the shape of matrix T is (BatchSize, B, C).
+    2. For each index, find the largest numbers in the matrix T, so that the same
+       row and same column has at most one number(obviously there will be min(B, C)
+       numbers), and mark the corresponding position of the 3-D similarity focus mask
+       as 1, otherwise as 0. Do elementwise-or for each index.
+    3. Broadcast the 3-D similarity focus mask to the same shape of input X.
+
+    Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
+
+    Args:
+        input(Variable): A 4-D tensor (default float), shape [BatchSize, A, B, C].
+        axis(int): Indicating the dimension to be selected. It can only be 1, 2, or 3.
+        indexes(list): Indicating the indexes of the selected dimension.
+        name(str|None): Forwarded to LayerHelper together with the other arguments.
+
+    Returns:
+        Variable: A tensor variable with the same shape and type as the input.
+
+    Raises:
+        TypeError: If axis is not an int or indexes is not a list.
+        ValueError: If axis is not 1, 2 or 3, or if indexes is empty.
+
+    Examples:
+        .. code-block:: python
+
+            # data() prepends the batch dimension, so this declares a 4-D input.
+            data = fluid.layers.data(name='data', shape=[13, 48, 48], dtype='float32')
+            x = fluid.layers.similarity_focus(input=data, axis=1, indexes=[9, 10])
+    """
+    helper = LayerHelper('similarity_focus', **locals())
+    if not isinstance(axis, int):
+        raise TypeError("axis must be int type.")
+    if not isinstance(indexes, list):
+        raise TypeError("indexes must be list type.")
+    if axis not in (1, 2, 3):
+        raise ValueError("axis must be 1, 2 or 3.")
+    if len(indexes) == 0:
+        raise ValueError("indexes can not be empty.")
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='similarity_focus',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={"axis": axis, "indexes": indexes})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
new file mode 100755
index 0000000000..21308a7e0c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -0,0 +1,168 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestSimilarityFocusOp_axis1(OpTest):
+    def setUp(self):
+        self.op_type = "similarity_focus"
+        batch_size = 3
+        x_dim, y_dim, z_dim = 4, 5, 6
+        self.inputs = {
+            'X': np.random.random(
+                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
+        }
+        self.attrs = {
+            'axis': 1,
+            'indexes': [0, 3],
+        }
+
+        output = None
+        for batch in range(batch_size):
+            res = np.zeros((1, y_dim, z_dim)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                channel = self.inputs['X'][batch, index, :, :].reshape(-1).copy(
+                )
+                tag1 = [0 for i in range(y_dim)]
+                tag2 = [0 for i in range(z_dim)]
+                cnt = 0
+                for i in range(channel.size):
+                    index = channel.argmax()
+                    idx1 = index / z_dim
+                    idx2 = index % z_dim
+                    if tag1[idx1] + tag2[idx2] == 0:
+                        tag1[idx1] = 1
+                        tag2[idx2] = 1
+                        res[index] = 1
+                        cnt += 1
+                        if cnt == min(y_dim, z_dim):
+                            break
+                    channel[index] = -1
+            res = res.reshape(1, y_dim, z_dim)
+            res = res.repeat([x_dim], axis=0)
+            res = res.reshape(1, x_dim, y_dim, z_dim)
+            if output is not None:
+                output = np.concatenate((output, res), axis=0)
+            else:
+                output = res
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSimilarityFocusOp_axis2(OpTest):
+    def setUp(self):
+        self.op_type = "similarity_focus"
+        batch_size = 6
+        x_dim, y_dim, z_dim = 7, 8, 9
+        self.inputs = {
+            'X': np.random.random(
+                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
+        }
+        self.attrs = {
+            'axis': 2,
+            'indexes': [0, 3, 5],
+        }
+
+        output = None
+        for batch in range(batch_size):
+            res = np.zeros((x_dim, 1, z_dim)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                channel = self.inputs['X'][batch, :, index, :].reshape(-1).copy(
+                )
+                tag1 = [0 for i in range(x_dim)]
+                tag2 = [0 for i in range(z_dim)]
+                cnt = 0
+                for i in range(channel.size):
+                    index = channel.argmax()
+                    idx1 = index / z_dim
+                    idx2 = index % z_dim
+                    if tag1[idx1] + tag2[idx2] == 0:
+                        tag1[idx1] = 1
+                        tag2[idx2] = 1
+                        res[index] = 1
+                        cnt += 1
+                        if cnt == min(x_dim, z_dim):
+                            break
+                    channel[index] = -1
+            res = res.reshape(x_dim, 1, z_dim)
+            res = res.repeat([y_dim], axis=1)
+            res = res.reshape(1, x_dim, y_dim, z_dim)
+            if output is not None:
+                output = np.concatenate((output, res), axis=0)
+            else:
+                output = res
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSimilarityFocusOp_axis3(OpTest):
+    def setUp(self):
+        self.op_type = "similarity_focus"
+        batch_size = 64
+        x_dim, y_dim, z_dim = 48, 48, 13
+        self.inputs = {
+            'X': np.random.random(
+                (batch_size, x_dim, y_dim, z_dim)).astype("float32"),
+        }
+        self.attrs = {
+            'axis': 3,
+            'indexes': [0, 2, 7, 9],
+        }
+
+        output = None
+        for batch in range(batch_size):
+            res = np.zeros((x_dim, y_dim, 1)).astype("float32").reshape(-1)
+            for index in self.attrs['indexes']:
+                channel = self.inputs['X'][batch, :, :, index].reshape(-1).copy(
+                )
+                tag1 = [0 for i in range(x_dim)]
+                tag2 = [0 for i in range(y_dim)]
+                cnt = 0
+                for i in range(channel.size):
+                    index = channel.argmax()
+                    idx1 = index / y_dim
+                    idx2 = index % y_dim
+                    if tag1[idx1] + tag2[idx2] == 0:
+                        tag1[idx1] = 1
+                        tag2[idx2] = 1
+                        res[index] = 1
+                        cnt += 1
+                        if cnt == min(x_dim, y_dim):
+                            break
+                    channel[index] = -1
+            res = res.reshape(x_dim, y_dim, 1)
+            res = res.repeat([z_dim], axis=2)
+            res = res.reshape(1, x_dim, y_dim, z_dim)
+            if output is not None:
+                output = np.concatenate((output, res), axis=0)
+            else:
+                output = res
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()

From 99707b281dbc709f39a51aba7c0e22a143ba8a08 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Mon, 29 Oct 2018 15:37:56 +0800
Subject: [PATCH 02/25] change / to // to fit py3

---
 .../fluid/tests/unittests/test_similarity_focus_op.py     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
index 7df9fe3a48..bd3b2782ae 100755
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -47,7 +47,7 @@ class TestSimilarityFocusOp(OpTest):
                 cnt = 0
                 for i in range(channel.size):
                     index = channel.argmax()
-                    idx1 = index / z_dim
+                    idx1 = index // z_dim
                     idx2 = index % z_dim
                     if tag1[idx1] + tag2[idx2] == 0:
                         tag1[idx1] = 1
@@ -95,7 +95,7 @@ class TestSimilarityFocusOp_axis1(OpTest):
                 cnt = 0
                 for i in range(channel.size):
                     index = channel.argmax()
-                    idx1 = index / z_dim
+                    idx1 = index // z_dim
                     idx2 = index % z_dim
                     if tag1[idx1] + tag2[idx2] == 0:
                         tag1[idx1] = 1
@@ -143,7 +143,7 @@ class TestSimilarityFocusOp_axis2(OpTest):
                 cnt = 0
                 for i in range(channel.size):
                     index = channel.argmax()
-                    idx1 = index / z_dim
+                    idx1 = index // z_dim
                     idx2 = index % z_dim
                     if tag1[idx1] + tag2[idx2] == 0:
                         tag1[idx1] = 1
@@ -191,7 +191,7 @@ class TestSimilarityFocusOp_axis3(OpTest):
                 cnt = 0
                 for i in range(channel.size):
                     index = channel.argmax()
-                    idx1 = index / y_dim
+                    idx1 = index // y_dim
                     idx2 = index % y_dim
                     if tag1[idx1] + tag2[idx2] == 0:
                         tag1[idx1] = 1

From 73671379cd2b046ec32c70b7f76d23247f7893bd Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Mon, 29 Oct 2018 17:07:08 +0800
Subject: [PATCH 03/25] update paddle/fluid/API.spec test=develop

---
 paddle/fluid/API.spec | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 0d90bf3cc1..4d66dcb7ca 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -61,12 +61,12 @@ paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
+paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
+paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn'], varargs=None, keywords=None, defaults=(None, None, False))
+paddle.fluid.layers.softmax ArgSpec(args=['input', 'param_attr', 'bias_attr', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
@@ -75,8 +75,7 @@ paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'outp
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.sequence_expand ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.sequence_expand_as ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.layers.sequence_unpad ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_pad ArgSpec(args=['x', 'pad_value', 'maxlen'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm_unit ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None))
 paddle.fluid.layers.reduce_sum ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.reduce_mean ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
@@ -85,8 +84,7 @@ paddle.fluid.layers.reduce_min ArgSpec(args=['input', 'dim', 'keep_dim', 'name']
 paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None))
 paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
 paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@@ -97,8 +95,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None))
-paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
@@ -107,7 +105,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label',
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
 paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@@ -116,7 +114,6 @@ paddle.fluid.layers.pad ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], var
 paddle.fluid.layers.pad_constant_like ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None))
 paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None))
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
-paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
 paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
@@ -130,7 +127,6 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None))
 paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
 paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None))
 paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None))
@@ -174,9 +170,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
-paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -205,9 +199,6 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
 paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -308,11 +299,6 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000))
-paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

From 0e60bb3c4ffdca714b73196c1a4eb5385b9b87a7 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Wed, 31 Oct 2018 10:54:27 +0800
Subject: [PATCH 04/25] Submit PR again test=develop

---
 .../paddle/fluid/tests/unittests/test_similarity_focus_op.py   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
index b3833f05f1..bd3b2782ae 100755
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -57,7 +57,8 @@ class TestSimilarityFocusOp(OpTest):
                         if cnt == min(y_dim, z_dim):
                             break
                     channel[index] = -1
-            res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
+            res = res.reshape(1, y_dim, z_dim)
+            res = res.repeat([x_dim], axis=0)
             res = res.reshape(1, x_dim, y_dim, z_dim)
             if output is not None:
                 output = np.concatenate((output, res), axis=0)

From 162cf75c88111b44e1126a84b8b6438f5dac752c Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Mon, 5 Nov 2018 10:48:39 +0800
Subject: [PATCH 05/25] Submit PR again test=develop

---
 .../paddle/fluid/tests/unittests/test_similarity_focus_op.py   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
index bd3b2782ae..b3833f05f1 100755
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -57,8 +57,7 @@ class TestSimilarityFocusOp(OpTest):
                         if cnt == min(y_dim, z_dim):
                             break
                     channel[index] = -1
-            res = res.reshape(1, y_dim, z_dim)
-            res = res.repeat([x_dim], axis=0)
+            res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
             res = res.reshape(1, x_dim, y_dim, z_dim)
             if output is not None:
                 output = np.concatenate((output, res), axis=0)

From 316e020a11448a3ae0230d7fd85bf3d0b6d2b99e Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Mon, 5 Nov 2018 11:53:42 +0800
Subject: [PATCH 06/25] Submit PR again test=develop

---
 .../paddle/fluid/tests/unittests/test_similarity_focus_op.py   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
index b3833f05f1..bd3b2782ae 100755
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -57,7 +57,8 @@ class TestSimilarityFocusOp(OpTest):
                         if cnt == min(y_dim, z_dim):
                             break
                     channel[index] = -1
-            res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
+            res = res.reshape(1, y_dim, z_dim)
+            res = res.repeat([x_dim], axis=0)
             res = res.reshape(1, x_dim, y_dim, z_dim)
             if output is not None:
                 output = np.concatenate((output, res), axis=0)

From 5e7bb6a9bddfd41335021464dd0335f6cc576e81 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Mon, 5 Nov 2018 15:02:30 +0800
Subject: [PATCH 07/25] update docs test=develop

---
 paddle/fluid/operators/similarity_focus_op.cc | 19 ++++++++------
 python/paddle/fluid/layers/nn.py              | 25 ++++++++++++-------
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
index 25a023aed2..768b6903b7 100644
--- a/paddle/fluid/operators/similarity_focus_op.cc
+++ b/paddle/fluid/operators/similarity_focus_op.cc
@@ -35,14 +35,17 @@ class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker {
 SimilarityFocus Operator.
 
 Generate a similarity focus mask with the same shape of input using the following method:
-1. Extract the 4-D matrix(here the first dimension is BatchSize) corresponding
-   to the axis according to the indexes. For example, if axis=1 and indexes=[a],
-   it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
-   is (BatchSize, A, B, C), the shape of matrix T is (BatchSize, B, C).
-2. For each index, find the largest numbers in the matrix T, so that the same
-   row and same column has at most one number(obviously there will be min(B, C)
-   numbers), and mark the corresponding position of the 3-D similarity focus mask
-   as 1, otherwise as 0. Do elementwise-or for each index.
+1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 
+   to the axis according to the indexes. For example, if axis=1 and indexes=[a], 
+   it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X 
+   is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
+2. For each index, find the largest numbers in the tensor T, so that the same 
+   row and same column has at most one number(what it means is that if the 
+   largest number has been found in the i-th row and the j-th column, then 
+   the numbers in the i-th or j-th column will be skipped. Obviously there 
+   will be min(B, C) numbers), and mark the corresponding position of the 
+   3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
+   each index.
 3. Broadcast the 3-D similarity focus mask to the same shape of input X.
 
 Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a1ef1ca009..be0e75161b 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -7560,14 +7560,17 @@ def similarity_focus(input, axis, indexes, name=None):
     SimilarityFocus Operator
 
     Generate a similarity focus mask with the same shape of input using the following method:
-    1. Extract the 4-D matrix(here the first dimension is BatchSize) corresponding 
+    1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 
        to the axis according to the indexes. For example, if axis=1 and indexes=[a], 
        it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X 
-       is (BatchSize, A, B, C), the shape of matrix T is (BatchSize, B, C).
-    2. For each index, find the largest numbers in the matrix T, so that the same 
-       row and same column has at most one number(obviously there will be min(B, C) 
-       numbers), and mark the corresponding position of the 3-D similarity focus mask 
-       as 1, otherwise as 0. Do elementwise-or for each index.
+       is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
+    2. For each index, find the largest numbers in the tensor T, so that the same 
+       row and same column has at most one number(what it means is that if the 
+       largest number has been found in the i-th row and the j-th column, then 
+       the numbers in the i-th or j-th column will be skipped. Obviously there 
+       will be min(B, C) numbers), and mark the corresponding position of the 
+       3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
+       each index.
     3. Broadcast the 3-D similarity focus mask to the same shape of input X.
 
     Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
@@ -7624,9 +7627,9 @@ def similarity_focus(input, axis, indexes, name=None):
     Args:
         input(Variable): The input tensor variable(default float). It should 
             be a 4-D tensor with shape [BatchSize, A, B, C].
-        axis(int): Indicating the dimension to be select. It can only be
+        axis(int): Indicating the dimension to be selected. It can only be
             1, 2 or 3.
-        indexes(list): indicating the indexes of the selected dimension.
+        indexes(list): Indicating the indexes of the selected dimension.
 
     Returns:
         Variable: A tensor variable with the same shape and same type 
@@ -7649,7 +7652,11 @@ def similarity_focus(input, axis, indexes, name=None):
     if len(indexes) == 0:
         raise ValueError("indexes can not be empty.")
 
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=input.dtype, persistable=False)
     helper.append_op(
         type='similarity_focus',
         inputs={'X': input},

From 19b68de79fec0d55faf516b61e80e05940758917 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Mon, 5 Nov 2018 19:00:13 +0800
Subject: [PATCH 08/25] submit again test=develop

---
 .../paddle/fluid/tests/unittests/test_similarity_focus_op.py   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
index bd3b2782ae..b3833f05f1 100755
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
@@ -57,8 +57,7 @@ class TestSimilarityFocusOp(OpTest):
                         if cnt == min(y_dim, z_dim):
                             break
                     channel[index] = -1
-            res = res.reshape(1, y_dim, z_dim)
-            res = res.repeat([x_dim], axis=0)
+            res = res.reshape(1, y_dim, z_dim).repeat([x_dim], axis=0)
             res = res.reshape(1, x_dim, y_dim, z_dim)
             if output is not None:
                 output = np.concatenate((output, res), axis=0)

From ef8218be222c9576bd0435f7e842ce5650317371 Mon Sep 17 00:00:00 2001
From: barrierye <barriery@qq.com>
Date: Wed, 7 Nov 2018 14:11:10 +0800
Subject: [PATCH 09/25] update docs test=develop

---
 paddle/fluid/operators/similarity_focus_op.cc | 5 +++--
 python/paddle/fluid/layers/nn.py              | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc
index 768b6903b7..9612f82b6d 100644
--- a/paddle/fluid/operators/similarity_focus_op.cc
+++ b/paddle/fluid/operators/similarity_focus_op.cc
@@ -42,8 +42,9 @@ Generate a similarity focus mask with the same shape of input using the followin
 2. For each index, find the largest numbers in the tensor T, so that the same 
    row and same column has at most one number(what it means is that if the 
    largest number has been found in the i-th row and the j-th column, then 
-   the numbers in the i-th or j-th column will be skipped. Obviously there 
-   will be min(B, C) numbers), and mark the corresponding position of the 
+   the numbers in the i-th row or j-th column will be skipped. And then the 
+   next largest number will be selected from the remaining numbers. Obviously 
+   there will be min(B, C) numbers), and mark the corresponding position of the 
    3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
    each index.
 3. Broadcast the 3-D similarity focus mask to the same shape of input X.
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index be0e75161b..e3737bf6fe 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -7567,8 +7567,9 @@ def similarity_focus(input, axis, indexes, name=None):
     2. For each index, find the largest numbers in the tensor T, so that the same 
        row and same column has at most one number(what it means is that if the 
        largest number has been found in the i-th row and the j-th column, then 
-       the numbers in the i-th or j-th column will be skipped. Obviously there 
-       will be min(B, C) numbers), and mark the corresponding position of the 
+       the numbers in the i-th row or j-th column will be skipped. And then the 
+       next largest number will be selected from the remaining numbers. Obviously 
+       there will be min(B, C) numbers), and mark the corresponding position of the 
        3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
        each index.
     3. Broadcast the 3-D similarity focus mask to the same shape of input X.

From 866d6bfe593bf98cd3082f7ba1178897fc9ab673 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 7 Nov 2018 16:06:14 +0800
Subject: [PATCH 10/25] dist table: support other optimizers and regularization config

---
 python/paddle/fluid/optimizer.py              | 19 ++++--
 .../details/distribute_lookuptable_utils.py   | 66 +++++++++++++++++++
 .../fluid/transpiler/distribute_transpiler.py | 36 ++--------
 3 files changed, 85 insertions(+), 36 deletions(-)
 create mode 100644 python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 7e2364a5a8..ec8bed45dc 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,21 +13,23 @@
 # limitations under the License.
 
 from __future__ import print_function
-import re
-import sys
+
 from collections import defaultdict
+from contextlib import contextmanager
+
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
+import paddle.fluid.transpiler.details.distribute_lookuptable_utils as distribute_lookuptable_utils
+
 from . import framework
 from . import layers
+from . import unique_name
 from .backward import append_backward
+from .clip import append_gradient_clip_ops, error_clip_callback
 from .framework import program_guard
-from . import unique_name
 from .initializer import Constant
 from .layer_helper import LayerHelper
-from .regularizer import append_regularization_ops
-from .clip import append_gradient_clip_ops, error_clip_callback
-from contextlib import contextmanager
 from .layers import ops
+from .regularizer import append_regularization_ops
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
@@ -260,6 +262,9 @@ class Optimizer(object):
 
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
+        params_grads, table_param_and_grad, table_optimize_op = \
+            distribute_lookuptable_utils.process_distribute_lookuptable(loss.block.program, params_grads, self._learning_rate)
+
         params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
@@ -268,6 +273,8 @@ class Optimizer(object):
 
         optimize_ops = self._create_optimization_pass(params_grads, loss,
                                                       startup_program)
+        optimize_ops.append(table_optimize_op)
+        params_grads.append(table_param_and_grad)
         return optimize_ops, params_grads
 
 
diff --git a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
new file mode 100644
index 0000000000..ab1b551a2e
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
@@ -0,0 +1,66 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid.optimizer as optimizer
+import paddle.fluid.framework as framework
+
+LOOKUP_TABLE_TYPE = "lookup_table"
+
+
+def find_distributed_lookup_table(program):
+    # process lookup_table_op
+    # 1. check all lookup_table_op is distributed
+    # 2. check all lookup_table_op share the same table.
+    distributed_lookup_table_ops = []
+    # support only one distributed_lookup_table now
+    table_name = None
+
+    for op in program.global_block().ops:
+        if op.type == LOOKUP_TABLE_TYPE:
+            if op.attr('is_distributed') is True:
+                if table_name is None:
+                    table_name = op.input("W")[0]
+                if table_name != op.input("W")[0]:
+                    raise RuntimeError("all distributed lookup_table_ops"
+                                       " should have only one table")
+                distributed_lookup_table_ops.append(op)
+            else:
+                if table_name is not None:
+                    assert op.input("W")[0] != table_name
+
+    return table_name
+
+
+def process_distribute_lookuptable(program, param_grads, learning_rate):
+    table_name = find_distributed_lookup_table(program)
+    table_param = None
+    table_grad = None
+    new_param_grads = []
+    for p, g in param_grads:
+        if p.name == table_name:
+            if table_param is not None:
+                raise RuntimeError(
+                    "multi dist table var found, only support one now!")
+            table_param = p
+            table_grad = g
+        else:
+            new_param_grads.append((p, g))
+    sgd_op = None
+    if table_param is not None:
+        with table_param.block.program._optimized_guard(
+            [table_param, table_grad]), framework.name_scope("optimizer"):
+            sgd_optimizer = optimizer.SGD(learning_rate)
+            sgd_op = sgd_optimizer._append_optimize_op(table_param.block, (
+                table_param, table_grad))
+    return new_param_grads, (table_param, table_grad), sgd_op
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 7c7fba7671..575f74dfe0 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -31,18 +31,17 @@ Steps to transpile pserver:
 """
 
 import math
-import sys
 import numpy as np
 import collections
-import six
 import logging
 
-from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
+from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
     Parameter, grad_var_name
 from .details import *
+from .details.distribute_lookuptable_utils import find_distributed_lookup_table
 from functools import reduce
 
 LOOKUP_TABLE_TYPE = "lookup_table"
@@ -292,7 +291,8 @@ class DistributeTranspiler(object):
         self.optimize_ops, self.params_grads = self._get_optimize_pass()
 
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
-        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+        self.table_name = find_distributed_lookup_table(self.origin_program)
+        self.has_distributed_lookup_table = self.table_name != None
         self.param_name_to_grad_name = dict()
         self.grad_name_to_param_name = dict()
         for param_var, grad_var in self.params_grads:
@@ -966,28 +966,6 @@ to transpile() call.")
 
     # ====================== private transpiler functions =====================
 
-    def _has_distributed_lookup_table(self):
-        # process lookup_table_op
-        # 1. check all lookup_table_op is distributed
-        # 2. check all lookup_table_op share the same table.
-        distributed_lookup_table_ops = []
-        # support only one distributed_lookup_table now
-        self.table_name = None
-        for op in self.origin_program.global_block().ops:
-            if op.type == LOOKUP_TABLE_TYPE:
-                if op.attr('is_distributed') is True:
-                    if self.table_name is None:
-                        self.table_name = op.input("W")[0]
-                    if self.table_name != op.input("W")[0]:
-                        raise RuntimeError("all distributed lookup_table_ops"
-                                           " should have only one table")
-                    distributed_lookup_table_ops.append(op)
-                else:
-                    if self.table_name is not None:
-                        assert op.input("W")[0] != self.table_name
-
-        return len(distributed_lookup_table_ops) > 0
-
     def _update_dist_lookup_table_vars(self, param_list, grad_list,
                                        params_grads):
         # TODO(wuyi): put find a way to put dist lookup table stuff all together.
@@ -1259,9 +1237,8 @@ to transpile() call.")
         # create table param and grad var in pserver program
         # create table optimize block in pserver program
         table_opt_op = [
-            op for op in self.optimize_ops
-            if 'Param' in op.input_names and op.input("Param")[0] ==
-            self.table_name
+            op for op in self.optimize_ops if 'Param' in op.input_names and
+            op.input("Param")[0] == self.table_name
         ][0]
 
         origin_param_var = self.origin_program.global_block().vars[
@@ -1341,7 +1318,6 @@ to transpile() call.")
         """
         create a new block to handle save checkpoint.
         """
-        import os
 
         pserver_program.global_block().create_var(
             name="kLookupTablePath",

From fbcdb29d8c352d7d0ca4eb147e45764e33166047 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 7 Nov 2018 16:49:36 +0800
Subject: [PATCH 11/25] fix import issue

---
 python/paddle/fluid/optimizer.py              | 33 ++++++++++++++++---
 .../details/distribute_lookuptable_utils.py   | 24 --------------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index ec8bed45dc..e0ee9955b8 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -18,7 +18,7 @@ from collections import defaultdict
 from contextlib import contextmanager
 
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
-import paddle.fluid.transpiler.details.distribute_lookuptable_utils as distribute_lookuptable_utils
+from paddle.fluid.transpiler.details.distribute_lookuptable_utils import find_distributed_lookup_table
 
 from . import framework
 from . import layers
@@ -40,6 +40,30 @@ __all__ = [
 ]
 
 
+def _process_distribute_lookuptable(program, param_grads, learning_rate):
+    table_name = find_distributed_lookup_table(program)
+    table_param = None
+    table_grad = None
+    new_param_grads = []
+    for p, g in param_grads:
+        if p.name == table_name:
+            if table_param is not None:
+                raise RuntimeError(
+                    "multi dist table var found, only support one now!")
+            table_param = p
+            table_grad = g
+        else:
+            new_param_grads.append((p, g))
+    sgd_op = None
+    if table_param is not None:
+        with table_param.block.program._optimized_guard(
+            [table_param, table_grad]), framework.name_scope("optimizer"):
+            sgd_optimizer = SGD(learning_rate)
+            sgd_op = sgd_optimizer._append_optimize_op(table_param.block, (
+                table_param, table_grad))
+    return new_param_grads, (table_param, table_grad), sgd_op
+
+
 class Optimizer(object):
     """Optimizer Base class.
 
@@ -263,7 +287,7 @@ class Optimizer(object):
         params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
         params_grads, table_param_and_grad, table_optimize_op = \
-            distribute_lookuptable_utils.process_distribute_lookuptable(loss.block.program, params_grads, self._learning_rate)
+            _process_distribute_lookuptable(loss.block.program, params_grads, self._learning_rate)
 
         params_grads = append_gradient_clip_ops(params_grads)
 
@@ -273,8 +297,9 @@ class Optimizer(object):
 
         optimize_ops = self._create_optimization_pass(params_grads, loss,
                                                       startup_program)
-        optimize_ops.append(table_optimize_op)
-        params_grads.append(table_param_and_grad)
+        if table_optimize_op is not None:
+            optimize_ops.append(table_optimize_op)
+            params_grads.append(table_param_and_grad)
         return optimize_ops, params_grads
 
 
diff --git a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
index ab1b551a2e..bc4a9e7a4e 100644
--- a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
+++ b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
@@ -40,27 +40,3 @@ def find_distributed_lookup_table(program):
                     assert op.input("W")[0] != table_name
 
     return table_name
-
-
-def process_distribute_lookuptable(program, param_grads, learning_rate):
-    table_name = find_distributed_lookup_table(program)
-    table_param = None
-    table_grad = None
-    new_param_grads = []
-    for p, g in param_grads:
-        if p.name == table_name:
-            if table_param is not None:
-                raise RuntimeError(
-                    "multi dist table var found, only support one now!")
-            table_param = p
-            table_grad = g
-        else:
-            new_param_grads.append((p, g))
-    sgd_op = None
-    if table_param is not None:
-        with table_param.block.program._optimized_guard(
-            [table_param, table_grad]), framework.name_scope("optimizer"):
-            sgd_optimizer = optimizer.SGD(learning_rate)
-            sgd_op = sgd_optimizer._append_optimize_op(table_param.block, (
-                table_param, table_grad))
-    return new_param_grads, (table_param, table_grad), sgd_op

From 3d8077e9fb92a5f2a21c214162f04ba200bcc92d Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 7 Nov 2018 17:30:48 +0800
Subject: [PATCH 12/25] update optimizer

---
 python/paddle/fluid/optimizer.py              | 94 +++++++++++--------
 .../fluid/transpiler/details/__init__.py      |  1 +
 .../details/distribute_lookuptable_utils.py   |  3 -
 3 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index e0ee9955b8..f48d7e189e 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -40,30 +40,6 @@ __all__ = [
 ]
 
 
-def _process_distribute_lookuptable(program, param_grads, learning_rate):
-    table_name = find_distributed_lookup_table(program)
-    table_param = None
-    table_grad = None
-    new_param_grads = []
-    for p, g in param_grads:
-        if p.name == table_name:
-            if table_param is not None:
-                raise RuntimeError(
-                    "multi dist table var found, only support one now!")
-            table_param = p
-            table_grad = g
-        else:
-            new_param_grads.append((p, g))
-    sgd_op = None
-    if table_param is not None:
-        with table_param.block.program._optimized_guard(
-            [table_param, table_grad]), framework.name_scope("optimizer"):
-            sgd_optimizer = SGD(learning_rate)
-            sgd_op = sgd_optimizer._append_optimize_op(table_param.block, (
-                table_param, table_grad))
-    return new_param_grads, (table_param, table_grad), sgd_op
-
-
 class Optimizer(object):
     """Optimizer Base class.
 
@@ -111,7 +87,7 @@ class Optimizer(object):
             name=unique_name.generate("learning_rate"),
             shape=[1],
             value=float(self._learning_rate),
-            dtype='float32' if self._dtype == None else self._dtype,
+            dtype='float32' if self._dtype is None else self._dtype,
             persistable=True)
 
     def _global_learning_rate(self, program=None):
@@ -251,7 +227,6 @@ class Optimizer(object):
             self.helper = LayerHelper(self.__class__.__name__)
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
-            self._create_global_learning_rate()
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
@@ -271,6 +246,40 @@ class Optimizer(object):
             end = len(global_block.ops)
             return global_block._slice_ops(start, end)
 
+    def _process_distribute_lookuptable(self, param_grads, loss,
+                                        startup_program):
+        program = loss.block.program
+        table_name = find_distributed_lookup_table(program)
+        table_param = None
+        table_grad = None
+        new_param_grads = []
+        for p, g in param_grads:
+            if p.name == table_name:
+                if table_param is not None:
+                    raise RuntimeError(
+                        "multi dist table var found, only support one now!")
+                table_param = p
+                table_grad = g
+            else:
+                new_param_grads.append((p, g))
+        sgd_op = None
+        if table_param is not None:
+            with program_guard(program, startup_program):
+                param_and_grad = [table_param, table_grad]
+                with table_param.block.program._optimized_guard(param_and_grad), \
+                     framework.name_scope("optimizer"):
+                    # create the optimize op
+                    sgd_op = loss.block.append_op(
+                        type='sgd',
+                        inputs={
+                            "Param": table_param,
+                            "Grad": table_grad,
+                            "LearningRate":
+                            self._create_param_lr(param_and_grad)
+                        },
+                        outputs={"ParamOut": param_and_grad[0]})
+        return new_param_grads, (table_param, table_grad), sgd_op
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -281,26 +290,29 @@ class Optimizer(object):
         This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward(loss, parameter_list, no_grad_set,
-                                       [error_clip_callback])
+        with program_guard(loss.block.program, startup_program):
+            self._create_global_learning_rate()
+
+            params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                           [error_clip_callback])
 
-        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+            params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
-        params_grads, table_param_and_grad, table_optimize_op = \
-            _process_distribute_lookuptable(loss.block.program, params_grads, self._learning_rate)
+            params_grads, table_param_and_grad, table_optimize_op = \
+                self._process_distribute_lookuptable(params_grads, loss, startup_program)
 
-        params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = append_gradient_clip_ops(params_grads)
 
-        # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
+            # Add regularization if any
+            params_grads = append_regularization_ops(params_grads,
+                                                     self.regularization)
 
-        optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                      startup_program)
-        if table_optimize_op is not None:
-            optimize_ops.append(table_optimize_op)
-            params_grads.append(table_param_and_grad)
-        return optimize_ops, params_grads
+            optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                          startup_program)
+            if table_optimize_op is not None:
+                optimize_ops.append(table_optimize_op)
+                params_grads.append(table_param_and_grad)
+            return optimize_ops, params_grads
 
 
 class SGDOptimizer(Optimizer):
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index f33c05ed2f..9671b60007 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -17,3 +17,4 @@ from __future__ import print_function
 from .program_utils import *
 from .ufind import *
 from .checkport import *
+from .distribute_lookuptable_utils import *
diff --git a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
index bc4a9e7a4e..ce1e993402 100644
--- a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
+++ b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.fluid.optimizer as optimizer
-import paddle.fluid.framework as framework
-
 LOOKUP_TABLE_TYPE = "lookup_table"
 
 

From d9dc81a6c69a27e854e66a04e35744100261abc5 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 7 Nov 2018 23:08:16 +0800
Subject: [PATCH 13/25] fix dist transpiler test test=develop

---
 python/paddle/fluid/tests/unittests/test_dist_transpiler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 986fdd9ff2..0957b97980 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -570,7 +570,6 @@ class TestDistLookupTable(TestDistLookupTableBase):
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
-            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
             'fill_constant', 'fill_constant', 'uniform_random',
             'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
             'fake_init'

From fec0b192a24b6760bfbcbe2a40913269fb168353 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 8 Nov 2018 11:07:33 +0800
Subject: [PATCH 14/25] fix unit test test=develop

---
 python/paddle/fluid/optimizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index f48d7e189e..6d88d76e72 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -227,6 +227,7 @@ class Optimizer(object):
             self.helper = LayerHelper(self.__class__.__name__)
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
+            self._create_global_learning_rate()
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
@@ -268,6 +269,7 @@ class Optimizer(object):
                 param_and_grad = [table_param, table_grad]
                 with table_param.block.program._optimized_guard(param_and_grad), \
                      framework.name_scope("optimizer"):
+                    self._create_global_learning_rate()
                     # create the optimize op
                     sgd_op = loss.block.append_op(
                         type='sgd',
@@ -291,7 +293,6 @@ class Optimizer(object):
         `create_optimization_pass()` into one.
         """
         with program_guard(loss.block.program, startup_program):
-            self._create_global_learning_rate()
 
             params_grads = append_backward(loss, parameter_list, no_grad_set,
                                            [error_clip_callback])

From 55edfca2b8a70d555f5adf0d8f737977ae1f17c4 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 8 Nov 2018 11:22:20 +0800
Subject: [PATCH 15/25] revert unused change

---
 python/paddle/fluid/optimizer.py | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 6d88d76e72..9f089ef1e8 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -292,28 +292,26 @@ class Optimizer(object):
         This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        with program_guard(loss.block.program, startup_program):
+        params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                       [error_clip_callback])
 
-            params_grads = append_backward(loss, parameter_list, no_grad_set,
-                                           [error_clip_callback])
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
 
-            params_grads = sorted(params_grads, key=lambda x: x[0].name)
+        params_grads, table_param_and_grad, table_optimize_op = \
+            self._process_distribute_lookuptable(params_grads, loss, startup_program)
 
-            params_grads, table_param_and_grad, table_optimize_op = \
-                self._process_distribute_lookuptable(params_grads, loss, startup_program)
+        params_grads = append_gradient_clip_ops(params_grads)
 
-            params_grads = append_gradient_clip_ops(params_grads)
+        # Add regularization if any
+        params_grads = append_regularization_ops(params_grads,
+                                                 self.regularization)
 
-            # Add regularization if any
-            params_grads = append_regularization_ops(params_grads,
-                                                     self.regularization)
-
-            optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                          startup_program)
-            if table_optimize_op is not None:
-                optimize_ops.append(table_optimize_op)
-                params_grads.append(table_param_and_grad)
-            return optimize_ops, params_grads
+        optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                      startup_program)
+        if table_optimize_op is not None:
+            optimize_ops.append(table_optimize_op)
+            params_grads.append(table_param_and_grad)
+        return optimize_ops, params_grads
 
 
 class SGDOptimizer(Optimizer):

From ffd5a832d8f40ec703c3c7736b9e5be845224529 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 8 Nov 2018 13:14:13 +0800
Subject: [PATCH 16/25] fix code style

---
 python/paddle/fluid/transpiler/distribute_transpiler.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 575f74dfe0..b6179864a2 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1237,8 +1237,9 @@ to transpile() call.")
         # create table param and grad var in pserver program
         # create table optimize block in pserver program
         table_opt_op = [
-            op for op in self.optimize_ops if 'Param' in op.input_names and
-            op.input("Param")[0] == self.table_name
+            op for op in self.optimize_ops
+            if 'Param' in op.input_names and op.input("Param")[0] ==
+            self.table_name
         ][0]
 
         origin_param_var = self.origin_program.global_block().vars[

From 67050468e1c2801e0aa0c7896cd8e5ffb5046f8f Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 8 Nov 2018 13:24:54 +0800
Subject: [PATCH 17/25] optimize code test=develop

---
 .../details/distribute_lookuptable_utils.py          | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
index ce1e993402..52d9ce75f8 100644
--- a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
+++ b/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
@@ -16,11 +16,12 @@ LOOKUP_TABLE_TYPE = "lookup_table"
 
 
 def find_distributed_lookup_table(program):
-    # process lookup_table_op
-    # 1. check all lookup_table_op is distributed
-    # 2. check all lookup_table_op share the same table.
-    distributed_lookup_table_ops = []
-    # support only one distributed_lookup_table now
+    """
+    Find the distributed lookup table in the program.
+    Only one distributed lookup table is supported for now.
+    :param program: the program whose global block ops are scanned.
+    :return: table_name or None
+    """
     table_name = None
 
     for op in program.global_block().ops:
@@ -31,7 +32,6 @@ def find_distributed_lookup_table(program):
                 if table_name != op.input("W")[0]:
                     raise RuntimeError("all distributed lookup_table_ops"
                                        " should have only one table")
-                distributed_lookup_table_ops.append(op)
             else:
                 if table_name is not None:
                     assert op.input("W")[0] != table_name

From 373f64986dd41bfacda4d408d138f25f6fa95c2c Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Thu, 8 Nov 2018 13:55:02 +0800
Subject: [PATCH 18/25] add comment and unit test test=develop

---
 python/paddle/fluid/optimizer.py                     |  9 +++++++++
 .../fluid/tests/unittests/test_dist_transpiler.py    | 12 +++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 9f089ef1e8..94d171d83d 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -249,6 +249,15 @@ class Optimizer(object):
 
     def _process_distribute_lookuptable(self, param_grads, loss,
                                         startup_program):
+        """
+        The distributed lookup table only supports the SGD optimizer for now; other
+        optimizers and regularization are not supported. So we pick the table parameter
+        out of param_grads, skip regularization and other ops for it, and append an
+        SGD optimize op for it independently.
+        :param param_grads(list((Var, Var))): list of (param, grad) pair.
+        :param loss: the loss variable.
+        :param startup_program: the startup program
+        """
         program = loss.block.program
         table_name = find_distributed_lookup_table(program)
         table_param = None
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 0957b97980..f08b6ac035 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -641,7 +641,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer, _ = self.get_trainer(config)
+        trainer, trainer_startup = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
@@ -655,6 +655,16 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
             'recv', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+        startup_ops = [
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'uniform_random',
+            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
+            'fake_init'
+        ]
+        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
+                         startup_ops)
 
 
 class TestDistLookupTableSliceSize(TestDistLookupTableBase):

From 433fc7c1d44b4f7c9b2ac9cf856b12b06d756b25 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 9 Nov 2018 16:03:20 +0800
Subject: [PATCH 19/25] skip mkldnn related pass when use_mkldnn=false

test=develop
---
 paddle/fluid/inference/analysis/analyzer.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index a3440cfc78..d55303a51e 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -113,7 +113,9 @@ void Analyzer::Run(Argument* argument) {
   passes.push_back("infer_clean_graph_pass");
   passes.push_back("graph_viz_pass");  // add graphviz for debug.
   for (auto& pass : ir_passes_) {
-    if (!disabled_ir_passes_.count(pass)) {
+    // Skip passes whose name contains "mkldnn" when use_mkldnn_ is false.
+    bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos;
+    if (!disabled_ir_passes_.count(pass) && !skip_pass) {
       passes.push_back(pass);
       passes.push_back("graph_viz_pass");  // add graphviz for debug.
     }

From 6c6e6385507cfcf658e6a6f3ccc39e0ac353e06a Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Fri, 9 Nov 2018 17:37:15 +0800
Subject: [PATCH 20/25] Add InferVarType for some op (#14201)

* add_infer_var_type
test=develop

* InferVarTypeHelper-> VarTypeInferenceHelper
test=develop

* PassInputTypeAndDTypeOnOutput
 test=develop

* follow comment
test=develop
---
 paddle/fluid/framework/operator.cc          |  2 ++
 paddle/fluid/framework/var_type_inference.h | 25 +++++++++++++++++++++
 paddle/fluid/operators/activation_op.cc     | 16 +++++--------
 paddle/fluid/operators/batch_norm_op.cc     | 11 ++++++++-
 paddle/fluid/operators/conv_op.cc           | 12 ++++++++++
 paddle/fluid/operators/cross_entropy_op.cc  | 11 +++++++++
 paddle/fluid/operators/elementwise_op.h     | 16 +++++--------
 paddle/fluid/operators/mean_op.cc           | 21 +++++++++++++++--
 paddle/fluid/operators/mul_op.cc            | 11 ++++++++-
 paddle/fluid/operators/pool_op.cc           | 18 +++++++++++----
 paddle/fluid/operators/softmax_op.cc        | 10 ++++++++-
 11 files changed, 124 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 0506907ab5..5624878d43 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
           if (row_size >= 0) {
             ss << "[row_size=" << row_size << "]";
           }
+          std::string dtype = GetDtype(*scope, output.second[i]);
+          ss << ":" << dtype;
           ss << "[" << GetDims(*scope, var_name, true) << "]";
           ss << "(" << GetLoD(*scope, var_name) << ")";
         }
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index f3035cd712..64236b78d2 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
 
 namespace paddle {
@@ -24,5 +27,27 @@ class VarTypeInference {
   virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
 };
 
+class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const final {
+    auto in_out_var_names = this->GetInputOutputWithSameType();
+
+    for (auto& i_o_n : in_out_var_names) {
+      auto& x_name = op_desc.Input(i_o_n.first).at(0);
+      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+
+      auto& x = block->FindRecursiveOrCreateVar(x_name);
+      auto& out = block->FindRecursiveOrCreateVar(out_name);
+      out.SetType(x.GetType());
+      out.SetDataType(x.GetDataType());
+    }
+  }
+
+ protected:
+  virtual std::unordered_map<std::string, std::string>
+  GetInputOutputWithSameType() const = 0;
+};
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 9ddb3a5d29..ea260a3e92 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel {
   }
 };
 
-class ActivationOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto& x = block->FindRecursiveOrCreateVar(x_name);
-    auto& out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ActivationOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
   }
 };
 
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 3eb4738325..cf245f5038 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -170,6 +170,15 @@ The required data format for this layer is one of the following:
   }
 };
 
+class BatchNormOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  }
+};
+
 template <typename T>
 class BatchNormKernel<platform::CPUDeviceContext, T>
     : public framework::OpKernel<T> {
@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormGradMaker);
+                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
 REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 7401f100d7..4d37074638 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -224,6 +224,15 @@ $$
 )DOC");
 }
 
+class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{
+        {"Input", /*->*/ "Output"}};
+  }
+};
+
 void Conv3DOpMaker::Make() {
   AddInput(
       "Input",
@@ -365,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  ops::ConvOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 
@@ -372,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
+
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
+                  ops::ConvOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
 
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 66f19fe7ec..a904dd9130 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X.
 )DOC");
   }
 };
+
+class CrossEntropyOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Y"}};
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
@@ -186,6 +196,7 @@ namespace ops = paddle::operators;
 using CPUCtx = paddle::platform::CPUDeviceContext;
 
 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+                  ops::CrossEntropyOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 5eb4233344..f01f67692e 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -75,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   }
 };
 
-class ElementwiseOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto &x = block->FindRecursiveOrCreateVar(x_name);
-    auto &out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ElementwiseOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
   }
 };
 
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 19426b3c20..820636defa 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mean_op.h"
-
+#include <string>
 namespace paddle {
 namespace operators {
 
@@ -42,6 +42,14 @@ Mean Operator calculates the mean of all elements in X.
   }
 };
 
+class MeanOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 class MeanGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -50,6 +58,14 @@ class MeanGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
     ctx->ShareLoD("X", framework::GradVarName("X"));
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class MeanGradMaker : public framework::SingleGradOpDescMaker {
@@ -71,7 +87,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
+                  ops::MeanGradMaker);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
 REGISTER_OP_CPU_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index a2140ddc79..08f2949d4a 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -126,6 +126,14 @@ or not. But the output only shares the LoD information with input $X$.
   }
 };
 
+class MulOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 class MulGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -178,7 +186,8 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGradMaker);
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType,
+                  ops::MulOpGradMaker);
 REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 484cb65746..46a95350a7 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -40,7 +40,7 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
   return output_size;
 }
 
-void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
+void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Out"),
                  "Out(Output) of Pooling should not be null.");
@@ -81,7 +81,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 }
 
 framework::OpKernelType PoolOp::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
+    const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
@@ -104,7 +104,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
       layout_, library_);
 }
 
-void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
+void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
   PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                  "Input(X@GRAD) should not be null.");
@@ -112,7 +112,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 }
 
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
-    const framework::ExecutionContext &ctx) const {
+    const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
   std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
@@ -262,6 +262,14 @@ Example:
 )DOC");
 }
 
+class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 void Pool3dOpMaker::Make() {
   AddInput("X",
            "(Tensor) The input tensor of pooling operator. "
@@ -372,6 +380,7 @@ Example:
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
+                  ops::PoolOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
 
@@ -383,6 +392,7 @@ REGISTER_OP_CPU_KERNEL(
     ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
+                  ops::PoolOpInferVarType,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
 
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index a4bdbe6648..9e21b6c824 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -124,6 +124,14 @@ For each row $i$ and each column $j$ in the matrix, we have:
   }
 };
 
+class SoftmaxOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
 class SoftmaxOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -196,7 +204,7 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  ops::SoftmaxOpGradMaker);
+                  ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker);
 REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,

From 688ed6011651a3b4640853067a35bae8ae054cec Mon Sep 17 00:00:00 2001
From: li099 <lixuan666666@163.com>
Date: Fri, 9 Nov 2018 18:40:03 +0800
Subject: [PATCH 21/25] Add lod tensor array to tensor op (#13990)

* add lod tensor array concat

* add lod tensor array concat

* test=develop

* add lod tensor array concat
test=develop

* Fix API.spec
test=develop

* add lod tensor array concat
test=develop

* revise some bug of lod tensor array concat
test=develop

* add unittest for tensor array concat
test=develop

* change to tensor array to tensor
test=develop

* revise bug
test=develop

* revise a bug
test=develop

* revise a bug
test=develop

* revise a bug of python3
test=develop
---
 paddle/fluid/API.spec                         |   1 +
 paddle/fluid/operators/CMakeLists.txt         |   1 +
 .../operators/tensor_array_to_tensor_op.cc    | 246 ++++++++++++++++++
 python/paddle/fluid/layers/tensor.py          |  62 ++++-
 .../unittests/test_tensor_array_to_tensor.py  | 142 ++++++++++
 5 files changed, 448 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/operators/tensor_array_to_tensor_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index f58131e75b..250ea89b12 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -201,6 +201,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'],
 paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
 paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 2a7de024bf..7599313070 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -317,6 +317,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
+op_library(tensor_array_to_tensor_op DEPS concat_op)
 op_library(concat_op DEPS concat_and_split)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
new file mode 100644
index 0000000000..96dc123f6a
--- /dev/null
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -0,0 +1,246 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+void LodTensorArray2LodTensorVector(const framework::Scope &scope,
+                                    const std::string &base_name,
+                                    const std::string &lod_tensor_array_name,
+                                    std::vector<std::string> *res_names) {
+  auto &inx =
+      scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
+  for (size_t i = 0; i < inx.size(); i++) {
+    std::string var_name = base_name + std::to_string(i);
+    framework::Variable *g_feed_value =
+        const_cast<framework::Scope &>(scope).Var(var_name);
+    auto &feed_input =
+        *(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
+    feed_input.ShareDataWith(inx[i]);
+    res_names->push_back(var_name);
+  }
+}
+
+void LodTensorVectorResizeFromLodTensorArray(
+    const framework::Scope &scope, const std::string &base_name,
+    const std::string &lod_tensor_array_name,
+    std::vector<std::string> *res_names) {
+  auto &inx =
+      scope.FindVar(lod_tensor_array_name)->Get<framework::LoDTensorArray>();
+  for (size_t i = 0; i < inx.size(); i++) {
+    std::string var_name = base_name + std::to_string(i);
+    framework::Variable *g_feed_value =
+        const_cast<framework::Scope &>(scope).Var(var_name);
+    auto &feed_input =
+        *(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
+    auto dims = inx[i].dims();
+    feed_input.Resize(dims);
+    res_names->push_back(var_name);
+  }
+}
+
+void LodTensorArrayCreateFromLodTensorArray(
+    const framework::Scope &scope,
+    const std::string &input_lod_tensor_array_name,
+    const std::string &output_lod_tensor_array_name) {
+  auto &inx = scope.FindVar(input_lod_tensor_array_name)
+                  ->Get<framework::LoDTensorArray>();
+  auto &grad_inx = *scope.FindVar(output_lod_tensor_array_name)
+                        ->GetMutable<framework::LoDTensorArray>();
+
+  for (size_t i = 0; i < inx.size(); i++) {
+    std::string var_name = output_lod_tensor_array_name + std::to_string(i);
+    framework::Variable *g_feed_value =
+        const_cast<framework::Scope &>(scope).Var(var_name);
+    auto &feed_input =
+        *(g_feed_value->GetMutable<paddle::framework::LoDTensor>());
+    grad_inx.push_back(feed_input);
+  }
+}
+
+class LoDTensorArray2TensorOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto axis = Attr<int>("axis");
+
+    framework::AttributeMap attrs;
+    attrs["axis"] = axis;
+
+    auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto &out_inx =
+        *scope.FindVar(Output("OutIndex"))->GetMutable<framework::LoDTensor>();
+
+    const size_t n = inx.size();
+    PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
+
+    std::string base_name = Inputs("X")[0];
+    std::vector<std::string> names;
+
+    // OutIndex gets one entry per array item: that item's extent along axis
+    auto out_inx_dim = out_inx.dims();
+    out_inx_dim[0] = inx.size();
+    out_inx.Resize(out_inx_dim);
+
+    std::string var_name = "out_index";
+    framework::Variable *tmp_index_var =
+        const_cast<framework::Scope &>(scope).Var(var_name);
+    auto &tmp_index_tensor =
+        *(tmp_index_var->GetMutable<paddle::framework::LoDTensor>());
+    tmp_index_tensor.Resize(out_inx_dim);
+    int *tmp_index_data =
+        tmp_index_tensor.mutable_data<int>(platform::CPUPlace());
+
+    auto out_dims = inx[0].dims();
+    size_t out_dim_sum = 0;
+    for (size_t index = 0; index < inx.size(); index++) {
+      auto inx_dims = inx[index].dims();
+      out_dim_sum += inx_dims[axis];
+      tmp_index_data[index] = inx_dims[axis];
+    }
+    out_inx.ShareDataWith(tmp_index_tensor);
+
+    // Out takes the first item's shape, with axis set to the summed extent
+    out_dims[axis] = out_dim_sum;
+    out.Resize(out_dims);
+
+    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
+    // Delegate the concatenation itself to the registered concat op
+    auto concat_op = framework::OpRegistry::CreateOp(
+        "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs);
+
+    concat_op->Run(scope, place);
+  }
+};
+
+class LoDTensorArray2TensorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input LoDTensorArray of tensor_array_to_tensor operator.");
+    AddOutput("Out", "Output tensor of tensor_array_to_tensor operator.");
+    AddOutput("OutIndex",
+              "Output input LoDTensorArray items' dims of "
+              "tensor_array_to_tensor operator.");
+    AddAttr<int>("axis",
+                 "The axis along which the input tensors will be concatenated.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+tensor_array_to_tensor Operator.
+
+Concatenate the input LoDTensorArray along dimension axis to the output Tensor.
+Examples:
+  Input = {[1,2], [3,4], [5,6]}
+  axis = 0
+  Output = [[1,2],
+            [3,4],
+            [5,6]]
+  OutputIndex = [1,1,1]
+
+)DOC");
+  }
+};
+
+class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
+class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {}
+};
+
+class LoDTensorArray2TensorGradInferVarType
+    : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) {
+      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+    }
+  }
+};
+
+class LoDTensorArray2TensorGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto axis = Attr<int>("axis");
+    framework::AttributeMap attrs;
+    attrs["axis"] = axis;
+
+    auto &inx = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    const size_t n = inx.size();
+    PADDLE_ENFORCE_GT(n, 0, "Input tensorarray size should > 0.");
+
+    std::string base_name = Inputs("X")[0];
+    std::vector<std::string> names;
+
+    LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names);
+
+    // grad
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+
+    std::vector<std::string> grad_names;
+
+    LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"),
+                                            &grad_names);
+
+    auto concat_grad_op = framework::OpRegistry::CreateOp(
+        "concat_grad", {{"X", names}, {"Out@GRAD", {dout_name}}},
+        {{"X@GRAD", grad_names}}, attrs);
+
+    concat_grad_op->Run(scope, place);
+
+    LodTensorArrayCreateFromLodTensorArray(scope, Input("X"), dx_name);
+    auto &grad_inx =
+        *scope.FindVar(dx_name)->GetMutable<framework::LoDTensorArray>();
+
+    for (size_t i = 0; i < grad_names.size(); i++) {
+      std::string var_name = grad_names[i];
+      auto &feed_input = scope.FindVar(var_name)->Get<framework::LoDTensor>();
+      grad_inx[i].ShareDataWith(feed_input);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+USE_OP(concat);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(tensor_array_to_tensor, ops::LoDTensorArray2TensorOp,
+                  ops::LoDTensorArray2TensorOpMaker,
+                  ops::LoDTensorArray2TensorOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(tensor_array_to_tensor_grad, ops::LoDTensorArray2TensorGradOp,
+                  ops::LoDTensorArray2TensorGradInferShape,
+                  ops::LoDTensorArray2TensorGradInferVarType);
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 09a7cb8dc9..57e5d197b6 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -24,10 +24,10 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'concat',
-    'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant',
-    'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf',
-    'has_nan', 'isfinite'
+    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
+    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
+    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
+    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
 ]
 
 
@@ -193,6 +193,60 @@ def concat(input, axis=0, name=None):
     return out
 
 
+def tensor_array_to_tensor(input, axis=1, name=None):
+    """
+    This function concatenates the input LodTensorArray along the axis mentioned
+    and returns that as the output.
+
+    A simple example as below:
+
+    .. code-block:: text
+
+        Given:
+
+        input.data = {[[0.6, 0.1, 0.3],
+                       [0.5, 0.3, 0.2]],
+                      [[1.3],
+                       [1.8]],
+                      [[2.3, 2.1],
+                       [2.5, 2.4]]}
+
+        axis = 1
+
+        Then:
+
+        output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
+                       [0.5, 0.3, 0.2, 1.8, 2.5, 2.4]]
+
+        output_index.data = [3, 1, 2]
+
+    Args:
+        input(list): Input LodTensorArray
+        axis(int): Integer axis along which the tensors will be concatenated
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: Output variable of the concatenation
+        Variable: The input LodTensorArray items' dims along the axis
+
+    Examples:
+        .. code-block:: python
+
+           output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array)
+    """
+    helper = LayerHelper('tensor_array_to_tensor', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    out_index = helper.create_variable_for_type_inference(dtype="int32")
+    helper.append_op(
+        type='tensor_array_to_tensor',
+        inputs={'X': input},
+        outputs={'Out': [out],
+                 'OutIndex': [out_index]},
+        attrs={'axis': axis})
+    return out, out_index
+
+
 def sums(input, out=None):
     """
     This function performs the sum operation on the input and returns the
diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
new file mode 100644
index 0000000000..78b95de7e0
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
@@ -0,0 +1,142 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle.fluid.executor import Executor
+
+
+class TestLoDTensorArrayConcat(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "tensor_array_to_tensor"
+        self.attrs = {"axis": 0}
+        self.outputs = ["Out"]
+
+    def test_get_set(self):
+        scope = core.Scope()
+        program = fluid.Program()
+        block = program.global_block()
+
+        input_arr = block.create_var(
+            name="tmp_lod_tensor_array",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        input_arr.persistable = True
+        input_arr_var = scope.var('tmp_lod_tensor_array')
+        input_tensor_array = input_arr_var.get_lod_tensor_array()
+        self.assertEqual(0, len(input_tensor_array))
+
+        cpu = core.CPUPlace()
+        for i in range(10):
+            t = core.LoDTensor()
+            if i == 0:
+                t.set(numpy.array([[i], [i]], dtype='float32'), cpu)
+            else:
+                t.set(numpy.array([[i]], dtype='float32'), cpu)
+            input_tensor_array.append(t)
+
+        self.assertEqual(10, len(input_tensor_array))
+
+        random_grad = numpy.random.random_sample([11]).astype(numpy.float32)
+
+        y_out = block.create_var(name="Out")
+        y_out.persistable = True
+        y_out_index = block.create_var(name="OutIndex")
+        y_out_index.persistable = True
+
+        y_grad_arr = block.create_var(
+            name='Out@GRAD', dtype='float32', shape=[11])
+        y_grad_arr.persistable = True
+        y_grad = scope.var('Out@GRAD')
+        y_grad_tensor = y_grad.get_tensor()
+        y_grad_tensor.set(random_grad, cpu)
+
+        op = block.append_op(
+            type=self.op_type,
+            inputs={"X": input_arr},
+            outputs={"Out": y_out,
+                     "OutIndex": y_out_index},
+            attrs=self.attrs)
+
+        out_grad = block.create_var(
+            name="tmp_lod_tensor_array@GRAD",
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+        out_grad.persistable = True
+
+        grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc,
+                                                                  set(), [])
+        grad_op_desc = grad_op_desc_list[0]
+        new_op_desc = block.desc.append_op()
+        new_op_desc.copy_from(grad_op_desc)
+        for var_name in grad_op_desc.output_arg_names():
+            block.desc.var(var_name.encode("ascii"))
+
+        grad_op_desc.infer_var_type(block.desc)
+        grad_op_desc.infer_shape(block.desc)
+        for arg in grad_op_desc.output_arg_names():
+            grad_var = block.desc.find_var(arg.encode("ascii"))
+            grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+        fetch_list = []
+        fetch_list.append(block.var('Out'))
+        fetch_list.append(block.var('OutIndex'))
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        out = exe.run(program, fetch_list=fetch_list, scope=scope)
+        # out[0]: concatenated tensor; out[1]: per-item sizes along axis 0
+
+        # test forward
+        tensor_res = numpy.array(out[0])
+        tensor_res_out_idx = numpy.array(out[1])
+        tensor_gt = numpy.array(
+            [0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float32')
+
+        self.assertEqual(len(tensor_res), len(tensor_gt))
+        self.assertEqual(len(tensor_res_out_idx), 10)
+
+        for i in range(len(tensor_res)):
+            self.assertEqual(tensor_res[i], tensor_gt[i])
+
+        for i in range(len(tensor_res_out_idx)):
+            if i == 0:
+                self.assertEqual(tensor_res_out_idx[i], 2)
+            else:
+                self.assertEqual(tensor_res_out_idx[i], 1)
+
+        # test backward: item 0 owns grad rows 0-1, item i >= 1 owns row i + 1
+        grad_tensor = scope.var('tmp_lod_tensor_array@GRAD')
+        grad_tensor_array = grad_tensor.get_lod_tensor_array()
+
+        self.assertEqual(10, len(grad_tensor_array))
+
+        for i in range(len(grad_tensor_array)):
+            if i == 0:
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i])[0],
+                    numpy.array(random_grad[i]))
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i])[1],
+                    numpy.array(random_grad[i + 1]))
+            else:
+                self.assertEqual(
+                    numpy.array(grad_tensor_array[i]),
+                    numpy.array(random_grad[i + 1]))
+
+
+if __name__ == '__main__':
+    unittest.main()

From d03cbd1b8ca15eeb121521da8a18909e990af758 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 11 Nov 2018 10:36:58 +0800
Subject: [PATCH 22/25] follow comment test=develop

---
 ...stribute_lookuptable_utils.py => distribute_lookup_table.py} | 0
 python/paddle/fluid/optimizer.py                                | 2 +-
 python/paddle/fluid/transpiler/distribute_transpiler.py         | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename python/paddle/fluid/{transpiler/details/distribute_lookuptable_utils.py => distribute_lookup_table.py} (100%)

diff --git a/python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py b/python/paddle/fluid/distribute_lookup_table.py
similarity index 100%
rename from python/paddle/fluid/transpiler/details/distribute_lookuptable_utils.py
rename to python/paddle/fluid/distribute_lookup_table.py
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 94d171d83d..da92826d41 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -18,7 +18,7 @@ from collections import defaultdict
 from contextlib import contextmanager
 
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
-from paddle.fluid.transpiler.details.distribute_lookuptable_utils import find_distributed_lookup_table
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 
 from . import framework
 from . import layers
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index b6179864a2..7c0e8dd9fc 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -41,7 +41,7 @@ from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
     Parameter, grad_var_name
 from .details import *
-from .details.distribute_lookuptable_utils import find_distributed_lookup_table
+from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
 
 LOOKUP_TABLE_TYPE = "lookup_table"

From 45eebf69e8bedf8103e07642227de9f6f6600cd6 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 11 Nov 2018 10:54:41 +0800
Subject: [PATCH 23/25] reduce pass num of test_label_semantic_roles to avoid
 test timeout test=develop

---
 python/paddle/fluid/tests/book/test_label_semantic_roles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index f63387a906..42ab9b2311 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -38,7 +38,7 @@ depth = 8
 mix_hidden_lr = 1e-3
 
 IS_SPARSE = True
-PASS_NUM = 10
+PASS_NUM = 1
 BATCH_SIZE = 10
 
 embedding_name = 'emb'

From 04da1dcfb80e0ff7b49dbea8e5027e6b73cf4ba0 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Sun, 11 Nov 2018 13:05:02 +0800
Subject: [PATCH 24/25] optimize import test=develop

---
 python/paddle/fluid/__init__.py                    | 1 +
 python/paddle/fluid/transpiler/details/__init__.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index c4cfd8e468..876775a6f3 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -34,6 +34,7 @@ from . import regularizer
 from . import average
 from . import metrics
 from . import transpiler
+from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
 from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index 9671b60007..f33c05ed2f 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -17,4 +17,3 @@ from __future__ import print_function
 from .program_utils import *
 from .ufind import *
 from .checkport import *
-from .distribute_lookuptable_utils import *

From 792bf0b77f1bc91c757f4465cc3f7ce84746bd20 Mon Sep 17 00:00:00 2001
From: Yu Yang <reyoung@126.com>
Date: Mon, 12 Nov 2018 11:12:40 +0800
Subject: [PATCH 25/25] Fix for cmake 3.11 (#14350)

test=develop
---
 cmake/external/protobuf.cmake | 102 ++++++++++++++++------------------
 1 file changed, 49 insertions(+), 53 deletions(-)

diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 550b0dada8..45ef9b4550 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY)
 UNSET_VAR(PROTOBUF_LIBRARY)
 UNSET_VAR(PROTOBUF_INCLUDE_DIR)
 UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
+function(protobuf_generate_python SRCS)
+    # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+    if(NOT ARGN)
+        message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+        return()
+    endif()
 
-if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
-    function(protobuf_generate_python SRCS)
-        # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
-        if(NOT ARGN)
-            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
-            return()
-        endif()
-
-        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-            # Create an include path for each file specified
-            foreach(FIL ${ARGN})
-                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
-                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-                if(${_contains_already} EQUAL -1)
-                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
-                endif()
-            endforeach()
-        else()
-            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
-        endif()
-
-        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
-            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
-        endif()
-
-        if(DEFINED Protobuf_IMPORT_DIRS)
-            foreach(DIR ${Protobuf_IMPORT_DIRS})
-                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
-                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-                if(${_contains_already} EQUAL -1)
-                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
-                endif()
-            endforeach()
-        endif()
-
-        set(${SRCS})
+    if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+        # Create an include path for each file specified
         foreach(FIL ${ARGN})
             get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-            get_filename_component(FIL_WE ${FIL} NAME_WE)
-            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-                if(FIL_DIR)
-                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-                endif()
+            get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+            if(${_contains_already} EQUAL -1)
+                list(APPEND _protobuf_include_path -I ${ABS_PATH})
             endif()
+        endforeach()
+    else()
+        set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+    endif()
+    if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+        set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+    endif()
 
-            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
-            add_custom_command(
-                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
-                    COMMAND  ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
-                    DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
-                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
-                    VERBATIM )
+    if(DEFINED Protobuf_IMPORT_DIRS)
+        foreach(DIR ${Protobuf_IMPORT_DIRS})
+            get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+            list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+            if(${_contains_already} EQUAL -1)
+                list(APPEND _protobuf_include_path -I ${ABS_PATH})
+            endif()
         endforeach()
+    endif()
 
-        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-    endfunction()
-endif()
+    set(${SRCS})
+    foreach(FIL ${ARGN})
+        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+        get_filename_component(FIL_WE ${FIL} NAME_WE)
+        if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+            if(FIL_DIR)
+                set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+            endif()
+        endif()
+        list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+        add_custom_command(
+                OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                COMMAND  ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
+                COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                VERBATIM )
+    endforeach()
+
+    set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+endfunction()
 
 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.
@@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB)
     # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
     # make `protobuf_generate_cpp` happy.
     SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
+
     FOREACH(dep ${protobuf_DEPS})
         ADD_DEPENDENCIES(protobuf ${dep})
         ADD_DEPENDENCIES(protobuf_lite ${dep})