From b65709e4039f338d90391b0fed9b8f6118b23380 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 19 Sep 2017 16:44:28 +0800 Subject: [PATCH 1/6] Share LoD between input and output of each operator. --- paddle/framework/operator.h | 8 +++++++ paddle/operators/accuracy_op.cc | 7 ++++++- paddle/operators/cos_sim_op.cc | 10 ++++++--- paddle/operators/elementwise_mul_op.cc | 5 +++++ paddle/operators/fc_op.cc | 4 ++++ paddle/operators/fill_zeros_like_op.cc | 21 +++++++++---------- paddle/operators/fill_zeros_like_op.h | 2 +- paddle/operators/lookup_table_op.cc | 11 +++++++--- paddle/operators/mean_op.cc | 3 ++- paddle/operators/minus_op.cc | 8 ++++++- paddle/operators/mul_op.cc | 10 +++++++-- paddle/operators/onehot_cross_entropy_op.cc | 3 +++ paddle/operators/prelu_op.cc | 3 +++ paddle/operators/rowwise_add_op.cc | 1 + paddle/operators/scale_op.cc | 1 + paddle/operators/sigmoid_op.cc | 1 + paddle/operators/squared_l2_distance_op.cc | 4 ++++ paddle/operators/sum_op.cc | 8 +++++-- .../tests/test_fill_zeros_like_op.py | 4 ++-- 19 files changed, 87 insertions(+), 27 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index b7c9c39402..28a253ec0b 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -336,6 +336,14 @@ class InferShapeContext { return &var->Get(); } + void ShareLoD(const std::string& in, const std::string& out) const { + PADDLE_ENFORCE(InputVar(in)->IsType(), + "The Input(%s) must be LoDTensor.", in); + PADDLE_ENFORCE(OutputVar(out)->IsType(), + "The Output(%s) must be LoDTensor.", out); + Output(out)->set_lod(Input(in)->lod()); + } + private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 0c813748b2..32479ae5a3 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -40,6 +40,7 @@ class AccuracyOp : public framework::OperatorWithKernel { "inference size must be the same as label size"); ctx.Output("Accuracy")->Resize({1}); + ctx.ShareLoD("Inference", "Accuracy"); } }; @@ -58,7 +59,11 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { R"DOC(Accuracy. It will print accuracy rate for classification. The accuracy is: .. math:: -accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples})DOC"); +accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples}) + +Both the input `Inference` and `Label` can carry the LoD (Level of Details) +information, or not. But the output only shares the LoD with input `Inference`. +DOC"); } }; diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 72c4464936..840848fa08 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -57,6 +57,7 @@ class CosSimOp : public framework::OperatorWithKernel { ctx.Output("Out")->Resize({x_dims[0], 1}); ctx.Output("XNorm")->Resize({x_dims[0], 1}); ctx.Output("YNorm")->Resize({y_dims[0], 1}); + ctx.ShareLoD("X", "Out"); } }; @@ -81,10 +82,13 @@ Cosine Similarity Operator. The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)). 
-Input(X) and Input(Y) must have the same shape, except that the 1st dimension -of Input(Y) could be just 1 (different from Input(X)), which will be -broadcasted to match the shape of Input(X) before computing their cosine +The input `X` and `Y` must have the same shape, except that the 1st dimension +of input `Y` could be just 1 (different from input `X`), which will be +broadcasted to match the shape of input `X` before computing their cosine similarity. + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index ee6e975b44..304e45fa5b 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -38,6 +38,7 @@ class ElementWiseMulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), "Rank of first input must >= rank of second input.") ctx.Output("Out")->Resize(x_dim); + ctx.ShareLoD("X", "Out"); } }; @@ -63,11 +64,15 @@ Limited elementwise multiple operator. The equation is: Out = X ⊙ Y. 2. Y's shape is a subset of X. Y will be broadcasted to match the shape of X and axis should be dimension index Y in X. example: + shape(X) = (2, 3, 4, 5), shape(Y) = (,) shape(X) = (2, 3, 4, 5), shape(Y) = (5,) shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5) shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input X. )DOC"); } }; diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index e5d0f3c372..56fe654d1e 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -186,6 +186,10 @@ W_i is a 2-D matrix of size (K x N), where N means the number of neurons in the fully connected layer. B is a 1-D vector of size N. Thus, the output Out is a 2-D matrix of size (M x N). Activation type can be set to `identity` (default), `sigmoid` or `softmax`. + +All the inputs can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with first input (`X[0]`). 
+)DOC"); )DOC"); } }; diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index ba7857cc65..a238b59b78 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -23,15 +23,14 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - PADDLE_ENFORCE_NOT_NULL( - ctx.InputVar("Src"), - "Input(Src) of FillZerosLikeOp should not be null."); - PADDLE_ENFORCE_NOT_NULL( - ctx.OutputVar("Dst"), - "Output(Dst) of FillZerosLikeOp should not be null."); - - ctx.Output("Dst")->Resize( - ctx.Input("Src")->dims()); + PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), + "Input(X) of FillZerosLikeOp should not be null."); + PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), + "Output(Y) of FillZerosLikeOp should not be null."); + + ctx.Output("Y")->Resize( + ctx.Input("X")->dims()); + ctx.ShareLoD("X", "Y"); } }; @@ -40,8 +39,8 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { FillZerosLikeOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Src", "The input of fill-zeros-like op."); - AddOutput("Dst", "The varibale will be filled up with zeros."); + AddInput("X", "The input of fill-zeros-like op."); + AddOutput("Y", "The varibale will be filled up with zeros."); AddComment(R"DOC( Fill up a vriable with zeros. diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 969998ce2e..4474581784 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -23,7 +23,7 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* output = context.Output("Dst"); + auto* output = context.Output("Y"); output->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*output); t.device(context.GetEigenDevice()) = t.constant(static_cast(0)); diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 07f6dfabca..8f533f1cc3 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -35,6 +35,7 @@ class LookupTableOp : public framework::OperatorWithKernel { auto output_t = ctx.Output("Out"); output_t->Resize({ids_t->dims()[0], table_t->dims()[1]}); + ctx.ShareLoD("Ids", "Out"); } }; @@ -50,9 +51,13 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "An input with type int32 or int64" "contains the ids to be looked up in W."); AddOutput("Out", "The lookup results, which have the same type with W."); - AddComment( - "This operator is used to perform lookups on the parameter W," - "then concatenated into a dense tensor."); + AddComment(R"DOC( +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input `Ids` can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD with input `Ids`. 
+)DOC"); } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 7d7eeb59a2..96540ff454 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -37,7 +37,8 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of mean op"); AddOutput("Out", "The output of mean op").NotInGradient(); - AddComment("Mean Operator"); + AddComment(R"DOC( Mean Operator +)DOC"); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index a97bbecdca..5036f9f98a 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -41,6 +41,7 @@ class MinusOp : public framework::OperatorWithKernel { left_tensor->numel(), right_tensor->numel(), "Minus operator must take two tensor with same num of elements"); ctx.Output("Out")->Resize(left_tensor->dims()); + ctx.ShareLoD("X", "Out"); } }; @@ -54,7 +55,12 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC(Minus Operator -Equation: Out = X - Y +Equation: + + Out = X - Y + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index b6d320b415..b2409a1870 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -55,6 +55,7 @@ class MulOp : public framework::OperatorWithKernel { "First matrix's width must be equal with second matrix's height."); ctx.Output("Out")->Resize( {x_mat_dims[0], y_mat_dims[1]}); + ctx.ShareLoD("X", "Out"); } }; @@ -83,9 +84,14 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(1) .EqualGreaterThan(1); AddComment(R"DOC( -Two Element Mul Operator. +Mul operator is used to perform matrix multiplication for input X and Y. -The equation is: Out = X * Y +The equation is: + + Out = X * Y + +Both the input `X` and `Y` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/onehot_cross_entropy_op.cc b/paddle/operators/onehot_cross_entropy_op.cc index f38be3549f..1d87032d27 100644 --- a/paddle/operators/onehot_cross_entropy_op.cc +++ b/paddle/operators/onehot_cross_entropy_op.cc @@ -40,6 +40,7 @@ class OnehotCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1."); PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]); ctx.Output("Y")->Resize({X->dims()[0], 1}); + ctx.ShareLoD("X", "Y"); } }; @@ -69,6 +70,8 @@ OnehotCrossEntropy Operator. Y[i] = -log(X[i][j]) +Both the input `X` and `Label` can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with input `X`. )DOC"); } }; diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 7ae80b2968..2b7b82a3e1 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -38,6 +38,7 @@ class PReluOp : public framework::OperatorWithKernel { "Output(Out) should not be null"); auto *out = ctx.Output("Out"); out->Resize(in->dims()); + ctx.ShareLoD("X", "Out"); } }; @@ -55,6 +56,8 @@ The equation is: f(x) = alpha * x , for x < 0 f(x) = x , for x >= 0 +The input `X` can carry the LoD (Level of Details) information, +or not. And the output shares the LoD with input `X`. 
)DOC"); } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 2a3fd3be94..90cdb2558b 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -45,6 +45,7 @@ class RowwiseAddOp : public framework::OperatorWithKernel { "The width of two operands must be same"); PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1"); ctx.Output("Out")->Resize(x_dims); + ctx.ShareLoD("X", "Out"); } }; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index d1f42e8662..ca1bc4ac80 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -35,6 +35,7 @@ class ScaleOp : public framework::OperatorWithKernel { auto *in = ctx.Input("X"); auto *out = ctx.Output("Out"); out->Resize(in->dims()); + ctx.ShareLoD("X", "Out"); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 992b19965e..42befa22d0 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -30,6 +30,7 @@ class SigmoidOp : public framework::OperatorWithKernel { ctx.Output("Y")->Resize( ctx.Input("X")->dims()); + ctx.ShareLoD("X", "Y"); } }; diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index 39f4305877..dfe8e6decd 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -57,6 +57,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { ctx.Output("sub_result") ->Resize({x_dims[0], x->numel() / x_dims[0]}); ctx.Output("Out")->Resize({x_dims[0], 1}); + ctx.ShareLoD("X", "Out"); } }; @@ -79,6 +80,9 @@ class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { input or to 1. If the first dimension of target is 1, SquaredL2DistanceOp will broadcast target's first dimension to input's first dimension. You can decide whether calculate the gradient of input and target. + + Both the input X and Y can carry the LoD (Level of Details) information, + or not. But the output only shares the LoD with input X. )DOC"); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 41e05c27f9..ebc57d6b7b 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -39,6 +39,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape"); } out->Resize(in_dim); + ctx.ShareLoD(ctx.op().Inputs("X")[0], "Out"); } }; @@ -49,8 +50,11 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "the input tensors of sum operator.").AsDuplicable(); AddOutput("Out", "the output tensor of sum operator."); AddComment(R"DOC( - Sum the input tensors. - )DOC"); +Sum the input tensors. + +All the inputs can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD with the first input. 
+)DOC"); } }; diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py index 2473daaba2..eff8fa87d9 100644 --- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py +++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py @@ -6,8 +6,8 @@ from op_test import OpTest class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" - self.inputs = {'Src': np.random.random((219, 232)).astype("float32")} - self.outputs = {'Dst': np.zeros_like(self.inputs["Src"])} + self.inputs = {'X': np.random.random((219, 232)).astype("float32")} + self.outputs = {'Y': np.zeros_like(self.inputs["X"])} def test_check_output(self): self.check_output() From 3ec48480af191970ad12377c9161994920fbf722 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 20 Sep 2017 19:05:36 +0800 Subject: [PATCH 2/6] Fix bug --- paddle/operators/accuracy_op.cc | 6 +++--- paddle/operators/dropout_op.cc | 1 + paddle/operators/fc_op.cc | 1 - paddle/operators/pad_op.cc | 5 +++++ paddle/operators/reshape_op.cc | 5 +++++ 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 32479ae5a3..391258b40b 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -55,15 +55,15 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { // TODO(typhoonzero): AddInput("Weight", ... AddOutput("Accuracy", "The accuracy of current batch"); - AddComment( - R"DOC(Accuracy. It will print accuracy rate for classification. + AddComment(R"DOC( +Accuracy. It will print accuracy rate for classification. The accuracy is: .. math:: accuracy = \\frac{NumOfCorrectPredicts}{NumOfAllSamples}) Both the input `Inference` and `Label` can carry the LoD (Level of Details) information, or not. But the output only shares the LoD with input `Inference`. -DOC"); +)DOC"); } }; diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index b111b9fccb..bfa1992d79 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -38,6 +38,7 @@ class DropoutOp : public framework::OperatorWithKernel { if (ctx.Attr("is_training") == 1) { ctx.Output("Mask")->Resize(dims); } + ctx.ShareLoD("X", "Out"); } }; diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 56fe654d1e..5ac0e8cc45 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -189,7 +189,6 @@ Activation type can be set to `identity` (default), `sigmoid` or `softmax`. All the inputs can carry the LoD (Level of Details) information, or not. But the output only shares the LoD with first input (`X[0]`). -)DOC"); )DOC"); } }; diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index a0b1c6b631..98de18fb9f 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -41,6 +41,11 @@ class PadOp : public framework::OperatorWithKernel { } ctx.Output("Out")->Resize( framework::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. 
+ ctx.ShareLoD("X", "Out"); + } } }; diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index 0d05e34414..c090758619 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -51,6 +51,11 @@ class ReshapeOp : public framework::OperatorWithKernel { [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); ctx.Output("Out")->Resize(out_dims); + if (shape[0] == in->dims()[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + ctx.ShareLoD("X", "Out"); + } } }; From 36aeb30d12b2cdb7a763b59c70bc427eec7a49e3 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 21 Sep 2017 11:07:56 +0800 Subject: [PATCH 3/6] Remove LoDTensor in some operators' InferShape and refine ShareLoD function. --- paddle/framework/operator.cc | 15 +++---- paddle/framework/operator.h | 51 +++++++--------------- paddle/operators/accuracy_op.cc | 4 +- paddle/operators/add_op.cc | 2 +- paddle/operators/concat_op.cc | 2 +- paddle/operators/cos_sim_op.cc | 14 +++--- paddle/operators/cross_entropy_op.cc | 2 +- paddle/operators/dropout_op.cc | 2 +- paddle/operators/elementwise_mul_op.cc | 10 ++--- paddle/operators/fill_zeros_like_op.cc | 4 +- paddle/operators/gather_op.cc | 4 +- paddle/operators/gaussian_random_op.cc | 2 +- paddle/operators/lookup_table_op.cc | 6 +-- paddle/operators/mean_op.cc | 4 +- paddle/operators/minus_op.cc | 4 +- paddle/operators/mul_op.cc | 10 ++--- paddle/operators/pad_op.cc | 6 +-- paddle/operators/prelu_op.cc | 8 ++-- paddle/operators/reshape_op.cc | 6 +-- paddle/operators/rowwise_add_op.cc | 8 ++-- paddle/operators/scale_op.cc | 4 +- paddle/operators/scatter_op.cc | 7 ++- paddle/operators/sequence_avg_pool_op.cc | 5 +-- paddle/operators/sgd_op.cc | 2 +- paddle/operators/sigmoid_op.cc | 7 ++- paddle/operators/softmax_op.cc | 5 +-- paddle/operators/split_op.cc | 2 +- paddle/operators/squared_l2_distance_op.cc | 12 +++-- paddle/operators/sum_op.cc | 6 +-- paddle/operators/top_k_op.cc | 4 +- paddle/operators/uniform_random_op.cc | 2 +- 31 files changed, 93 insertions(+), 127 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f8a64a7866..fdc0660837 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -207,23 +207,22 @@ const std::vector InferShapeContext::MultiInput( } template <> -Tensor* ExecutionContext::Output(const std::string& name) const { - auto* var = OutputVar(name); - return var == nullptr ? nullptr : const_cast(GetTensorFromVar(var)); +Tensor* InferShapeContext::Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); } template <> -std::vector ExecutionContext::MultiOutput( +std::vector InferShapeContext::MultiOutput( const std::string& name) const { auto names = op().Outputs(name); std::vector res; res.reserve(names.size()); std::transform(names.begin(), names.end(), std::back_inserter(res), [&](const std::string& sub_name) { - auto var = scope().FindVar(sub_name); - return var == nullptr - ? nullptr - : const_cast(GetTensorFromVar(var)); + auto var = scope_.FindVar(sub_name); + return var == nullptr ? 
nullptr + : var->GetMutable(); }); return res; } diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 28a253ec0b..4a078258d2 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -277,9 +277,9 @@ class InferShapeContext { return res; } - std::vector MultiOutputVar(const std::string& name) const { + std::vector MultiOutputVar(const std::string& name) const { auto names = op_.Outputs(name); - std::vector res; + std::vector res; res.reserve(names.size()); std::transform(names.begin(), names.end(), std::back_inserter(res), [this](const std::string& name) { @@ -336,12 +336,19 @@ class InferShapeContext { return &var->Get(); } - void ShareLoD(const std::string& in, const std::string& out) const { - PADDLE_ENFORCE(InputVar(in)->IsType(), - "The Input(%s) must be LoDTensor.", in); - PADDLE_ENFORCE(OutputVar(out)->IsType(), - "The Output(%s) must be LoDTensor.", out); - Output(out)->set_lod(Input(in)->lod()); + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const { + PADDLE_ENFORCE_LT(i, InputSize(in)); + PADDLE_ENFORCE_LT(j, OutputSize(out)); + auto* in_var = MultiInputVar(in)[i]; + auto* out_var = MultiOutputVar(out)[j]; + PADDLE_ENFORCE(in_var->IsType(), + "The %d-th input of Input(%s) must be LoDTensor.", in); + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); } private: @@ -388,38 +395,10 @@ class ExecutionContext : public InferShapeContext { return device_context_; } - // redefine Output function, - // use Variable::Get instead of Variable::GetMutable - template - T* Output(const std::string& name) const { - auto var = OutputVar(name); - return var == nullptr ? nullptr : const_cast(&var->Get()); - } - - // redefine MultiOutput function. 
- // use Variable::Get instead of Variable::GetMutable - template - std::vector MultiOutput(const std::string& name) const { - auto names = op().Outputs(name); - std::vector res; - res.reserve(names.size()); - std::transform( - names.begin(), names.end(), std::back_inserter(res), - [&](const std::string& sub_name) { return Output(sub_name); }); - return res; - } - private: const platform::DeviceContext& device_context_; }; -template <> -Tensor* ExecutionContext::Output(const std::string& name) const; - -template <> -std::vector ExecutionContext::MultiOutput( - const std::string& name) const; - class OpKernel { public: /** diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 391258b40b..70e4f9da12 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -39,8 +39,8 @@ class AccuracyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0], "inference size must be the same as label size"); - ctx.Output("Accuracy")->Resize({1}); - ctx.ShareLoD("Inference", "Accuracy"); + ctx.Output("Accuracy")->Resize({1}); + ctx.ShareLoD("Inference", /*->*/ "Accuracy"); } }; diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc index e83c1efeaf..ed11d09697 100644 --- a/paddle/operators/add_op.cc +++ b/paddle/operators/add_op.cc @@ -33,7 +33,7 @@ class AddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), ctx.Input("Y")->dims(), "Two input of Add Op's dimension must be same."); - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 223bb0ffe6..07f847079e 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -29,7 +29,7 @@ class ConcatOp : public framework::OperatorWithKernel { "Output(Out) of ConcatOp should not be null."); auto ins = ctx.MultiInput("X"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); size_t axis = static_cast(ctx.Attr("axis")); size_t n = ins.size(); diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 840848fa08..b56ee2047b 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -54,10 +54,10 @@ class CosSimOp : public framework::OperatorWithKernel { " just 1 (which will be broadcasted to match Input(X))."); // resize tensor - ctx.Output("Out")->Resize({x_dims[0], 1}); - ctx.Output("XNorm")->Resize({x_dims[0], 1}); - ctx.Output("YNorm")->Resize({y_dims[0], 1}); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize({x_dims[0], 1}); + ctx.Output("XNorm")->Resize({x_dims[0], 1}); + ctx.Output("YNorm")->Resize({y_dims[0], 1}); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -143,10 +143,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { "Shape of Input(Out@Grad) must be [X.Dim(0), 1]."); // resize tensor - auto *x_grad = - ctx.Output(framework::GradVarName("X")); - auto *y_grad = - ctx.Output(framework::GradVarName("Y")); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); if (x_grad) x_grad->Resize(x_dims); if (y_grad) y_grad->Resize(y_dims); } diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 5de8f1489d..fd91d39d5f 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -52,7 +52,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { } 
ctx.Output("Y")->Resize({x->dims()[0], 1}); - ctx.ShareLoD("X", "Y"); + ctx.ShareLoD("X", /*->*/ "Y"); } }; diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index bfa1992d79..dc773e510e 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -38,7 +38,7 @@ class DropoutOp : public framework::OperatorWithKernel { if (ctx.Attr("is_training") == 1) { ctx.Output("Mask")->Resize(dims); } - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 304e45fa5b..02bd4c7b85 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -37,8 +37,8 @@ class ElementWiseMulOp : public framework::OperatorWithKernel { auto y_dim = ctx.Input("Y")->dims(); PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), "Rank of first input must >= rank of second input.") - ctx.Output("Out")->Resize(x_dim); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize(x_dim); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -91,10 +91,8 @@ class ElementWiseMulOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx.Input("X")->dims(); auto y_dims = ctx.Input("Y")->dims(); auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); - auto *x_grad = - ctx.Output(framework::GradVarName("X")); - auto *y_grad = - ctx.Output(framework::GradVarName("Y")); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input.") diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index a238b59b78..761a527a55 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -28,9 +28,9 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) of FillZerosLikeOp should not be null."); - ctx.Output("Y")->Resize( + ctx.Output("Y")->Resize( ctx.Input("X")->dims()); - ctx.ShareLoD("X", "Y"); + ctx.ShareLoD("X", /*->*/ "Y"); } }; diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index d445b61c16..fecd1ce214 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -35,7 +35,7 @@ class GatherOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0"); framework::DDim output_dims(ctx.Input("X")->dims()); output_dims[0] = batch_size; - ctx.Output("Out")->Resize(output_dims); + ctx.Output("Out")->Resize(output_dims); } }; @@ -45,7 +45,7 @@ class GatherGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - auto X_grad = ctx.Output(framework::GradVarName("X")); + auto X_grad = ctx.Output(framework::GradVarName("X")); auto X = ctx.Input("X"); X_grad->Resize(X->dims()); diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index c0e161bbc0..5b7cbb5cc7 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -48,7 +48,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ctx.OutputVar("Out"), "Output(Out) of GaussianRandomOp should not be null."); - auto* tensor = ctx.Output("Out"); + auto* tensor = ctx.Output("Out"); auto dims = Attr>("dims"); std::vector temp; temp.reserve(dims.size()); diff --git 
a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 8f533f1cc3..04ac24662e 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -32,10 +32,10 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_t = ctx.Input("W"); auto ids_t = ctx.Input("Ids"); - auto output_t = ctx.Output("Out"); + auto output_t = ctx.Output("Out"); output_t->Resize({ids_t->dims()[0], table_t->dims()[1]}); - ctx.ShareLoD("Ids", "Out"); + ctx.ShareLoD("Ids", /*->*/ "Out"); } }; @@ -69,7 +69,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext &context) const override { auto table = context.Input("W"); auto d_table = - context.Output(framework::GradVarName("W")); + context.Output(framework::GradVarName("W")); d_table->Resize(table->dims()); } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 96540ff454..b04384bda8 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -27,7 +27,7 @@ class MeanOp : public framework::OperatorWithKernel { "Input(X) of MeanOp should not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), "Output(Out) of MeanOp should not be null."); - ctx.Output("Out")->Resize({1}); + ctx.Output("Out")->Resize({1}); } }; @@ -48,7 +48,7 @@ class MeanGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output(framework::GradVarName("X")) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 5036f9f98a..29cb85489b 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -40,8 +40,8 @@ class MinusOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( left_tensor->numel(), right_tensor->numel(), "Minus operator must take two tensor with same num of elements"); - ctx.Output("Out")->Resize(left_tensor->dims()); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize(left_tensor->dims()); + ctx.ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index b2409a1870..5303a31501 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -53,9 +53,9 @@ class MulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( x_mat_dims[1], y_mat_dims[0], "First matrix's width must be equal with second matrix's height."); - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( {x_mat_dims[0], y_mat_dims[1]}); - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -109,10 +109,8 @@ class MulOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx.Input("X")->dims(); auto y_dims = ctx.Input("Y")->dims(); auto out_dims = ctx.Input(framework::GradVarName("Out"))->dims(); - auto *x_grad = - ctx.Output(framework::GradVarName("X")); - auto *y_grad = - ctx.Output(framework::GradVarName("Y")); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *y_grad = ctx.Output(framework::GradVarName("Y")); auto x_mat_dims = framework::flatten_to_2d(x_dims, Attr("x_num_col_dims")); diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index 98de18fb9f..375d8a35ac 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -39,12 +39,12 @@ class PadOp : public framework::OperatorWithKernel { for (int i = 0; i < x_dim.size(); ++i) { out_dims[i] = x_dim[i] + paddings[i * 2] + 
paddings[i * 2 + 1]; } - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( framework::make_ddim(out_dims)); if (out_dims[0] == x_dim[0]) { // Only pass LoD when the first dimension is equal between // output and input. - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } } }; @@ -106,7 +106,7 @@ class PadOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); auto x_dims = ctx.Input("X")->dims(); - auto *x_g = ctx.Output(framework::GradVarName("X")); + auto *x_g = ctx.Output(framework::GradVarName("X")); if (x_g != nullptr) { x_g->Resize(x_dims); } diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 2b7b82a3e1..912196c190 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -36,9 +36,9 @@ class PReluOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"), "Output(Out) should not be null"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); out->Resize(in->dims()); - ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -72,11 +72,11 @@ class PReluGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null."); PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); - auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dx = ctx.Output(framework::GradVarName("X")); auto *x = ctx.Input("X"); auto *dalpha = - ctx.Output(framework::GradVarName("Alpha")); + ctx.Output(framework::GradVarName("Alpha")); auto *alpha = ctx.Input("Alpha"); dx->Resize(x->dims()); diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index c090758619..ddb93007e2 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -50,11 +50,11 @@ class ReshapeOp : public framework::OperatorWithKernel { std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); auto out_dims = framework::make_ddim(shape_int64); - ctx.Output("Out")->Resize(out_dims); + ctx.Output("Out")->Resize(out_dims); if (shape[0] == in->dims()[0]) { // Only pass LoD when the first dimension is equal between // output and input. 
- ctx.ShareLoD("X", "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } } }; @@ -99,7 +99,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); auto dims = ctx.Input("X")->dims(); - auto *d_in = ctx.Output(framework::GradVarName("X")); + auto *d_in = ctx.Output(framework::GradVarName("X")); d_in->Resize(dims); } }; diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc index 90cdb2558b..fc3ad721f2 100644 --- a/paddle/operators/rowwise_add_op.cc +++ b/paddle/operators/rowwise_add_op.cc @@ -44,8 +44,8 @@ class RowwiseAddOp : public framework::OperatorWithKernel { framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims, "The width of two operands must be same"); PADDLE_ENFORCE_EQ(ctx.OutputSize("Out"), 1, "The output size must be 1"); - ctx.Output("Out")->Resize(x_dims); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize(x_dims); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -84,8 +84,8 @@ class RowwiseAddGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims, "The width of two operands must be same"); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *db = ctx.Output(framework::GradVarName("b")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *db = ctx.Output(framework::GradVarName("b")); if (dx) dx->Resize(x_dims); if (db) db->Resize(b_dims); } diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index ca1bc4ac80..3940037c37 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -33,9 +33,9 @@ class ScaleOp : public framework::OperatorWithKernel { "Output(Out) of ScaleOp should not be null."); auto *in = ctx.Input("X"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); out->Resize(in->dims()); - ctx.ShareLoD("X", "Out"); + // ctx.ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 8820262732..3f02081a06 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -44,7 +44,7 @@ class ScatterOp : public framework::OperatorWithKernel { framework::DDim data_dim(ctx.Input("Updates")->dims()); for (int i = 1; i < data_dim.size(); ++i) PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input("Updates")->dims()[i]); - ctx.Output("Out")->Resize( + ctx.Output("Out")->Resize( ctx.Input("Ref")->dims()); } }; @@ -56,10 +56,9 @@ class ScatterGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { auto *dUpdates = - ctx.Output(framework::GradVarName("Updates")); + ctx.Output(framework::GradVarName("Updates")); auto *Updates = ctx.Input("Updates"); - auto *dRef = - ctx.Output(framework::GradVarName("Ref")); + auto *dRef = ctx.Output(framework::GradVarName("Ref")); auto *Ref = ctx.Input("Ref"); dRef->Resize(Ref->dims()); diff --git a/paddle/operators/sequence_avg_pool_op.cc b/paddle/operators/sequence_avg_pool_op.cc index 9815b8f3a8..11d42ac44e 100644 --- a/paddle/operators/sequence_avg_pool_op.cc +++ b/paddle/operators/sequence_avg_pool_op.cc @@ -38,7 +38,7 @@ class SequenceAvgPoolOp : public framework::OperatorWithKernel { /*batch size = */ static_cast(lod[0].size() - 1), "The first dimension of Input(X) must be large than batch size."); dims[0] = lod[0].size() - 1; - ctx.Output("Out")->Resize({dims}); + ctx.Output("Out")->Resize({dims}); } }; 
@@ -74,8 +74,7 @@ class SequenceAvgPoolGradOp : public framework::OperatorWithKernel { for (int64_t i = 1; i < og_dims.size(); ++i) { PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); } - auto* x_grad = - ctx.Output(framework::GradVarName("X")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); x_grad->Resize(x_dims); } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 1232e64c7f..b063e24272 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -33,7 +33,7 @@ class SGDOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx.Input("param")->dims(), ctx.Input("grad")->dims(), "Two input of SGD Op's dimension must be same."); - ctx.Output("param_out") + ctx.Output("param_out") ->Resize(ctx.Input("param")->dims()); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index 42befa22d0..d2a38d1ebe 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -28,9 +28,8 @@ class SigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Y"), "Output(Y) of SigmoidOp should not be null."); - ctx.Output("Y")->Resize( - ctx.Input("X")->dims()); - ctx.ShareLoD("X", "Y"); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); + ctx.ShareLoD("X", /*->*/ "Y"); } }; @@ -51,7 +50,7 @@ class SigmoidOpGrad : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { - ctx.Output(framework::GradVarName("X")) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("Y")->dims()); } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index c67eb028c8..e15cfe4850 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -30,8 +30,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx.Input("X")->dims().size() == 2UL, "The input of softmax op must be a matrix."); - ctx.Output("Y")->Resize( - ctx.Input("X")->dims()); + ctx.Output("Y")->Resize(ctx.Input("X")->dims()); } }; @@ -77,7 +76,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ctx.Input(framework::GradVarName("Y"))->dims(), "Input(Y) and its gradients should have a same shape."); - ctx.Output(framework::GradVarName("X")) + ctx.Output(framework::GradVarName("X")) ->Resize(ctx.Input("X")->dims()); } }; diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc index 61296f5c81..a9d35b4fb7 100644 --- a/paddle/operators/split_op.cc +++ b/paddle/operators/split_op.cc @@ -27,7 +27,7 @@ class SplitOp : public framework::OperatorWithKernel { void InferShape(const framework::InferShapeContext &ctx) const override { // infershape auto *in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); + auto outs = ctx.MultiOutput("Out"); size_t axis = static_cast(ctx.Attr("axis")); size_t num = static_cast(ctx.Attr("num")); std::vector sections = diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index dfe8e6decd..33a564b05b 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -54,10 +54,10 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { "First dimension of target must be equal to input " "or to 1."); - ctx.Output("sub_result") + ctx.Output("sub_result") ->Resize({x_dims[0], x->numel() / x_dims[0]}); - ctx.Output("Out")->Resize({x_dims[0], 1}); - ctx.ShareLoD("X", "Out"); + ctx.Output("Out")->Resize({x_dims[0], 
1}); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -104,10 +104,8 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(out_dims[1], 1, "Second dimension of output gradient " "must be 1."); - auto* x_grad = - ctx.Output(framework::GradVarName("X")); - auto* y_grad = - ctx.Output(framework::GradVarName("Y")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* y_grad = ctx.Output(framework::GradVarName("Y")); if (x_grad) x_grad->Resize(x_dims); if (y_grad) y_grad->Resize(y_dims); } diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index ebc57d6b7b..437fc262f3 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -28,7 +28,7 @@ class SumOp : public framework::OperatorWithKernel { "Output(Out) of SumOp should not be null."); auto ins = ctx.MultiInput("X"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); int N = ins.size(); auto in_dim = ins[0]->dims(); @@ -39,7 +39,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape"); } out->Resize(in_dim); - ctx.ShareLoD(ctx.op().Inputs("X")[0], "Out"); + ctx.ShareLoD("X", /*->*/ "Out"); } }; @@ -65,7 +65,7 @@ class SumGradOp : public framework::OperatorWithKernel { protected: void InferShape(const framework::InferShapeContext &ctx) const override { auto outputs = - ctx.MultiOutput(framework::GradVarName("X")); + ctx.MultiOutput(framework::GradVarName("X")); auto dims = ctx.Input(framework::GradVarName("Out"))->dims(); for (auto output : outputs) { output->Resize(dims); diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc index 169b815fef..a6e43964e9 100644 --- a/paddle/operators/top_k_op.cc +++ b/paddle/operators/top_k_op.cc @@ -40,8 +40,8 @@ class TopkOp : public framework::OperatorWithKernel { framework::DDim dims = input->dims(); dims[dims.size() - 1] = k; - ctx.Output("Out")->Resize(dims); - ctx.Output("Indices")->Resize(dims); + ctx.Output("Out")->Resize(dims); + ctx.Output("Indices")->Resize(dims); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 184bcbc29c..17ea48361b 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -54,7 +54,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(Attr("min") < Attr("max"), "uniform_random's min must less then max"); - auto* tensor = ctx.Output("Out"); + auto* tensor = ctx.Output("Out"); auto dims = Attr>("dims"); std::vector temp; temp.reserve(dims.size()); From a524498efe8f1273bafe7e1a874ef272ce00bb1d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 21 Sep 2017 12:55:46 +0800 Subject: [PATCH 4/6] fix the bug in TeamCity environment. 
--- paddle/framework/operator.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 4a078258d2..82a23797d4 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -342,10 +342,9 @@ class InferShapeContext { PADDLE_ENFORCE_LT(j, OutputSize(out)); auto* in_var = MultiInputVar(in)[i]; auto* out_var = MultiOutputVar(out)[j]; - PADDLE_ENFORCE(in_var->IsType(), - "The %d-th input of Input(%s) must be LoDTensor.", in); + if (!in_var->IsType()) return; PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", out); + "The %d-th output of Output(%s) must be LoDTensor.", j, out); auto in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -363,6 +362,13 @@ template <> const std::vector InferShapeContext::MultiInput( const std::string& name) const; +template <> +Tensor* InferShapeContext::Output(const std::string& name) const; + +template <> +std::vector InferShapeContext::MultiOutput( + const std::string& name) const; + template struct EigenDeviceConverter; From 4f9d82a9c62064c9523bc84a457e05b03a1ddd0d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 22 Sep 2017 10:10:59 +0800 Subject: [PATCH 5/6] Fix bug. --- paddle/framework/backward_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index b4e51ad6ed..6932f5b989 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -127,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker { public: FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("Src", "x"); - AddOutput("Dst", "out"); + AddInput("X", "x"); + AddOutput("Y", "out"); AddComment(""); } }; From fbc0db4a61c76ffa29f8e0df405557b7997b15f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Fri, 22 Sep 2017 11:46:57 +0800 Subject: [PATCH 6/6] Update faq (#4317) * update faq * follow comments --- doc/faq/index_cn.rst | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 00192aa69b..acbf4c87ae 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -158,17 +158,23 @@ PaddlePaddle uses the name :code:`name` as a parameter's ID; parameters with the same name Here, :code:`hidden_a` and :code:`hidden_b` use the same parameter and bias, and the two inputs of the softmax layer also use the same parameter :code:`softmax_param`. -7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform. +7. paddlepaddle\*.whl is not a supported wheel on this platform. ------------------------------------------------------------------------ -The main cause of this problem is that the :code:`wheel` package used when the system built the wheel was the latest one, -while the system's :code:`pip` package is rather old. The fix is to upgrade the :code:`pip` package and rebuild PaddlePaddle. +The main cause of this problem is that no paddlepaddle package matching the current system was found. The latest paddlepaddle python packages support the Linux x86_64 and MacOS 10.12 operating systems, with python 2.7 and pip 9.0.1 installed. + The way to upgrade the :code:`pip` package is\: .. code-block:: bash pip install --upgrade pip +If that still does not work, you can run :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the python package suffixes the current system supports, +and check whether they match the suffix of the package being installed. + +If the system supports :code:`linux_x86_64` while the package is :code:`manylinux1_x86_64`, upgrade pip to the latest version; +if the system supports :code:`manylinux1_x86_64` while the (local) package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and install it again. + 8. 
None of the python-related unit tests pass -------------------------------- @@ -310,7 +316,7 @@ The Paddle binary traps floating-point exceptions at runtime; any floating-point exception (i.e., during training * The model never converges, diverging to extremely large values. * The training data is problematic, driving the parameters into singular states; or the input data is on too large a scale, with some feature values reaching the millions, so matrix multiplications can overflow the floating-point range. -The main solution is to reduce the learning rate (学习律) or normalize the data. +The main solution is to reduce the learning rate (学习率) or normalize the data. 15. After compiling and installing, import paddle.v2 as paddle raises ImportError: No module named v2 ------------------------------------------------------------------------ @@ -373,3 +379,15 @@ A model parameter file saved by PaddlePaddle consists of 16 bytes of header information and the network parameters parameters = paddle.parameters.create(my_cost) parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) + +18. In multi-node cluster training, the logs contain only network-communication errors +------------------------------ + +In multi-node cluster training, the logged errors are all network-communication errors, such as :code:`Connection reset by peer`. +Such errors are usually caused by a failure on one node that makes its training process exit, after which the other nodes can no longer connect. You can troubleshoot with the following steps: + +* In :code:`train.log` and :code:`server.log`, find the earliest error and check whether it was triggered by some other failure (for example an FPE, insufficient memory, or insufficient disk space). + +* If the earliest error is itself a network-communication problem, it is most likely a port conflict caused by non-exclusive execution; contact OP to check whether the current MPI cluster supports submitting with the resource=full parameter, and if so, submit with this parameter and change the job port. + +* If the current MPI cluster does not support exclusive-task mode, ask OP whether the job can be moved to another cluster or the current cluster can be upgraded. \ No newline at end of file
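
Taken together, the series settles on one pattern for LoD propagation: an operator's InferShape resizes its outputs and then calls ctx.ShareLoD(in, out). The sketch below illustrates that pattern for a hypothetical element-wise ExampleOp once patches 1-4 are applied; it is not part of the series, and the explicit template arguments are an assumption, since the quoted diffs render calls such as ctx.Output(...) with their template arguments elided.

// Minimal usage sketch (illustrative only, not part of the patches above).
// "ExampleOp" is hypothetical; the template arguments are assumed to be the
// usual framework::Tensor / framework::LoDTensor ones.
#include "paddle/framework/operator.h"

namespace paddle {
namespace operators {

class ExampleOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
                            "Input(X) of ExampleOp should not be null.");
    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar("Out"),
                            "Output(Out) of ExampleOp should not be null.");

    // An element-wise operator: the output keeps the input's shape ...
    ctx.Output<framework::LoDTensor>("Out")->Resize(
        ctx.Input<framework::Tensor>("X")->dims());

    // ... and inherits its LoD. After PATCH 4/6, ShareLoD silently returns
    // when the selected input holds a plain Tensor rather than a LoDTensor,
    // so the call is safe for both kinds of inputs. The extra i/j arguments
    // (both defaulting to 0) pick one element of a duplicable input/output,
    // which is how SumOp shares the LoD of X[0] after PATCH 3/6.
    ctx.ShareLoD("X", /*->*/ "Out");
  }
};

}  // namespace operators
}  // namespace paddle

As the pad_op and reshape_op hunks show, an operator whose output rows no longer correspond one-to-one to input rows should guard the call instead, sharing the LoD only when the first dimension of the output equals that of the input.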