Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into refactor_registry_macro

8 years ago · 3e6e5c9286
parent fb6bec6a8f f80fea8d0a
commit 3e6e5c9286
73 changed files with 1699 additions and 1821 deletions
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@ -74,13 +74,13 @@ PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以

   .. code-block:: bash

-      docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash
+      docker run -it --rm -v $(pwd):/paddle  paddlepaddle/paddle:0.10.0-dev /bin/bash

   或者，可以以后台进程方式运行容器：

   .. code-block:: bash

-      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev
+      docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D

   然后用密码 :code:`root` SSH进入容器：

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -7,7 +7,7 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

-cc_library(lod_tensor SRCS lod_tensor.cc details/lod_tensor.cc DEPS ddim place tensor)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)

 cc_test(variable_test SRCS variable_test.cc)
@ -15,23 +15,19 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)

-proto_library(attribute_proto SRCS attribute.proto)
-proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
-proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
-cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
-cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+proto_library(framework_proto SRCS framework.proto)

-cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
+cc_library(attribute SRCS attribute.cc DEPS framework_proto)

-cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
+cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)

-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
-cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
+cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
+cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)

-py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
+py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@ -44,7 +44,7 @@ AttrType AttrTypeID<std::vector<std::string>>() {
  return STRINGS;
 }

-Attribute GetAttrValue(const AttrDesc& attr_desc) {
+Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
  switch (attr_desc.type()) {
    case paddle::framework::AttrType::INT: {
      return attr_desc.i();
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@ -20,8 +20,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>

-#include "paddle/framework/attribute.pb.h"
-#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/variant.h"

@ -37,7 +36,7 @@ typedef std::unordered_map<std::string, Attribute> AttributeMap;
 template <typename T>
 AttrType AttrTypeID();

-Attribute GetAttrValue(const AttrDesc& attr_desc);
+Attribute GetAttrValue(const OpDesc::Attr& attr_desc);

 // check whether a value(attribute) fit a certain limit
 template <typename T>
--- a/paddle/framework/attribute.proto
+++ b/paddle/framework/attribute.proto
@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-package paddle.framework;
-
-// Attribute Type for paddle's Op.
-// Op contains many attributes. Each type of attributes could be different.
-// The AttrType will be shared between AttrDesc and AttrProto.
-enum AttrType {
-  INT = 0;
-  FLOAT = 1;
-  STRING = 2;
-  INTS = 3;
-  FLOATS = 4;
-  STRINGS = 5;
-}
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -21,15 +21,25 @@
 namespace paddle {
 namespace framework {

-static bool AllInSet(const std::vector<std::string>& names,
-                     const std::string& suffix,
-                     const std::unordered_set<std::string>& set) {
+template <typename Map, typename T>
+static void ForEachVarName(Map& names, T callback) {
  for (auto& name : names) {
-    if (set.find(name + suffix) == set.end()) {
-      return false;
+    for (auto& n : name.second) {
+      if (callback(n)) return;
    }
  }
-  return true;
+}
+
+// return whether all the names + suffixes in the set
+static bool AllInSet(
+    const std::map<std::string, std::vector<std::string>>& names,
+    const std::string& suffix, const std::unordered_set<std::string>& set) {
+  bool all_in_set = true;
+  ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
+    all_in_set = set.find(n + suffix) != set.end();
+    return !all_in_set;
+  });
+  return all_in_set;
 }

 static std::shared_ptr<OperatorBase> NOP() {
@ -39,7 +49,7 @@ static std::shared_ptr<OperatorBase> NOP() {
  return net_op;
 }

-//  Get backward operator from a forward operator, recursively implementation.
+//  Get backward operator from a forward operator, a recursive implementation.
 //
 //  no_grad_names the gradient variable names without gradient calculating.
 //
@ -47,31 +57,35 @@ static std::shared_ptr<OperatorBase> NOP() {
 //  BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
 //  pass `uniq_id` through recursive calling.
 //
-//  returns The backward operator. For simple situation, it is a simple
-//  operator. For complex situation, it is a NetOp.
+//  returns The backward operator. In a simple situation, it may be a simple
+//  operator, in a complex situation, it maybe a NetOp.
 //
 //  See Backward.h for details
 static std::shared_ptr<OperatorBase> BackwardRecursive(
    const OperatorBase& forwardOp,
    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
+
 std::shared_ptr<OperatorBase> BackwardRecursive(
    const OperatorBase& forwardOp,
    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
  //  If all input gradients of forwarding operator do not need to calculate,
  //  just return an NOP. Not return null ptr because NOP does not take
-  //  too much time for calculation, but it is useful for simplifying logic.
-  if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
+  //  much time for calculation, but it is useful for simplifying logic.
+  if (AllInSet(forwardOp.inputs_ /*names*/, kGradVarSuffix /*suffix*/,
+               no_grad_names /*set*/)) {
    return NOP();
  }

  //  All output gradients of forwarding operator do not need to calculate.
  //  Then all input gradients cannot be computed at all, and we put them into
  //  `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
-    for (auto& name : forwardOp.inputs_) {
-      // Mark all input is not need
-      no_grad_names.insert(name + kGradVarSuffix);
-    }
+  if (AllInSet(forwardOp.outputs_ /*names*/, kGradVarSuffix /*suffix*/,
+               no_grad_names /*set*/)) {
+    ForEachVarName(forwardOp.inputs_,
+                   [&no_grad_names](const std::string& name) -> bool {
+                     no_grad_names.insert(GradVarName(name));
+                     return false;
+                   });
    return NOP();
  }

@ -83,55 +97,65 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);

    // Map from output gradient variable name to operator's indices in
-    // backward net. That operator generates that variable.
+    // backward net's ops_. That operator generates that variable.
    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;

    size_t local_op_id = 0;
-    // reversely travel forwardNet
+    // reversely travel forwardNet and collect all duplicate outputs.
    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
         ++it, ++local_op_id) {
      auto fwd = *it;
      auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
      net->AddOp(bwd);
-      for (auto& out : bwd->outputs_) {
-        dup_output_ops[out].emplace_back(local_op_id);
-      }
+      ForEachVarName(bwd->outputs_,
+                     [&dup_output_ops, local_op_id](const std::string& out) {
+                       dup_output_ops[out].emplace_back(local_op_id);
+                       return false;
+                     });
    }
    // Get unique ID for this method.
    auto uid = uniq_id++;
    // TODO(dzh): more comment
+    // multiple operators which have the same output (y for example) may
+    // overwrite the same y variable when backward, special operations are token
+    // to handle this case. For each duplicate output, rename it to an alias
+    // (original name with a offset), append an `add` op for its operator,
+    // and finally sum all the alias variable to the final output variable y.
    using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
    std::list<Pos> insert_position;
    for (auto& dup_output_op : dup_output_ops) {
      const std::string& name = dup_output_op.first;
      auto& dup_op = dup_output_op.second;
+      // no duplicate output
      if (dup_op.size() == 1) continue;
-      std::vector<std::string> dup_outputs;

+      // process the duplicate outputs
+      std::vector<std::string> dup_outputs;
      for (size_t i = 0; i < dup_op.size(); ++i) {
+        // rename each duplicate output to an alias
        auto op_offset = dup_op[i];
        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
                              std::to_string(i));
        net->ops_[op_offset]->Rename(name, dup_outputs.back());
      }
+      // collect all the offset to append `add` op for each alias
      insert_position.push_back(
-          {dup_op.back(),
-           OpRegistry::CreateOp(
-               "add", {dup_outputs}, {name},
-               {{"input_format",
-                 std::vector<int>{0, static_cast<int>(dup_outputs.size())}}})});
+          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
+                                               {{"Out", {name}}}, {})});
    }

+    // make sure the inserted `add` ops follow the BFS order.
    insert_position.sort(
        [](const Pos& l, const Pos& r) { return l.first > r.first; });

    for (auto& pos : insert_position) {
      net->InsertOp(pos.first + 1, pos.second);
    }
-
  } else {
    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
-    for (std::string& grad_input : grad_op->inputs_) {
+
+    ForEachVarName(grad_op->inputs_, [&no_grad_names,
+                                      &net](std::string& grad_input) {
      if (no_grad_names.count(grad_input)) {
        // +1 for \0
        std::string prefix = grad_input.substr(
@ -140,16 +164,19 @@ std::shared_ptr<OperatorBase> BackwardRecursive(

        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
-        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix},
-                                        {grad_input}, {}));
+        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {{"Src", {prefix}}},
+                                        {{"Dst", {grad_input}}}, {}));
      }
-    }
+      return false;
+    });

-    for (std::string& grad_output : grad_op->outputs_) {
-      if (no_grad_names.count(grad_output)) {
-        grad_output = kEmptyVarName;
-      }
-    }
+    ForEachVarName(grad_op->outputs_,
+                   [&no_grad_names](std::string& grad_output) {
+                     if (no_grad_names.count(grad_output)) {
+                       grad_output = kEmptyVarName;
+                     }
+                     return false;
+                   });

    if (net->ops_.empty()) {  // Current no aux op is added to network
      return grad_op;
@ -159,7 +186,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
  net->type_ = "@GENERATED_BACKWARD@";
  net->CompleteAddOp();
  return net;
-}
+}  // namespace framework

 // See header for comments
 std::shared_ptr<OperatorBase> Backward(
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@ -283,6 +283,5 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
 DDim::DDim(std::initializer_list<int> init_list) {
  *this = make_ddim(init_list);
 }
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/details/lod_tensor.cc
+++ b/paddle/framework/details/lod_tensor.cc
@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/framework/lod_tensor.h"
-
-#include <memory>
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-using LOD = LODTensor::LOD;
-
-std::shared_ptr<LOD> SliceLOD(const LOD &lod, size_t level_begin,
-                              size_t level_end) {
-  auto new_lod = std::make_shared<LOD>();
-  new_lod->reserve(level_end - level_begin);
-  for (size_t i = level_begin; i < level_end; i++) {
-    new_lod->emplace_back(lod[i]);
-  }
-  return new_lod;
-}
-
-std::shared_ptr<LOD> SliceLOD(const LOD &lod, size_t level, size_t elem_begin,
-                              size_t elem_end, bool tensor_shared) {
-  // slice the lod.
-  auto new_lod = std::make_shared<LOD>();
-  new_lod->reserve(lod.size() - level);
-  auto start = lod.at(level)[elem_begin];
-  auto end = lod.at(level)[elem_end];
-
-  for (auto it = lod.begin() + level; it != lod.end(); it++) {
-    auto it_begin = std::find(it->begin(), it->end(), start);
-    auto it_end = std::find(it_begin, it->end(), end);
-    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
-    PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
-    new_lod->emplace_back(it_begin, it_end + 1);
-    if (!tensor_shared) {
-      // reset offset if tensor is copyed and sliced.
-      std::transform(new_lod->back().begin(), new_lod->back().end(),
-                     new_lod->back().begin(),
-                     [start](int v) { return v - start; });
-      PADDLE_ENFORCE(new_lod->back().front() == 0, "error in slice LOD");
-    }
-  }
-  return new_lod;
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/details/lod_tensor.h
+++ b/paddle/framework/details/lod_tensor.h
@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-/*
- * Slice levels from LOD.
- *
- * @lod: LOD to slice.
- * @level_begin: level to begin slice.
- * @level_end: level to end slice.
- */
-std::shared_ptr<LODTensor::LOD> SliceLOD(const LODTensor::LOD &lod,
-                                         size_t level_begin, size_t level_end);
-
-/*
- * Slice elements from a level of LOD.
- *
- * @lod: LOD to slice.
- * @level: which level to slice.
- * @elem_begin: element's index to begin slice.
- * @elem_end: element's index to end slice.
- */
-std::shared_ptr<LODTensor::LOD> SliceLOD(const LODTensor::LOD &lod,
-                                         size_t level, size_t elem_begin,
-                                         size_t elem_end, bool tensor_shared);
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+package paddle.framework;
+
+enum AttrType {
+  INT = 0;
+  FLOAT = 1;
+  STRING = 2;
+  INTS = 3;
+  FLOATS = 4;
+  STRINGS = 5;
+}
+
+// OpDesc describes an instance of a C++ framework::OperatorBase
+// derived class type.
+message OpDesc {
+
+  message Attr {
+    required string name = 1;
+    required AttrType type = 2;
+    optional int32 i = 3;
+    optional float f = 4;
+    optional string s = 5;
+    repeated int32 ints = 6;
+    repeated float floats = 7;
+    repeated string strings = 8;
+  };
+
+  message Var {
+    required string parameter = 1;
+    repeated string arguments = 2;
+  };
+
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+};
+
+// OpProto describes a C++ framework::OperatorBase derived class.
+message OpProto {
+
+  // VarProto describes the C++ type framework::Variable.
+  message Var {
+    required string name = 1;
+    required string comment = 2;
+
+    optional bool duplicable = 3 [ default = false ];
+    optional bool intermediate = 4 [ default = false ];
+    optional bool no_gradient = 5 [ default = false ];
+  }
+
+  // AttrProto describes the C++ type Attribute.
+  message Attr {
+    required string name = 1;
+    required AttrType type = 2;
+    required string comment = 3;
+    // If that attribute is generated, it means the Paddle third
+    // language binding has responsibility to fill that
+    // attribute. End-User should not set that attribute.
+    optional bool generated = 4 [ default = false ];
+  }
+
+  required string type = 1;
+  repeated Var inputs = 2;
+  repeated Var outputs = 3;
+  repeated Attr attrs = 4;
+  required string comment = 5;
+}
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@ -13,63 +13,28 @@ express or implied. See the License for the specific language governing
 permissions and limitations under the License. */

 #include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace framework {
-
-typedef std::vector<int> Ints;
-
 enum class OpArgType { IN, OUT };

-const Ints* AttrFormat(const AttributeMap& attrs, const std::string& key) {
-  return (attrs.count(key) > 0) ? &boost::get<Ints>(attrs.at(key)) : nullptr;
-}
-
-Ints* AttrFormat(AttributeMap& attrs, const std::string& key) {
-  return (attrs.count(key) > 0) ? &boost::get<Ints>(attrs.at(key)) : nullptr;
-}
-
-static void TransOpArg(const OperatorBase* src_op,
-                       std::vector<std::string>& grad_inputs,
-                       std::vector<std::string>& grad_outputs,
-                       AttributeMap& grad_attrs,
-                       std::unordered_map<std::string, int>& grad_idxs,
-                       const std::string& src_type, const std::string& dst_type,
-                       int& idx, bool is_grad) {
-  const std::vector<std::string>& src_inout =
-      (src_type == "input_format") ? src_op->inputs_ : src_op->outputs_;
-
-  const std::vector<int>* src_format = AttrFormat(src_op->Attrs(), src_type);
-
-  std::vector<std::string>& dst_inout =
-      (dst_type == "input_format") ? grad_inputs : grad_outputs;
-
-  std::vector<int>* dst_format = AttrFormat(grad_attrs, dst_type);
-
-  const OpProto& proto = *(OpRegistry::op_info_map().at(src_op->type_).proto_);
-
+static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
+                       bool is_grad, OperatorBase::VarNameMap* vars) {
+  const auto& src_inout =
+      src_type == OpArgType::IN ? src_op->inputs_ : src_op->outputs_;
+  auto& dst_inout = *vars;
  const auto& src_arg_list =
-      (src_type == "input_format") ? proto.inputs() : proto.outputs();
-
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
  for (const auto& arg : src_arg_list) {
-    std::string src_name = arg.name();
-    std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name;
-    grad_idxs[dst_name] = idx++;
-    int src_arg_idx = src_op->in_out_idxs_->at(src_name);
-    int src_begin =
-        src_format == nullptr ? src_arg_idx : src_format->at(src_arg_idx);
-    int src_end = src_format == nullptr ? src_arg_idx + 1
-                                        : src_format->at(src_arg_idx + 1);
-    for (int i = src_begin; i < src_end; ++i) {
-      std::string s =
-          is_grad ? src_inout[i] + kGradVarSuffix
-                  : (arg.ignore_gradient() ? kEmptyVarName : src_inout[i]);
-      dst_inout.emplace_back(s);
-    }
-    if (dst_format != nullptr) {
-      dst_format->push_back(dst_inout.size());
+    if (arg.no_gradient() && !is_grad) continue;
+    const std::string src_name = arg.name();
+    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
+    dst_inout[dst_name].reserve(src_inout.at(src_name).size());
+    for (auto& var_name : src_inout.at(src_name)) {
+      std::string s = is_grad ? GradVarName(var_name) : var_name;
+      dst_inout[dst_name].emplace_back(s);
    }
  }
 }
@ -82,44 +47,17 @@ OperatorBase* BuildGradOp(const OperatorBase* op) {
  PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.",
                 op->type_);

-  AttributeMap grad_attrs(op->Attrs());
-  grad_attrs.erase("input_format");
-  grad_attrs.erase("output_format");
-  if (op->Attrs().count("input_format") > 0) {
-    grad_attrs["output_format"] = std::vector<int>({0});
-  }
-  if (op->Attrs().count("input_format") > 0 ||
-      op->Attrs().count("output_format") > 0) {
-    grad_attrs["input_format"] = std::vector<int>({0});
-  }
-
-  std::vector<std::string> grad_inputs, grad_outputs;
-
-  using VarIndexMap = std::unordered_map<std::string, int>;
-  VarIndexMap* grad_idxs = new VarIndexMap;
-  int in_idx = 0;
-  int out_idx = 0;
-  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
-             "input_format", "input_format", in_idx, false);  // I
-  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
-             "output_format", "input_format", in_idx, false);  // G
-  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
-             "output_format", "input_format", in_idx, true);  // OG
-  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
-             "input_format", "output_format", out_idx, true);  // IG
+  OperatorBase::VarNameMap inputs;
+  OperatorBase::VarNameMap outputs;
+  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
+  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
+  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
+  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG

  it = OpRegistry::op_info_map().find(grad_op_type);
  PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(),
                 "'%s' has not been registered.", grad_op_type);
-  OperatorBase* grad_op = it->second.creator_();
-
-  grad_op->type_ = grad_op_type;
-  grad_op->inputs_ = grad_inputs;
-  grad_op->outputs_ = grad_outputs;
-  grad_op->attrs_ = grad_attrs;
-  grad_op->in_out_idxs_.reset(grad_idxs);
-
-  return grad_op;
+  return it->second.creator_(grad_op_type, inputs, outputs, op->attrs_);
 }

 }  // namespace framework
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@ -13,10 +13,10 @@ class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").SetMultiple();
+    AddInput("In2_mult", "a multiple input").AsDuplicable();
    AddInput("In3", "another single input");
    AddOutput("Out1", "a single output");
-    AddOutput("Out2_mult", "a multiple output").SetMultiple();
+    AddOutput("Out2_mult", "a multiple output").AsDuplicable();
    AddComment("test op with multiple inputs and outputs");
  }
 };
@ -26,10 +26,10 @@ class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").SetMultiple().IgnoreGradient();
-    AddInput("In3_mult", "another multiple input").SetMultiple();
-    AddOutput("Out1_mult", "a multiple output").SetMultiple();
-    AddOutput("Out2", "a single output").IgnoreGradient();
+    AddInput("In2_mult", "a multiple input").AsDuplicable().AsNoGradient();
+    AddInput("In3_mult", "another multiple input").AsDuplicable();
+    AddOutput("Out1_mult", "a multiple output").AsDuplicable();
+    AddOutput("Out2", "a single output").AsNoGradient();
    AddComment("op with inputs and outputs ignored in gradient calculating");
  }
 };
@ -40,33 +40,34 @@ class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
 namespace f = paddle::framework;

 TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<f::OperatorBase> add_op(
-      f::OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
+  std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
+      "add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
  std::shared_ptr<f::OperatorBase> grad_add_op =
      f::OpRegistry::CreateGradOp(*add_op);
-  EXPECT_EQ(static_cast<int>(grad_add_op->inputs_.size()), 4);
-  EXPECT_EQ(static_cast<int>(grad_add_op->outputs_.size()), 2);
+  EXPECT_EQ(grad_add_op->inputs_.size(), 4UL);
+  EXPECT_EQ(grad_add_op->outputs_.size(), 2UL);
  EXPECT_EQ(grad_add_op->Input("X"), "x");
  EXPECT_EQ(grad_add_op->Input("Y"), "y");
  EXPECT_EQ(grad_add_op->Input("Out"), "out");
-  EXPECT_EQ(grad_add_op->Input("Out@GRAD"), "out@GRAD");
-  EXPECT_EQ(grad_add_op->Output("X@GRAD"), "x@GRAD");
-  EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD");
+  EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
+  EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x"));
+  EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
 }

 REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
 REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);

 TEST(GradOpBuilder, MutiInOut) {
-  f::AttributeMap attrs{{"input_format", std::vector<int>{0, 1, 4, 5}},
-                        {"output_format", std::vector<int>{0, 1, 3}}};
  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "mult_io", {"in1", "in2_1", "in2_2", "in2_3", "in3"},
-      {"out1", "out2_1", "out2_2"}, attrs));
+      "mult_io",
+      {{"In1", {"in1"}},
+       {"In2_mult", {"in2_1", "in2_2", "in2_3"}},
+       {"In3", {"in3"}}},
+      {{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {}));
  std::shared_ptr<f::OperatorBase> grad_test_op =
      f::OpRegistry::CreateGradOp(*test_op);

-  ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
+  ASSERT_EQ(grad_test_op->inputs_.size(), 3UL + 2UL + 2UL);
  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
@ -80,7 +81,7 @@ TEST(GradOpBuilder, MutiInOut) {
            std::vector<std::string>(
                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));

-  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
+  ASSERT_EQ(grad_test_op->outputs_.size(), 3UL);
  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
            std::vector<std::string>({f::GradVarName("in2_1"),
@ -90,31 +91,29 @@ TEST(GradOpBuilder, MutiInOut) {
 }

 TEST(GradOpBuilder, IOIgnoredInGradient) {
-  f::AttributeMap attrs{{"input_format", std::vector<int>{0, 1, 3, 5}},
-                        {"output_format", std::vector<int>{0, 2, 3}}};
  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "io_ignored", {"in1", "in2_1", "in2_2", "in3_1", "in3_2"},
-      {"out1_1", "out1_2", "out2"}, attrs));
+      "io_ignored",
+      {{"In1", {"in1"}},
+       {"In2_mult", {"in2_1", "in2_2"}},
+       {"In3_mult", {"in3_1", "in3_2"}}},
+      {{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {}));
  std::shared_ptr<f::OperatorBase> grad_test_op =
      f::OpRegistry::CreateGradOp(*test_op);

  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
+  ASSERT_EQ(grad_test_op->inputs_.size(), 2UL + 1UL + 2UL);
  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
-            std::vector<std::string>({f::kEmptyVarName, f::kEmptyVarName}));
  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
            std::vector<std::string>({"in3_1", "in3_2"}));
  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
            std::vector<std::string>(
                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
            f::GradVarName("out2"));

-  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
+  ASSERT_EQ(grad_test_op->outputs_.size(), 3UL);
  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
            std::vector<std::string>(
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@ -19,32 +19,59 @@
 namespace paddle {
 namespace framework {

-LODTensor LODTensor::SliceShared(size_t level_begin, size_t level_end) const {
-  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
-  auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end);
-  // slice levels just need to update LOD info, each level will contains the
-  // whole tensor_, so no need to modify tensor_.
-  return LODTensor(tensor_, new_lod);
+LODTensor::LOD LODTensor::LOD::SliceLevels(size_t level_begin,
+                                           size_t level_end) const {
+  LOD new_lod;
+  new_lod.reserve(level_end - level_begin);
+  for (size_t i = level_begin; i < level_end; i++) {
+    new_lod.emplace_back(at(i));
+  }
+  return new_lod;
 }

-LODTensor LODTensor::SliceShared(size_t level, size_t elem_begin,
-                                 size_t elem_end) const {
-  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
-                 NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
-                 "element begin [%d] out of range [%d]", elem_begin,
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
-
-  auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end,
-                                   true /*tensor_shared*/);
-
-  // slice elements just need to update LOD info, because offsets are not
-  // changed, so the original tensor_ can be reused.
-  return LODTensor(tensor_, new_lod);
+LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
+                                            size_t elem_end) const {
+  // slice the lod.
+  LOD new_lod;
+  new_lod.reserve(size() - level);
+  auto start = this->at(level)[elem_begin];
+  auto end = this->at(level)[elem_end];
+
+  for (auto it = this->begin() + level; it != this->end(); it++) {
+    auto it_begin = std::find(it->begin(), it->end(), start);
+    auto it_end = std::find(it_begin, it->end(), end);
+    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
+    PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
+    new_lod.emplace_back(it_begin, it_end + 1);
+    // reset offset if tensor is copyed and sliced.
+    std::transform(new_lod.back().begin(), new_lod.back().end(),
+                   new_lod.back().begin(),
+                   [start](int v) { return v - start; });
+    PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD");
+  }
+  PADDLE_ENFORCE_LE(new_lod.size(), this->size());
+  return new_lod;
+}
+
+bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
+  if (a.size() != b.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < a.size(); i++) {
+    const auto& a_level = a[i];
+    const auto& b_level = b[i];
+    if (a_level.size() != b_level.size()) {
+      return false;
+    }
+    for (size_t j = 0; j < a_level.size(); j++) {
+      if (a_level[j] != b_level[j]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
 }

 }  // namespace framework
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@ -15,7 +15,7 @@
 #pragma once

 #include <memory>
-#if (!PADDLE_ONLY_CPU)
+#if !defined(PADDLE_ONLY_CPU)
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #endif
@ -31,30 +31,29 @@ namespace framework {
 * LODTensor (Level of details Tensor)
 * see https://en.wikipedia.org/wiki/Level_of_details for reference.
 */
-class LODTensor {
+class LODTensor : public Tensor {
 public:
 // Level save offsets of each unit.
 #ifdef PADDLE_ONLY_CPU
-  using Level = std::vector<size_t>;
+  template <typename T>
+  using Vector = std::vector<T>;
 #else
-  using Level = thrust::device_vector<size_t>;
+  template <typename T>
+  using Vector = thrust::host_vector<T>;
 #endif
-  // LOD stores offsets of each level of units, the largest units level first,
+  // LoD stores offsets of each level of units, the largest units level first,
  // then the smaller units level. Each Level stores the offsets of units in
  // Tesor.
-  typedef std::vector<Level> LOD;
+  class LOD : public std::vector<Vector<size_t>> {
+   public:
+    LOD SliceLevels(size_t level_begin, size_t level_end) const;
+    LOD SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) const;
+  };

  LODTensor() {}
-  LODTensor(const std::shared_ptr<Tensor> &tensor,
-            const std::shared_ptr<LOD> &lod) {
-    Reset(tensor, lod);
-  }
+  explicit LODTensor(const LOD &lod) : lod_(lod) {}

-  void Reset(const std::shared_ptr<Tensor> &tensor,
-             const std::shared_ptr<LOD> &lod) {
-    tensor_ = tensor;
-    lod_start_pos_ = lod;
-  }
+  virtual Tensor *Clone() const { return new LODTensor(lod_); }

  /*
   * Get a element from LOD.
@ -65,16 +64,14 @@ class LODTensor {
    PADDLE_ENFORCE(elem < NumElements(level),
                   "element begin [%d] out of range [%d]", elem,
                   NumElements(level));
-    return (*lod_start_pos_)[level][elem];
+    return (lod_)[level][elem];
  }

  /*
   * Number of LODTensor's levels, each level has units of data, for example,
   * in the sentence's view, article, paragraph, sentence are 3 levels.
   */
-  size_t NumLevels() const {
-    return lod_start_pos_ ? lod_start_pos_->size() : 0UL;
-  }
+  size_t NumLevels() const { return lod_.size(); }
  /*
   * Number of elements in a level.
   */
@ -82,64 +79,71 @@ class LODTensor {
    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
                   NumLevels());
    // the last offset is the end of last element
-    return lod_start_pos_->at(level).size() - 1;
+    return lod_[level].size() - 1;
  }

-  /*
-   * Slice of levels[level_begin:level_end], with tensor copied.
-   */
-  template <typename T>
-  LODTensor SliceCopied(size_t level_begin, size_t level_end,
-                        const platform::Place &dst_place) const;
-
  /*
   * Slice of levels[level_begin:level_end], with tensor shared.
   */
-  LODTensor SliceShared(size_t level_begin, size_t level_end) const;
-
-  /*
-   * Slice of elements of a level, [elem_begin: elem_end], with tensor copied.
-   * @note: low performance in slice lod_start_pos_.
-   */
  template <typename T>
-  LODTensor SliceCopied(size_t level, size_t elem_begin, size_t elem_end,
-                        const platform::Place &dst_place) const;
+  LODTensor SliceLevels(size_t level_begin, size_t level_end) const;

  /*
   * Slice of elements of a level, [elem_begin: elem_end], with tensor shared.
-   * @note: low performance in slice lod_start_pos_.
-   */
-  LODTensor SliceShared(size_t level, size_t elem_begin, size_t elem_end) const;
-
-  /*
-   * Copy other's lod_start_pos_, to share LOD info.
-   * @note: the LOD info should not be changed.
+   * @note: low performance in slice lod_.
   */
-  void ShareLOD(const LODTensor &other) {
-    lod_start_pos_ = other.lod_start_pos_;
-  }
+  template <typename T>
+  LODTensor SliceInLevel(size_t level, size_t elem_begin,
+                         size_t elem_end) const;

  /*
-   * Copy other's lod_start_pos_'s content, free to mutate.
+   * Copy other's lod_'s content, free to mutate.
   */
-  void CopyLOD(const LODTensor &other) {
-    lod_start_pos_ = std::make_shared<LOD>(*other.lod_start_pos_);
-  }
+  void CopyLOD(const LODTensor &other) { lod_ = other.lod_; }
  /*
   * Determine whether LODTensor has a valid LOD info.
   */
-  bool HasLOD() const { return bool(lod_start_pos_); }
-  LOD *lod() const { return lod_start_pos_.get(); }
+  const LOD &lod() const { return lod_; }
+  LOD *mutable_lod() { return &lod_; }

-  std::shared_ptr<Tensor> &tensor() { return tensor_; }
-  Tensor *raw_tensor() { return tensor_.get(); }
+  virtual ~LODTensor() {}

 private:
-  std::shared_ptr<LOD> lod_start_pos_;
-  std::shared_ptr<Tensor> tensor_;
+  LOD lod_;
 };

+bool operator==(const LODTensor::LOD &a, const LODTensor::LOD &b);
+
+template <typename T>
+LODTensor LODTensor::SliceLevels(size_t level_begin, size_t level_end) const {
+  auto new_lod = lod_.SliceLevels(level_begin, level_end);
+  // slice levels just need to update LOD info, each level will contains the
+  // whole tensor_, so no need to modify tensor_.
+  LODTensor new_tensor(new_lod);
+  new_tensor.ShareDataWith<T>(*this);
+  return new_tensor;
+}
+
+template <typename T>
+LODTensor LODTensor::SliceInLevel(size_t level, size_t elem_begin,
+                                  size_t elem_end) const {
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+
+  auto new_lod = lod_.SliceInLevel(level, elem_begin, elem_end);
+
+  // slice elements just need to update LOD info, because offsets are not
+  // changed, so the original tensor_ can be reused.
+  LODTensor new_tensor(new_lod);
+  new_tensor.ShareDataWith<T>(*this);
+  return new_tensor;
+}
+
 }  // namespace framework
 }  // namespace paddle
-
-#include "paddle/framework/lod_tensor_impl.h"
--- a/paddle/framework/lod_tensor_impl.h
+++ b/paddle/framework/lod_tensor_impl.h
@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/details/lod_tensor.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-LODTensor LODTensor::SliceCopied(size_t level_begin, size_t level_end,
-                                 const platform::Place &dst_place) const {
-  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
-  auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end);
-  auto new_tensor = std::make_shared<Tensor>();
-  new_tensor->CopyFrom<T>(*tensor_, dst_place);
-
-  return LODTensor(new_tensor, new_lod);
-}
-
-template <typename T>
-LODTensor LODTensor::SliceCopied(size_t level, size_t elem_begin,
-                                 size_t elem_end,
-                                 const platform::Place &dst_place) const {
-  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
-  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
-                 NumLevels());
-  PADDLE_ENFORCE(elem_begin < NumElements(level),
-                 "element begin [%d] out of range [%d]", elem_begin,
-                 NumElements(level));
-  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
-                 "element end [%d] out of range [%d]", elem_end,
-                 NumElements(level));
-
-  auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end,
-                                   false /*tensor_shared*/);
-
-  auto start_idx = new_lod->front().front();
-  auto end_idx = new_lod->front().back() - 1 /*the next element's start*/;
-  auto sliced_tensor = tensor_->Slice<T>(start_idx, end_idx);
-  auto new_tensor = std::make_shared<Tensor>();
-  new_tensor->CopyFrom<T>(sliced_tensor, dst_place);
-
-  return LODTensor(new_tensor, new_lod);
-}
-
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@ -15,6 +15,7 @@

 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <algorithm>
 #include <memory>

 namespace paddle {
@ -29,22 +30,28 @@ class LODTensorTester : public ::testing::Test {
    // 0 10 20
    // 0 5 10 15 20
    // 0 2 5 7 10 12 15 20
-    auto lod = std::make_shared<LODTensor::LOD>();
-    lod->push_back(std::vector<size_t>{0, 10, 20});
-    lod->push_back(std::vector<size_t>{0, 5, 10, 15, 20});
-    lod->push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
+    LODTensor::LOD lod;
+    lod.push_back(std::vector<size_t>{0, 10, 20});
+    lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
+    lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});

-    auto tensor = std::make_shared<Tensor>();
-    tensor->Resize({20 /*batch size*/, 128 /*dim*/});
+    ASSERT_EQ(lod.size(), 3UL);
+
+    tensor.Resize({20 /*batch size*/, 128 /*dim*/});
    // malloc memory
-    tensor->mutable_data<float>(place);
+    tensor.mutable_data<float>(place);
+
+    lod_tensor.reset(new LODTensor(lod));
+    lod_tensor->Resize({20 /*batch size*/, 128 /*dim*/});

-    lod_tensor->Reset(tensor, lod);
+    lod_tensor->ShareDataWith<float>(tensor);
+    // lod_tensor->ShareDataWith<Tensor>(tensor);
  }

 protected:
  std::unique_ptr<LODTensor> lod_tensor;
  platform::CPUPlace place;
+  Tensor tensor;
 };

 TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); }
@ -55,110 +62,54 @@ TEST_F(LODTensorTester, NumElements) {
  ASSERT_EQ(lod_tensor->NumElements(2), 8UL);
 }

-TEST_F(LODTensorTester, SliceShared_Level) {
-  // slice 1 level
-  for (size_t level = 0; level < 3UL; ++level) {
-    auto new_lod_tensor = lod_tensor->SliceShared(level, level + 1);
-    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
-    ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
-  }
-  // slice 2 level
-  for (size_t level = 0; level < 2UL; ++level) {
-    auto new_lod_tensor = lod_tensor->SliceShared(level, level + 2);
-    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1),
-              lod_tensor->NumElements(level + 1));
-    ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
-  }
-}
-
-TEST_F(LODTensorTester, SliceCopied_Level) {
+TEST_F(LODTensorTester, SliceLevels) {
  // slice 1 level
  for (size_t level = 0; level < 3UL; ++level) {
-    auto new_lod_tensor =
-        lod_tensor->SliceCopied<float>(level, level + 1, place);
+    auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 1);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
-    // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
-    // TODO(superjom) add tensor comparation here.
+    // ASSERT_EQ(new_lod_tensor, *lod_tensor);
  }
  // slice 2 level
  for (size_t level = 0; level < 2UL; ++level) {
-    auto new_lod_tensor =
-        lod_tensor->SliceCopied<float>(level, level + 2, place);
+    auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 2);
    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
    ASSERT_EQ(new_lod_tensor.NumElements(1),
              lod_tensor->NumElements(level + 1));
-    // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
-    // TODO(superjom) add tensor comparation here.
+    ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
  }
 }

-TEST_F(LODTensorTester, SliceShared_Element) {
-  size_t level = 0;
-  auto new_lod_tensor = lod_tensor->SliceShared(level, 0, 2);
-  ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL);
-  ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
-
-  level = 1;
-  new_lod_tensor = lod_tensor->SliceShared(level, 0, 2);
-  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
-}
-
-TEST_F(LODTensorTester, SliceCopied_Element) {
+TEST_F(LODTensorTester, SliceInLevel) {
  size_t level = 0;
-  auto new_lod_tensor = lod_tensor->SliceCopied<float>(level, 0, 2, place);
-  ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL);
-  ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
+  auto new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
+  EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());

  level = 1;
-  new_lod_tensor = lod_tensor->SliceCopied<float>(level, 0, 2, place);
+  new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
-
-  level = 1;
-  // LOD is
-  //    0 5 10
-  //    0 2 5 7 10
-  new_lod_tensor = lod_tensor->SliceCopied<float>(level, 1, 3, place);
-  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
-
-  ASSERT_EQ(new_lod_tensor.lod_element(0, 0), 0UL);
-  ASSERT_EQ(new_lod_tensor.lod_element(0, 1), 5UL);
-  ASSERT_EQ(new_lod_tensor.lod_element(1, 0), 0UL);
-  ASSERT_EQ(new_lod_tensor.lod_element(1, 1), 2UL);
-  ASSERT_EQ(new_lod_tensor.lod_element(1, 2), 5UL);
-  ASSERT_EQ(new_lod_tensor.lod_element(1, 3), 7UL);
-
-  // TODO(superjom) compare the content of these tensors
+  ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
 }

 TEST_F(LODTensorTester, ShareLOD) {
  LODTensor new_lod_tensor;
-  new_lod_tensor.ShareLOD(*lod_tensor);
+  new_lod_tensor.CopyLOD(*lod_tensor);
  ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod());
 }

 TEST_F(LODTensorTester, CopyLOD) {
  LODTensor new_lod_tensor;
  new_lod_tensor.CopyLOD(*lod_tensor);
-  ASSERT_NE(new_lod_tensor.lod(), lod_tensor->lod());
+  bool equals = std::equal(lod_tensor->lod().begin(), lod_tensor->lod().end(),
+                           new_lod_tensor.lod().begin());
+  ASSERT_TRUE(equals);
 }

 }  // namespace framework
--- a/paddle/framework/op_desc.proto
+++ b/paddle/framework/op_desc.proto
@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-package paddle.framework;
-
-import "attribute.proto";
-
-// AttrDesc is used to describe Attributes of an Operator. It contain's
-// name, type, and value of Attribute.
-//
-// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
-message AttrDesc {
-  required string name = 1;
-  required AttrType type = 2;
-  optional int32 i = 3;
-  optional float f = 4;
-  optional string s = 5;
-  repeated int32 ints = 6;
-  repeated float floats = 7;
-  repeated string strings = 8;
-};
-
-// Protocol Message to describe an Operator.
-//
-// In PaddlePaddle, Operator is used to do a certain computation such
-// as "add", "sub", "cosine", etc.
-//  (1) Operator needs to know the input and output variable names.
-//  (2) Some ops may have special attributes such as "scale" in "CosineOp".
-//
-// 3rd-party language can build this proto message and call
-// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
-message OpDesc {
-  // input names of this Operator.
-  repeated string inputs = 1;
-
-  // output names of this Operator.
-  repeated string outputs = 2;
-
-  // type of this Operator, such as "add", "sub", "fc".
-  required string type = 3;
-
-  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
-  repeated AttrDesc attrs = 4;
-};
--- a/paddle/framework/op_desc_test.cc
+++ b/paddle/framework/op_desc_test.cc
@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/framework/op_desc.pb.h>
-
-TEST(OpDesc, Create) {
-  paddle::framework::OpDesc op_desc;
-  op_desc.set_type("add");
-  op_desc.add_inputs("X");
-  op_desc.add_inputs("Y");
-  op_desc.add_outputs("Z");
-
-  auto attr = op_desc.mutable_attrs()->Add();
-  attr->set_type(paddle::framework::AttrType::FLOAT);
-  attr->set_f(3.14);
-
-  // required field name is not set, so IsInitialized should be false.
-  ASSERT_FALSE(op_desc.IsInitialized());
-
-  attr->set_name("add");
-  // after all required fields are set, IsInitialized should be true now.
-  ASSERT_TRUE(op_desc.IsInitialized());
-}
--- a/paddle/framework/op_proto.proto
+++ b/paddle/framework/op_proto.proto
@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// Protocol Message for 3rd-party language binding.
-//
-// Paddle Python package will use `OpProto` to generate op creation methods.
-// The op creation methods take user's input and generate `OpDesc` proto
-// message,
-// then pass `OpDesc` to C++ side and create Op pointer.
-//
-syntax = "proto2";
-package paddle.framework;
-
-import "attribute.proto";
-
-// Attribute protocol message for 3rd-party language binding.
-// It will store the Op support what attribute and what type.
-message AttrProto {
-  // Supported attribute name. e.g. `scale` for cosine op.
-  required string name = 1;
-
-  // Supported attribute type.
-  required AttrType type = 2;
-
-  // Supported attribute comments. It helps 3rd-party language generate
-  // doc-string.
-  required string comment = 3;
-
-  // If that attribute is generated, it means the Paddle third language
-  // binding has responsibility to fill that attribute. End-User should
-  // not set that attribute.
-  optional bool generated = 4 [ default = false ];
-}
-
-// Input or output message for 3rd-party language binding.
-// It contains parameter name and its comments.
-message VarProto {
-  // Input or output name in that op creation function.
-  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
-  required string name = 1;
-
-  // The comment for that input. It helps 3rd-party language generate
-  // doc-string.
-  required string comment = 2;
-
-  // Is that input/output could be a list or not.
-  // If so, that Op should write a attributed named `input_format` or
-  // `output_format`.
-  //
-  // e.g.
-  //   If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
-  //   could be multiple, so the multiple of `X` and `W` is True, and OpDesc
-  //   will hold a attribute of them.
-  //
-  //   The Op desc of same fc could be
-  //   {
-  //      "type": "fc",
-  //      "input": ["X1", "X2", "W1", "W2", "b"],
-  //      "output": "fc.out",
-  //      "attrs" : {
-  //        "input_format": [0, 2, 4, 5]
-  //      }
-  //   }
-  //
-  optional bool multiple = 3 [ default = false ];
-
-  // It marks that output is a temporary output. That output is not used by
-  // user, but used by other op internally as input. If other op is not use
-  // that output, it could be optimized early.
-  //
-  // Attribute temporary_index will be set in OpDesc if there is some
-  // outputs are temporary.
-  //
-  // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
-  // attrs = {
-  //   "temporary_index": [1]
-  // }
-  optional bool temporary = 4 [ default = false ];
-
-  // The gradient of operator can be ignored immediately
-  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
-  // can be ignored for the future optimized on graph.
-  optional bool ignore_gradient = 6;
-}
-
-// Op protocol message for 3rd-party language binding.
-// It contains all information for generating op creation method.
-message OpProto {
-  // The input information to generate op creation method.
-  repeated VarProto inputs = 1;
-
-  // The output information to generate op creation method.
-  repeated VarProto outputs = 2;
-
-  // The attribute information to generate op creation method.
-  repeated AttrProto attrs = 3;
-
-  // The comments for that Op. It helps 3rd-party language generate
-  // doc-string. The whole documentation of that Op is generated by comment,
-  // inputs, outputs, attrs together.
-  required string comment = 4;
-
-  // The type of that Op.
-  required string type = 5;
-}
--- a/paddle/framework/op_proto_test.cc
+++ b/paddle/framework/op_proto_test.cc
@ -1,31 +0,0 @@
-#include <gtest/gtest.h>
-#include <paddle/framework/op_proto.pb.h>
-
-TEST(TestOpProto, ALL) {
-  paddle::framework::OpProto proto;
-  {
-    auto ipt = proto.mutable_inputs()->Add();
-    *ipt->mutable_name() = "a";
-    *ipt->mutable_comment() = "the one input of cosine op";
-  }
-  {
-    auto ipt = proto.mutable_inputs()->Add();
-    *ipt->mutable_name() = "b";
-    *ipt->mutable_comment() = "the other input of cosine op";
-  }
-  {
-    auto opt = proto.mutable_outputs()->Add();
-    *opt->mutable_name() = "output";
-    *opt->mutable_comment() = "the output of cosine op";
-  }
-  {
-    auto attr = proto.mutable_attrs()->Add();
-    *attr->mutable_name() = "scale";
-    attr->set_type(paddle::framework::AttrType::FLOAT);
-    *attr->mutable_comment() = "the scale attribute of cosine op";
-  }
-  proto.set_type("cos");
-  *proto.mutable_comment() = "cosine op, output = scale * cos(a, b)";
-
-  ASSERT_TRUE(proto.IsInitialized());
-}
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@ -7,8 +7,7 @@ namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
 public:
-  DEFINE_OPERATOR_CTOR(CosineOp, OperatorBase)
-
+  using OperatorBase::OperatorBase;
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
  void InferShape(const Scope& scope) const override {}
@ -29,8 +28,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 class MyTestOp : public OperatorBase {
 public:
-  DEFINE_OPERATOR_CTOR(MyTestOp, OperatorBase)
-
+  using OperatorBase::OperatorBase;
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
@ -40,8 +38,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op").SetMultiple();
-    AddOutput("output", "output of cosine op").SetTemporary();
+    AddInput("input", "input of cosine op").AsDuplicable();
+    AddOutput("output", "output of cosine op").AsIntermediate();
    auto my_checker = [](int i) {
      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
    };
@ -53,6 +51,14 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 }  // namespace framework
 }  // namespace paddle

+static void BuildVar(const std::string& param_name,
+                     std::initializer_list<const char*> arguments,
+                     paddle::framework::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (auto& arg_name : arguments) {
+    var->add_arguments(arg_name);
+  }
+}
 REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp,
                             paddle::framework::CosineOpProtoAndCheckerMaker);
 REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
@ -61,8 +67,8 @@ REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
 TEST(OpRegistry, CreateOp) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("cos_sim");
-  op_desc.add_inputs("aa");
-  op_desc.add_outputs("bb");
+  BuildVar("input", {"aa"}, op_desc.add_inputs());
+  BuildVar("output", {"bb"}, op_desc.add_outputs());

  float scale = 3.3;
  auto attr = op_desc.mutable_attrs()->Add();
@ -82,8 +88,8 @@ TEST(OpRegistry, CreateOp) {
 TEST(OpRegistry, IllegalAttr) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("cos_sim");
-  op_desc.add_inputs("aa");
-  op_desc.add_outputs("bb");
+  BuildVar("input", {"aa"}, op_desc.add_inputs());
+  BuildVar("output", {"bb"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
@ -107,8 +113,8 @@ TEST(OpRegistry, IllegalAttr) {
 TEST(OpRegistry, DefaultValue) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("cos_sim");
-  op_desc.add_inputs("aa");
-  op_desc.add_outputs("bb");
+  BuildVar("input", {"aa"}, op_desc.add_inputs());
+  BuildVar("output", {"bb"}, op_desc.add_outputs());

  ASSERT_TRUE(op_desc.IsInitialized());

@ -120,20 +126,11 @@ TEST(OpRegistry, DefaultValue) {
  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
 }

-static void SetInputFormat(paddle::framework::OpDesc* desc) {
-  auto attr = desc->add_attrs();
-  attr->set_name("input_format");
-  attr->set_type(paddle::framework::INTS);
-  attr->mutable_ints()->Add(0);
-  attr->mutable_ints()->Add(1);
-}
-
 TEST(OpRegistry, CustomChecker) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("my_test_op");
-  op_desc.add_inputs("ii");
-  op_desc.add_outputs("oo");
-  SetInputFormat(&op_desc);
+  BuildVar("input", {"ii"}, op_desc.add_inputs());
+  BuildVar("output", {"oo"}, op_desc.add_outputs());

  // attr 'test_attr' is not set
  bool caught = false;
@ -173,7 +170,6 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_name("test_attr");
  attr->set_type(paddle::framework::AttrType::INT);
  attr->set_i(4);
-  SetInputFormat(&op_desc);
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::platform::CPUDeviceContext dev_ctx;
  paddle::framework::Scope scope;
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <algorithm>
-
 #include "paddle/framework/operator.h"
+#include <algorithm>
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace framework {
@ -34,83 +34,134 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
 #endif

 const std::string& OperatorBase::Input(const std::string& name) const {
-  PADDLE_ENFORCE_NOT_NULL(in_out_idxs_,
-                          "Input Output Indices could not be nullptr");
-  auto it = in_out_idxs_->find(name);
-  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
-                 name);
-  if (attrs_.count("input_format") == 0) {
-    return inputs_.at((size_t)it->second);
-  } else {
-    const auto& input_format = GetAttr<std::vector<int>>("input_format");
-    int idx = input_format[it->second];
-    return inputs_.at((size_t)idx);
-  }
+  auto& ins = Inputs(name);
+  PADDLE_ENFORCE_EQ(ins.size(), 1UL,
+                    "Op %s input %s should contain only one variable", type_,
+                    name);
+  return ins[0];
 }

-std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
-  PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "IO Idx could not be nullptr");
-  auto input_format = GetAttr<std::vector<int>>("input_format");
-  auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(input_format.at(static_cast<size_t>(offset) + 1) <=
-                     static_cast<int>(inputs_.size()),
-                 "Input Out Of Range");
-
-  return std::vector<std::string>{
-      inputs_.begin() + input_format.at(offset),
-      inputs_.begin() + input_format.at(offset + 1)};
+const std::vector<std::string>& OperatorBase::Inputs(
+    const std::string& name) const {
+  auto it = inputs_.find(name);
+  PADDLE_ENFORCE(it != inputs_.end(), "Op %s do not have input %s", type_,
+                 name);
+  return it->second;
 }

 const std::string& OperatorBase::Output(const std::string& name) const {
-  PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "InOut Indice could not be nullptr");
-  auto it = in_out_idxs_->find(name);
-  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
-                 name);
-  if (attrs_.count("output_format") == 0) {
-    return outputs_.at((size_t)it->second);
-  } else {
-    const auto& output_format = GetAttr<std::vector<int>>("output_format");
-    int idx = output_format[it->second];
-    return outputs_.at((size_t)idx);
-  }
+  auto& outs = Outputs(name);
+  PADDLE_ENFORCE_EQ(outs.size(), 1UL,
+                    "Op %s output %s should contain only one variable", type_,
+                    name);
+  return outs[0];
 }

-std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
-  PADDLE_ENFORCE_NOT_NULL(in_out_idxs_, "InOut Indice could not be nullptr");
-  auto output_format = GetAttr<std::vector<int>>("output_format");
-  auto offset = in_out_idxs_->at(name);
-  PADDLE_ENFORCE(output_format.at(static_cast<size_t>(offset) + 1) <=
-                     static_cast<int>(outputs_.size()),
-                 "Output Out of Range");
-  return std::vector<std::string>{
-      outputs_.begin() + output_format.at(offset),
-      outputs_.begin() + output_format.at(offset + 1)};
+const std::vector<std::string>& OperatorBase::Outputs(
+    const std::string& name) const {
+  auto it = outputs_.find(name);
+  PADDLE_ENFORCE(it != outputs_.end(), "Op %s does not have output %s", type_,
+                 name);
+  return it->second;
 }

 std::string OperatorBase::DebugString() const {
  std::stringstream ss;
-  ss << "Op(" << type_ << "), inputs:(";
-  for (size_t i = 0; i < inputs_.size(); ++i) {
-    ss << inputs_[i];
-    if (i != inputs_.size() - 1) {
+  ss << "Op(" << type_ << "), inputs:{";
+  for (auto it = inputs_.begin(); it != inputs_.end();) {
+    auto& input = *it;
+    ss << input.first << "[";
+    for (size_t i = 0; i < input.second.size(); ++i) {
+      ss << input.second[i];
+      if (i != input.second.size() - 1) {
+        ss << ", ";
+      }
+    }
+    ss << "]";
+    ++it;
+    if (it != inputs_.end()) {
      ss << ", ";
    }
  }
-  ss << "), outputs:(";
-  for (size_t i = 0; i < outputs_.size(); ++i) {
-    ss << outputs_[i];
-    if (i != outputs_.size() - 1) {
+  ss << "}, outputs:{";
+  for (auto it = outputs_.begin(); it != outputs_.end();) {
+    auto& output = *it;
+    ss << output.first << "[";
+    for (size_t i = 0; i < output.second.size(); ++i) {
+      ss << output.second[i];
+      if (i != output.second.size() - 1) {
+        ss << ", ";
+      }
+    }
+    ss << "]";
+    ++it;
+    if (it != outputs_.end()) {
      ss << ", ";
    }
  }
-  ss << ").";
+  ss << "}.";
  return ss.str();
 }

 void OperatorBase::Rename(const std::string& old_name,
                          const std::string& new_name) {
-  std::replace(inputs_.begin(), inputs_.end(), old_name, new_name);
-  std::replace(outputs_.begin(), outputs_.end(), old_name, new_name);
+  for (auto& input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  for (auto& output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+}
+
+OperatorBase::OperatorBase(const std::string& type,
+                           const OperatorBase::VarNameMap& inputs,
+                           const OperatorBase::VarNameMap& outputs,
+                           const AttributeMap& attrs)
+    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
+  static std::atomic<size_t> gUniqId(0UL);
+  for (auto& output : outputs_) {
+    for (auto& output_name : output.second) {
+      if (output_name == kTempVarName) {
+        output_name += type_;
+        output_name += "@";
+        output_name += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
+}
+
+std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
+  std::vector<std::string> ret_val;
+  if (has_intermediate) {
+    // push all outputs into ret_val
+    for (auto& o : outputs_) {
+      ret_val.reserve(ret_val.size() + o.second.size());
+      ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
+    }
+    return ret_val;
+  }
+  auto it = OpRegistry::op_info_map().find(type_);
+  PADDLE_ENFORCE(
+      it != OpRegistry::op_info_map().end(),
+      "Operator %s not registered, cannot figure out intermediate outputs",
+      type_);
+  PADDLE_ENFORCE(
+      it->second.proto_ != nullptr,
+      "Operator %s has no OpProto, cannot figure out intermediate outputs",
+      type_);
+
+  // get all OpProto::Var for outputs
+  for (auto& o : it->second.proto_.outputs()) {
+    // ignore all intermediate output
+    if (o.intermediate()) continue;
+    auto out = outputs_.find(o.name());
+    if (out != outputs_.end()) {
+      ret_val.reserve(ret_val.size() + out->second.size());
+      ret_val.insert(ret_val.end(), out->second.begin(), out->second.end());
+    }
+  }
+  return ret_val;
 }

 }  // namespace framework
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -20,8 +20,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/framework/attribute.h"
-#include "paddle/framework/op_desc.pb.h"
-#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
@ -55,16 +54,6 @@ class OperatorBase;
 class InferShapeContext;
 class ExecutionContext;

-#define DEFINE_OPERATOR_CTOR(Class, ParentClass)                         \
- public:                                                                 \
-  Class() { /* TODO(yi): This constructor is to be removed. */           \
-  }                                                                      \
-  Class(const std::string& type, const std::vector<std::string>& inputs, \
-        const std::vector<std::string>& outputs,                         \
-        const ::paddle::framework::AttributeMap& attrs,                  \
-        std::unordered_map<std::string, int>* in_out_idxs)               \
-      : ParentClass(type, inputs, outputs, attrs, in_out_idxs) {}
-
 /**
 * OperatorBase has the basic element that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@ -73,16 +62,14 @@ class ExecutionContext;
 */
 class OperatorBase {
 public:
-  OperatorBase() {}  // TODO(yi): This constructor is to be removed.
-  OperatorBase(const std::string& type, const std::vector<std::string>& inputs,
-               const std::vector<std::string>& outputs,
-               const AttributeMap& attrs,
-               std::unordered_map<std::string, int>* in_out_idxs)
-      : type_(type),
-        inputs_(inputs),
-        outputs_(outputs),
-        attrs_(attrs),
-        in_out_idxs_(in_out_idxs) {}
+  using VarNameMap = std::map<std::string, std::vector<std::string>>;
+
+  OperatorBase(const std::string& type, const VarNameMap& inputs,
+               const VarNameMap& outputs, const AttributeMap& attrs);
+
+  OperatorBase(const OperatorBase& o) = delete;
+  OperatorBase& operator=(const OperatorBase& o) = delete;
+  OperatorBase(OperatorBase&& o) = delete;

  virtual ~OperatorBase() {}

@ -95,10 +82,6 @@ class OperatorBase {

  virtual std::string DebugString() const;

-  /// Init will be called after CreateOperator, you can put some initialization
-  /// logic here.
-  virtual void Init() {}
-
  /// InferShape infer the size of Variables used by this Operator with
  /// information inside scope
  virtual void InferShape(const Scope& scope) const = 0;
@ -117,22 +100,18 @@ class OperatorBase {
  //! Get a input with argument's name described in `op_proto`
  const std::string& Input(const std::string& name) const;
  //! Get a input which has multiple variables.
-  //! TODO add a vector_view to prevent memory copy.
-  std::vector<std::string> Inputs(const std::string& name) const;
+  const std::vector<std::string>& Inputs(const std::string& name) const;

  //! Get a output with argument's name described in `op_proto`
  const std::string& Output(const std::string& name) const;
  //! Get an output which has multiple variables.
  //! TODO add a vector_view to prevent memory copy.
-  std::vector<std::string> Outputs(const std::string& name) const;
+  const std::vector<std::string>& Outputs(const std::string& name) const;
+
+  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

-  const std::string Type() const { return type_; }
-  const std::vector<std::string> Inputs() const { return inputs_; }
-  const std::vector<std::string> Outputs() const { return outputs_; }
+  std::string Type() const { return type_; }
  const AttributeMap& Attrs() const { return attrs_; }
-  const std::unordered_map<std::string, int>* InOutIdx() const {
-    return in_out_idxs_.get();
-  }

 public:
  std::string type_;
@ -140,19 +119,17 @@ class OperatorBase {
  // I (Inputs)
  // O (Outputs)
  // OG (Output Gradients)
-  std::vector<std::string> inputs_;
+  VarNameMap inputs_;
+
  // NOTE: in case of OpGrad, outputs_ contains
  // IG (Inputs Gradients)
-  std::vector<std::string> outputs_;
+  VarNameMap outputs_;
  AttributeMap attrs_;
-  // store the arguments' offset described in op_desc.
-  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };

 class NOP : public OperatorBase {
 public:
-  DEFINE_OPERATOR_CTOR(NOP, OperatorBase)
-
+  using OperatorBase::OperatorBase;
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
@ -163,16 +140,12 @@ class InferShapeContext {
  InferShapeContext(const OperatorBase& op, const Scope& scope)
      : op_(op), scope_(scope) {}

-  size_t InputSize() const { return op_.inputs_.size(); }
-
-  size_t OutputSize() const { return op_.outputs_.size(); }
-
-  const Variable* InputVar(const size_t index) const {
-    return scope_.FindVar(op_.inputs_.at(index));
+  size_t InputSize(const std::string& name) const {
+    return op_.Inputs(name).size();
  }

-  Variable* OutputVar(const size_t index) const {
-    return scope_.FindVar(op_.outputs_.at(index));
+  size_t OutputSize(const std::string& name) const {
+    return op_.Outputs(name).size();
  }

  const Variable* InputVar(const std::string& name) const {
@ -204,27 +177,9 @@ class InferShapeContext {
    return res;
  }

-  template <typename T>
-  const T* Input(const size_t index) const {
-    auto var = InputVar(index);
-    PADDLE_ENFORCE_NOT_NULL(var, "Input(%d) should not be nullptr", index);
-    return &var->Get<T>();
-  }
-
-  template <typename T>
-  T* Output(const size_t index) const {
-    auto var = OutputVar(index);
-    PADDLE_ENFORCE_NOT_NULL(
-        var,
-        "Output(%d) not be nullptr, which means variable [%s] does not "
-        "exist in scope",
-        index, op_.outputs_[index]);
-    return var->GetMutable<T>();
-  }
-
  template <typename T>
  const T* Input(const std::string& name) const {
-    auto var = InputVar(name);
+    auto* var = InputVar(name);
    PADDLE_ENFORCE_NOT_NULL(var, "Input(%s) should not be nullptr", name);
    return &var->Get<T>();
  }
@ -300,6 +255,10 @@ class ExecutionContext : public InferShapeContext {

  platform::Place GetPlace() const { return device_context_->GetPlace(); }

+  const platform::DeviceContext* device_context() const {
+    return device_context_;
+  }
+
  const platform::DeviceContext* device_context_;
 };

@ -319,14 +278,6 @@ class OpKernel {

 class OperatorWithKernel : public OperatorBase {
 public:
-  OperatorWithKernel() {}  // TODO(yi): This constructor is to be removed.
-  OperatorWithKernel(const std::string& type,
-                     const std::vector<std::string>& inputs,
-                     const std::vector<std::string>& outputs,
-                     const AttributeMap& attrs,
-                     std::unordered_map<std::string, int>* in_out_idxs)
-      : OperatorBase(type, inputs, outputs, attrs, in_out_idxs) {}
-
  struct OpKernelKey {
    platform::Place place_;

@ -350,6 +301,10 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;

+  OperatorWithKernel(const std::string& type, const VarNameMap& inputs,
+                     const VarNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
  void InferShape(const Scope& scope) const override {
    InferShape(InferShapeContext(*this, scope));
  }
--- a/Show More
+++ b/Show More