Merge branch 'develop' of github.com:baidu/Paddle into feature/fast_python_unittest

8 years ago · 329370e8ca
parent aa57f0fc85 dc21a58b58
commit 329370e8ca
82 changed files with 1565 additions and 648 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME)
                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                            "--filter=${STYLE_FILTER}"
                            "--write-success=${CUR_GEN}" ${filename}
+                    DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+                add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
+                add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
            endif()
        endforeach()
    endif()
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
    endif()
    
    # cpplint code style
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+    foreach(source_file ${cc_library_SRCS})
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
+    endforeach()
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})

  else(cc_library_SRCS)
    if (cc_library_DEPS)
@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
      endif()
+      # cpplint code style: lint each source plus the same-named .h beside it, if any
+      foreach(source_file ${nv_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
    else(nv_library_SRCS)
      if (nv_library_DEPS)
        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -118,7 +118,6 @@ endfunction()
 macro(add_unittest_without_exec TARGET_NAME)
    add_executable(${TARGET_NAME} ${ARGN})
    link_paddle_test(${TARGET_NAME})
-    add_style_check_target(${TARGET_NAME} ${ARGN})
 endmacro()

 # add_unittest
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -15,7 +15,6 @@ if(Boost_FOUND)
  add_subdirectory(platform)
  add_subdirectory(framework)
  add_subdirectory(operators)
-  add_subdirectory(pybind)
 endif()

 if(WITH_C_API)
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@ -12,13 +12,15 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)

-proto_library(attr_type SRCS attr_type.proto)
-proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
-proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+proto_library(attribute_proto SRCS attribute.proto)
+proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
+proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)

-cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
+cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
+
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)

 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
@ -26,13 +28,19 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)

-py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)

-cc_library(net SRCS net.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net)
-
-cc_library(backward SRCS backward.cc DEPS net)
+cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python backward
+	fc_op
+	sgd_op
+	add_op
+	mean_op
+	cross_entropy_op
+	recurrent_op)
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/attribute.h"
+
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+template <>
+AttrType AttrTypeID<int>() {
+  return INT;
+}
+template <>
+AttrType AttrTypeID<float>() {
+  return FLOAT;
+}
+template <>
+AttrType AttrTypeID<std::string>() {
+  return STRING;
+}
+template <>
+AttrType AttrTypeID<std::vector<int>>() {
+  return INTS;
+}
+template <>
+AttrType AttrTypeID<std::vector<float>>() {
+  return FLOATS;
+}
+template <>
+AttrType AttrTypeID<std::vector<std::string>>() {
+  return STRINGS;
+}
+
+Attribute GetAttrValue(const AttrDesc& attr_desc) {
+  switch (attr_desc.type()) {
+    case paddle::framework::AttrType::INT: {
+      return attr_desc.i();
+    }
+    case paddle::framework::AttrType::FLOAT: {
+      return attr_desc.f();
+    }
+    case paddle::framework::AttrType::STRING: {
+      return attr_desc.s();
+    }
+    case paddle::framework::AttrType::INTS: {
+      std::vector<int> val(attr_desc.ints_size());
+      for (int i = 0; i < attr_desc.ints_size(); ++i) {
+        val[i] = attr_desc.ints(i);
+      }
+      return val;
+    }
+    case paddle::framework::AttrType::FLOATS: {
+      std::vector<float> val(attr_desc.floats_size());
+      for (int i = 0; i < attr_desc.floats_size(); ++i) {
+        val[i] = attr_desc.floats(i);
+      }
+      return val;
+    }
+    case paddle::framework::AttrType::STRINGS: {
+      std::vector<std::string> val(attr_desc.strings_size());
+      for (int i = 0; i < attr_desc.strings_size(); ++i) {
+        val[i] = attr_desc.strings(i);
+      }
+      return val;
+    }
+  }
+  PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
+  return boost::blank();
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/attr_checker.h
+++ b/paddle/framework/attr_checker.h
@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once

 #include <boost/variant.hpp>
@ -6,6 +20,9 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
+#include "paddle/framework/attribute.pb.h"
+#include "paddle/framework/op_desc.pb.h"
 #include "paddle/platform/enforce.h"

 namespace paddle {
@ -14,13 +31,19 @@ namespace framework {
 typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                       std::vector<float>, std::vector<std::string>>
    Attribute;
+
 typedef std::unordered_map<std::string, Attribute> AttributeMap;

+template <typename T>
+AttrType AttrTypeID();
+
+Attribute GetAttrValue(const AttrDesc& attr_desc);
+
 // check whether a value(attribute) fit a certain limit
 template <typename T>
 class LargerThanChecker {
 public:
-  LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
  void operator()(T& value) const {
    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
  }
@ -35,7 +58,8 @@ class LargerThanChecker {
 template <typename T>
 class DefaultValueSetter {
 public:
-  DefaultValueSetter(T default_value) : default_value_(default_value) {}
+  explicit DefaultValueSetter(T default_value)
+      : default_value_(default_value) {}
  void operator()(T& value) const { value = default_value_; }

 private:
@ -78,7 +102,8 @@ class TypedAttrChecker {
  typedef std::function<void(T&)> ValueChecker;

 public:
-  TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
+  explicit TypedAttrChecker(const std::string& attr_name)
+      : attr_name_(attr_name) {}

  TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
    value_checkers_.push_back(EnumInContainer<T>(range));
--- a/paddle/framework/attribute.proto
+++ b/paddle/framework/attribute.proto
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@ -14,8 +14,8 @@

 #include "paddle/framework/backward.h"
 #include <list>
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"

 namespace paddle {
 namespace framework {
@ -32,7 +32,7 @@ static bool AllInSet(const std::vector<std::string>& names,
 }

 static std::shared_ptr<OperatorBase> NOP() {
-  auto net_op = std::make_shared<NetOp>();
+  auto net_op = std::make_shared<operators::NetOp>();
  net_op->type_ = "@NOP@";
  net_op->CompleteAddOp();
  return net_op;
@ -42,9 +42,9 @@ static std::shared_ptr<OperatorBase> NOP() {
 //
 //  no_grad_names the gradient variable names without gradient calculating.
 //
-//  uniq_id is a unique index used inside recursively calling BackwardRecursive.
-//  use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through
-//  recursive calling.
+//  uniq_id is a unique index used inside recursively calling
+//  BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
+//  pass `uniq_id` through recursive calling.
 //
 //  returns The backward operator. For simple situation, it is a simple
 //  operator. For complex situation, it is a NetOp.
@ -59,32 +59,30 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
  //  If all input gradients of forwarding operator do not need to calculate,
  //  just return an NOP. Not return null ptr because NOP does not take
  //  too much time for calculation, but it is useful for simplifying logic.
-  if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(),
-               no_grad_names)) {
+  if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
    return NOP();
  }

-  //  All output gradients of forwarding operator do not need to calculate. Then
-  //  all input gradients cannot be computed at all, and we put them into
+  //  All output gradients of forwarding operator do not need to calculate.
+  //  Then all input gradients cannot be computed at all, and we put them into
  //  `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
-               no_grad_names)) {
+  if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
    for (auto& name : forwardOp.inputs_) {
      // Mark all input is not need
-      no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+      no_grad_names.insert(name + kGradVarSuffix);
    }
    return NOP();
  }

  // Returned gradient network
-  auto net = std::make_shared<NetOp>();
+  auto net = std::make_shared<operators::NetOp>();

  if (forwardOp.IsNetOp()) {
    // Because forwardOp is a net op, it can static_cast.
-    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);

-    // Map from output gradient variable name to operator's indices in backward
-    // net. That operator generates that variable.
+    // Map from output gradient variable name to operator's indices in
+    // backward net. That operator generates that variable.
    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;

    size_t local_op_id = 0;
@ -134,9 +132,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
    for (std::string& grad_input : grad_op->inputs_) {
      if (no_grad_names.count(grad_input)) {
-        std::string prefix = grad_input.substr(
-            0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size());
-        grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX();
+        std::string prefix =
+            grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
+        grad_input = prefix + kZeroVarSuffix;

        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
@ -147,7 +145,7 @@ std::shared_ptr<OperatorBase> BackwardRecursive(

    for (std::string& grad_output : grad_op->outputs_) {
      if (no_grad_names.count(grad_output)) {
-        grad_output = OperatorBase::EMPTY_VAR_NAME();
+        grad_output = kEmptyVarName;
      }
    }

@ -168,14 +166,14 @@ std::shared_ptr<OperatorBase> Backward(
  std::unordered_set<std::string> no_grad_names;
  no_grad_names.reserve(no_grad_vars.size());

-  no_grad_names.insert(OperatorBase::EMPTY_VAR_NAME() +
-                       OperatorBase::GRAD_VAR_SUFFIX());
+  no_grad_names.insert(kEmptyVarName + kGradVarSuffix);

  for (auto& name : no_grad_vars) {
-    no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+    no_grad_names.insert(name + kGradVarSuffix);
  }
  size_t uid = 0;
  return BackwardRecursive(forwardOp, no_grad_names, uid);
 }
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@ -25,18 +25,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-namespace {
-typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
-                       Dim<8>, Dim<9>>
-    DDimVar;
-}
-
 /**
 * \brief A dynamically sized dimension.
 *
 * The number of dimensions must be between [1, 9].
 */
 struct DDim {
+  typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
+                         Dim<8>, Dim<9>>
+      DDimVar;
  DDimVar var;

  DDim() : var(Dim<1>()) {}
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@ -8,107 +8,95 @@ You may obtain a copy of the License at

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+express or implied. See the License for the specific language governing
+permissions and limitations under the License. */

 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace framework {

-OperatorBase* GradOpBuilder::Build() {
-  BuildOpInOutArgList();
-  std::string grad_op_type = OpRegistry::grad_ops().at(op_.type_);
-  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
-  grad_op->type_ = grad_op_type;
-  CompleteGradOp(grad_op);
-  return grad_op;
-}
+class OpRegistry;
+
+using VarIndexMap = std::unordered_map<std::string, int>;

-OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
-                                    const VarIndexMap& var_map,
-                                    const std::vector<int>& format,
-                                    InOutType type) {
-  int idx = var_map.at(var.name());
-  int begin_idx = format.empty() ? idx : format.at(idx);
-  int end_idx = format.empty() ? idx + 1 : format.at(idx + 1);
-  return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx,
-                        end_idx);
+enum class OpArgType { IN, OUT };
+
+static std::vector<int>* GetOpFormat(OperatorBase* op, const OpArgType& type) {
+  std::string key = type == OpArgType::IN ? "input_format" : "output_format";
+  return op->attrs_.count(key)
+             ? &boost::get<std::vector<int>>(op->attrs_.at(key))
+             : nullptr;
 }

-void GradOpBuilder::BuildOpInOutArgList() {
-  const OpProto& op_proto = OpRegistry::protos().at(op_.type_);
-  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_.type_));
-  const std::vector<int>& in_format =
-      op_.attrs_.count("input_format")
-          ? op_.GetAttr<std::vector<int>>("input_format")
-          : std::vector<int>();
-  const std::vector<int>& out_format =
-      op_.attrs_.count("output_format")
-          ? op_.GetAttr<std::vector<int>>("output_format")
-          : std::vector<int>();
-  for (const auto& var : op_proto.inputs()) {
-    arg_list_.emplace_back(
-        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, in_format, IN)));
-  }
-  for (const auto& var : op_proto.outputs()) {
-    arg_list_.emplace_back(
-        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, out_format, OUT)));
-  }
+static const std::vector<int>* GetOpFormat(const OperatorBase* op,
+                                           const OpArgType& type) {
+  std::string key = type == OpArgType::IN ? "input_format" : "output_format";
+  return op->attrs_.count(key)
+             ? &boost::get<std::vector<int>>(op->attrs_.at(key))
+             : nullptr;
 }

-void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
-                                     std::vector<std::string>& in_out,
-                                     std::vector<int>& format,
-                                     VarIndexMap* varmap, int& idx,
-                                     bool is_grad) const {
-  std::string var_name = arg->proto_name_;
-  if (is_grad) {
-    var_name += OperatorBase::GRAD_VAR_SUFFIX();
-  }
-  (*varmap)[var_name] = idx++;
-  size_t pre_sz = in_out.size();
-  auto base_it = arg->type_ == IN ? op_.inputs_.begin() : op_.outputs_.begin();
-  std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_,
-            std::back_inserter(in_out));
-  if (is_grad) {
-    for (size_t i = pre_sz; i < in_out.size(); ++i) {
-      in_out[i] += OperatorBase::GRAD_VAR_SUFFIX();
+static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
+                       const OpArgType& src_type, const OpArgType& dst_type,
+                       int& idx, bool is_grad) {
+  const std::vector<std::string>& src_inout =
+      src_type == OpArgType::IN ? src_op->inputs_ : src_op->outputs_;
+  const std::vector<int>* src_format = GetOpFormat(src_op, src_type);
+
+  std::vector<std::string>& dst_inout =
+      dst_type == OpArgType::IN ? dst_op->inputs_ : dst_op->outputs_;
+  std::vector<int>* dst_format = GetOpFormat(dst_op, dst_type);
+  const OpProto& proto = OpRegistry::protos().at(src_op->type_);
+  const auto& src_arg_list =
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
+
+  for (const auto& arg : src_arg_list) {
+    std::string src_name = arg.name();
+    std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name;
+    (*dst_op->in_out_idxs_)[dst_name] = idx++;
+    int src_arg_idx = src_op->in_out_idxs_->at(src_name);
+    int src_begin =
+        src_format == nullptr ? src_arg_idx : src_format->at(src_arg_idx);
+    int src_end = src_format == nullptr ? src_arg_idx + 1
+                                        : src_format->at(src_arg_idx + 1);
+    for (int i = src_begin; i < src_end; ++i) {
+      std::string s =
+          is_grad ? src_inout[i] + kGradVarSuffix
+                  : (arg.ignore_gradient() ? kEmptyVarName : src_inout[i]);
+      dst_inout.emplace_back(s);
+    }
+    if (dst_format != nullptr) {
+      dst_format->push_back(dst_inout.size());
    }
  }
-  format.push_back(in_out.size());
 }

-void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
-  grad_op->attrs_ = op_.attrs_;
+OperatorBase* BuildGradOp(const OperatorBase* op) {
+  std::string grad_op_type = OpRegistry::grad_ops().at(op->type_);
+  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
+  grad_op->type_ = grad_op_type;
+  grad_op->attrs_ = op->attrs_;
  grad_op->attrs_.erase("input_format");
  grad_op->attrs_.erase("output_format");
-  VarIndexMap* grad_varmap = new VarIndexMap();
+  if (GetOpFormat(op, OpArgType::IN) != nullptr) {
+    grad_op->attrs_["output_format"] = std::vector<int>({0});
+  }
+  if (GetOpFormat(op, OpArgType::IN) != nullptr ||
+      GetOpFormat(op, OpArgType::OUT) != nullptr) {
+    grad_op->attrs_["input_format"] = std::vector<int>({0});
+  }
+  grad_op->in_out_idxs_.reset(new VarIndexMap());
  int in_idx = 0;
  int out_idx = 0;
-  std::vector<int> in_format({0});
-  std::vector<int> out_format({0});
-  for (const auto& arg : arg_list_) {
-    // op_'s inputs_ and outputs_
-    if (arg->needed_in_grad_) {
-      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
-                       in_idx, false);
-    }
-    if (arg->type_ == IN) {
-      // gradients of op_'s inputs_
-      AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap,
-                       out_idx, true);
-    } else {
-      // gradients of op_'s outputs_
-      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
-                       in_idx, true);
-    }
-  }
-  grad_op->attrs_["input_format"] = in_format;
-  grad_op->attrs_["output_format"] = out_format;
-  grad_op->in_out_idxs_.reset(grad_varmap);
+  TransOpArg(op, grad_op, OpArgType::IN, OpArgType::IN, in_idx, false);   // I
+  TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, false);  // G
+  TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, true);   // OG
+  TransOpArg(op, grad_op, OpArgType::IN, OpArgType::OUT, out_idx, true);  // IG
+  return grad_op;
 }

 }  // namespace framework
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@ -1,48 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once

-#include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/operator.h"

 namespace paddle {
 namespace framework {
-class OpRegistry;
-
-enum InOutType { IN, OUT };
-
-struct OpInOutArg {
-  OpInOutArg(const std::string& proto_name, const InOutType& type,
-             bool needed_in_grad, size_t begin_idx, size_t end_idx)
-      : proto_name_(proto_name),
-        type_(type),
-        needed_in_grad_(needed_in_grad),
-        begin_idx_(begin_idx),
-        end_idx_(end_idx) {}
-
-  std::string proto_name_;
-  InOutType type_;
-  bool needed_in_grad_;
-  size_t begin_idx_;
-  size_t end_idx_;
-};
-
-class GradOpBuilder {
-  using VarIndexMap = std::unordered_map<std::string, int>;
-
- public:
-  GradOpBuilder(const OperatorBase& op) : op_(op) {}
-  OperatorBase* Build();
-
- private:
-  OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map,
-                       const std::vector<int>& format, InOutType type);
-  void BuildOpInOutArgList();
-  void AddArgIntoGradOp(const OpInOutArg* arg, std::vector<std::string>& in_out,
-                        std::vector<int>& format, VarIndexMap* varmap, int& idx,
-                        bool is_grad) const;
-  void CompleteGradOp(OperatorBase* grad_op) const;
-  const OperatorBase& op_;
-  std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
-};
+
+OperatorBase* BuildGradOp(const OperatorBase* op);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@ -8,10 +8,49 @@ USE_OP(add_two);
 namespace paddle {
 namespace framework {

+class NOP : public OperatorBase {
+ public:
+  void InferShape(const Scope &scope) const override {}
+  void Run(const Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {}
+};
+
+class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("In1", "a single input");
+    AddInput("In2_mult", "a multiple input").SetMultiple();
+    AddInput("In3", "another single input");
+    AddOutput("Out1", "a single output");
+    AddOutput("Out2_mult", "a multiple output").SetMultiple();
+    AddComment("test op with multiple inputs and outputs");
+  }
+};
+
+class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("In1", "a single input");
+    AddInput("In2_mult", "a multiple input").SetMultiple().IgnoreGradient();
+    AddInput("In3_mult", "another multiple input").SetMultiple();
+    AddOutput("Out1_mult", "a multiple output").SetMultiple();
+    AddOutput("Out2", "a single output").IgnoreGradient();
+    AddComment("op with inputs and outputs ignored in gradient calculating");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+namespace f = paddle::framework;
+
 TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<OperatorBase> add_op(
-      OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
-  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(*add_op);
+  std::shared_ptr<f::OperatorBase> add_op(
+      f::OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
+  std::shared_ptr<f::OperatorBase> grad_add_op =
+      f::OpRegistry::CreateGradOp(*add_op);
  EXPECT_EQ(static_cast<int>(grad_add_op->inputs_.size()), 4);
  EXPECT_EQ(static_cast<int>(grad_add_op->outputs_.size()), 2);
  EXPECT_EQ(grad_add_op->Input("X"), "x");
@ -22,5 +61,77 @@ TEST(GradOpBuilder, AddTwo) {
  EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD");
 }

-}  // namespace framework
-}  // namespace paddle
+REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker);
+REGISTER_GRADIENT_OP(mult_io, mult_io_grad, f::NOP);
+REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker);
+REGISTER_GRADIENT_OP(io_ignored, io_ignored_grad, f::NOP);
+
+TEST(GradOpBuilder, MutiInOut) {
+  f::AttributeMap attrs{{"input_format", std::vector<int>{0, 1, 4, 5}},
+                        {"output_format", std::vector<int>{0, 1, 3}}};
+  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
+      "mult_io", {"in1", "in2_1", "in2_2", "in2_3", "in3"},
+      {"out1", "out2_1", "out2_2"}, attrs));
+  std::shared_ptr<f::OperatorBase> grad_test_op =
+      f::OpRegistry::CreateGradOp(*test_op);
+
+  ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
+  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
+  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
+            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
+  EXPECT_EQ(grad_test_op->Input("In3"), "in3");
+  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
+  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
+            std::vector<std::string>({"out2_1", "out2_2"}));
+  EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix),
+            "out1" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix),
+            std::vector<std::string>(
+                {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix}));
+
+  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
+  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
+            "in1" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
+            std::vector<std::string>({"in2_1" + f::kGradVarSuffix,
+                                      "in2_2" + f::kGradVarSuffix,
+                                      "in2_3" + f::kGradVarSuffix}));
+  EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix),
+            "in3" + f::kGradVarSuffix);
+}
+
+TEST(GradOpBuilder, IOIgnoredInGradient) {
+  f::AttributeMap attrs{{"input_format", std::vector<int>{0, 1, 3, 5}},
+                        {"output_format", std::vector<int>{0, 2, 3}}};
+  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
+      "io_ignored", {"in1", "in2_1", "in2_2", "in3_1", "in3_2"},
+      {"out1_1", "out1_2", "out2"}, attrs));
+  std::shared_ptr<f::OperatorBase> grad_test_op =
+      f::OpRegistry::CreateGradOp(*test_op);
+
+  // 'In2' and 'Out2' are ignored in gradient calculating
+  ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
+  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
+  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
+            std::vector<std::string>({f::kEmptyVarName, f::kEmptyVarName}));
+  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
+            std::vector<std::string>({"in3_1", "in3_2"}));
+  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
+            std::vector<std::string>({"out1_1", "out1_2"}));
+  EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
+  EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix),
+            std::vector<std::string>(
+                {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix}));
+  EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix),
+            "out2" + f::kGradVarSuffix);
+
+  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
+  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
+            "in1" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
+            std::vector<std::string>(
+                {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix}));
+  EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix),
+            std::vector<std::string>(
+                {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix}));
+}
--- a/paddle/framework/op_desc.proto
+++ b/paddle/framework/op_desc.proto
@ -15,7 +15,7 @@ limitations under the License. */
 syntax="proto2";
 package paddle.framework;

-import "attr_type.proto";
+import "attribute.proto";

 // AttrDesc is used to describe Attributes of an Operator. It contains the
 // name, type, and value of an Attribute.
--- a/paddle/framework/op_proto.proto
+++ b/paddle/framework/op_proto.proto
@ -21,7 +21,7 @@ limitations under the License. */
 syntax="proto2";
 package paddle.framework;

-import "attr_type.proto";
+import "attribute.proto";

 // Attribute protocol message for 3rd-party language binding.
 // It stores which attributes the Op supports and the type of each.
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@ -14,37 +14,8 @@ limitations under the License. */

 #include <paddle/framework/op_registry.h>

-namespace paddle {
-namespace framework {
-
-template <>
-void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INT);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOAT);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRING);
-}
+#include <vector>

-template <>
-void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INTS);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOATS);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRINGS);
-}
-}  // namespace framework
+namespace paddle {
+namespace framework {}  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@ -19,7 +19,7 @@ limitations under the License. */
 #include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
-#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/attribute.h"
 #include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/framework/scope.h"
@ -27,49 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-// helper class to set attribute type
-struct AttrTypeHelper {
-  template <typename T>
-  static void SetAttrType(AttrProto* attr);
-
-  static Attribute GetAttrValue(const AttrDesc& attr_desc) {
-    switch (attr_desc.type()) {
-      case paddle::framework::AttrType::INT: {
-        return attr_desc.i();
-      }
-      case paddle::framework::AttrType::FLOAT: {
-        return attr_desc.f();
-      }
-      case paddle::framework::AttrType::STRING: {
-        return attr_desc.s();
-      }
-      case paddle::framework::AttrType::INTS: {
-        std::vector<int> val(attr_desc.ints_size());
-        for (int i = 0; i < attr_desc.ints_size(); ++i) {
-          val[i] = attr_desc.ints(i);
-        }
-        return val;
-      }
-      case paddle::framework::AttrType::FLOATS: {
-        std::vector<float> val(attr_desc.floats_size());
-        for (int i = 0; i < attr_desc.floats_size(); ++i) {
-          val[i] = attr_desc.floats(i);
-        }
-        return val;
-      }
-      case paddle::framework::AttrType::STRINGS: {
-        std::vector<std::string> val(attr_desc.strings_size());
-        for (int i = 0; i < attr_desc.strings_size(); ++i) {
-          val[i] = attr_desc.strings(i);
-        }
-        return val;
-      }
-    }
-    PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
-    return boost::blank();
-  }
-};
-
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
 public:
@ -136,7 +93,7 @@ class OpProtoAndCheckerMaker {
    *attr->mutable_name() = name;
    *attr->mutable_comment() = comment;
    attr->set_generated(generated);
-    AttrTypeHelper::SetAttrType<T>(attr);
+    attr->set_type(AttrTypeID<T>());
    return op_checker_->AddAttrChecker<T>(name);
  }

@ -297,7 +254,7 @@ class OpRegistry {

    AttributeMap attrs;
    for (auto& attr : op_desc.attrs()) {
-      attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
+      attrs[attr.name()] = GetAttrValue(attr);
    }

    return CreateOp(op_desc.type(), inputs, outputs, attrs);
@ -306,8 +263,7 @@ class OpRegistry {
  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
    PADDLE_ENFORCE(!op.IsNetOp(),
                   "Use framework::Backward to get backward ops");
-    GradOpBuilder builder(op);
-    std::shared_ptr<OperatorBase> grad_op(builder.Build());
+    std::shared_ptr<OperatorBase> grad_op(BuildGradOp(&op));
    grad_op->Init();
    return grad_op;
  }
@ -315,7 +271,7 @@ class OpRegistry {
  static std::unordered_map<std::string, OpProto>& protos() {
    static std::unordered_map<std::string, OpProto> protos_;
    return protos_;
-  };
+  }

  static std::unordered_map<std::string, std::string>& grad_ops() {
    static std::unordered_map<std::string, std::string> grad_ops_;
@ -337,12 +293,12 @@ class OpRegistry {
  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
    return op_checkers_;
-  };
+  }

  static void GenerateTempVariableName(OperatorBase* op) {
    static std::atomic<size_t> gUniqId(0UL);
    for (auto& outname : op->outputs_) {
-      if (outname == OperatorBase::TMP_VAR_NAME()) {
+      if (outname == kTempVarName) {
        outname += op->type_;
        outname += "@";
        outname += std::to_string(gUniqId.fetch_add(1));
@ -354,7 +310,7 @@ class OpRegistry {
 template <typename OpType, typename ProtoMakerType>
 class OpRegisterHelper {
 public:
-  OpRegisterHelper(const char* op_type) {
+  explicit OpRegisterHelper(const char* op_type) {
    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
  }
 };
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@ -20,7 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>

-#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/attribute.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/scope.h"
@ -32,9 +32,29 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+/// If a variable is an empty variable, this name will be used.
+const std::string kEmptyVarName = "@EMPTY@";
+
+/// If a variable is a temporary variable, this name will be set in Python,
+/// but it will be converted to a unique name in scope after OpCreator.
+const std::string kTempVarName = "@TEMP@";
+
+/// If a variable's name has a certain suffix, it means that the
+/// variable is the gradient of another variable.
+/// e.g. Variable "x@GRAD" is the gradient of variable "x".
+const std::string kGradVarSuffix = "@GRAD";
+
+/// Variables with this suffix are supposed to be filled up with zeros.
+const std::string kZeroVarSuffix = "@ZERO";
+
+inline std::string GradVarName(const std::string& var_name) {
+  return var_name + kGradVarSuffix;
+}
+
 class OperatorBase;
 class InferShapeContext;
 class ExecutionContext;
+
 /**
 * OperatorBase has the basic element that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@ -43,21 +63,6 @@ class ExecutionContext;
 */
 class OperatorBase {
 public:
-  /// If a variable is a empty variable, that name will be used.
-  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
-
-  /// If a variable is a temporary variable, that name will be set in Python,
-  /// but it will be convert to a unique name in scope after OpCreator.
-  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
-
-  /// If a variable's name has a certain suffix, it means that the
-  /// variable is the gradient of another varibale.
-  /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
-  static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
-
-  /// Variables with this suffix are supposed to be filled up with zeros.
-  static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
-
  virtual ~OperatorBase() {}

  template <typename T>
@ -280,7 +285,7 @@ class OperatorWithKernel : public OperatorBase {
    platform::Place place_;

    OpKernelKey() = default;
-    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
      place_ = dev_ctx.GetPlace();
    }

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-    http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@ -17,19 +17,19 @@ limitations under the License. */
 #include <vector>

 #include "paddle/framework/backward.h"
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/framework/tensor_py.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/type_alias.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
-#include "paddle/pybind/tensor_bind.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"

 namespace py = pybind11;
-namespace pd = paddle::framework;

 USE_OP(add_two);
 USE_OP(onehot_cross_entropy);
@ -41,17 +41,18 @@ USE_OP(sigmoid);
 USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP_WITHOUT_KERNEL(recurrent_op);
-
+namespace paddle {
+namespace framework {
 template <typename ClassType>
-void ExposeOperator(ClassType& m) {
+void ExposeOperator(ClassType &m) {
  m.def("infer_shape", &ClassType::type::InferShape)
      .def("run", &ClassType::type::Run)
      .def("type",
-           [](const typename ClassType::type& op) -> std::string {
+           [](const typename ClassType::type &op) -> std::string {
             return op.type_;
           })
      .def("outputs",
-           [](const typename ClassType::type& op) -> std::vector<std::string> {
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
             return op.outputs_;
           })
      .def("__str__", &ClassType::type::DebugString);
@ -73,80 +74,81 @@ bool IsCompileGPU() {
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of PaddlePaddle");

-  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
-      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
-        return paddle::pybind::CastToPyBuffer(self);
-      })
+  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer(
+          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
      .def("get_dims",
-           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
+           [](const Tensor &self) { return vectorize(self.dims()); })
      .def("set_dims",
-           [](pd::Tensor& self, const std::vector<int>& dim) {
-             self.Resize(pd::make_ddim(dim));
+           [](Tensor &self, const std::vector<int> &dim) {
+             self.Resize(make_ddim(dim));
           })
      .def("alloc_float",
-           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+           [](Tensor &self, paddle::platform::GPUPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_float",
-           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("alloc_int",
-           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("alloc_int",
-           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+           [](Tensor &self, paddle::platform::GPUPlace &place) {
             self.mutable_data<int>(place);
           })
-      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
+      .def("set", PyCPUTensorSetFromArray<float>)
+      .def("set", PyCPUTensorSetFromArray<int>)
 #ifndef PADDLE_ONLY_CPU
-      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
+      .def("set", PyCUDATensorSetFromArray<float>)
+      .def("set", PyCUDATensorSetFromArray<int>)
 #endif
-      .def("shape",
-           [](pd::Tensor& self) { return pd::vectorize(self.dims()); });
+      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
+      .def("set_float_element",
+           [](Tensor &self, size_t offset, float f) {
+             // TODO(yuyang18): Only support CPU now.
+             self.data<float>()[offset] = f;
+           })
+      .def("get_float_element", [](Tensor &self, size_t offset) -> float {
+        // TODO(yuyang18): Only support CPU now.
+        return self.data<float>()[offset];
+      });

-  py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.
+  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.

 All parameter, weight, gradient are variables in Paddle.
 )DOC")
-      .def("is_int", [](const pd::Variable& var) { return var.IsType<int>(); })
+      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
      .def("set_int",
-           [](pd::Variable& var, int val) -> void {
-             *var.GetMutable<int>() = val;
-           })
-      .def("get_int",
-           [](const pd::Variable& var) -> int { return var.Get<int>(); })
+           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
+      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
      .def("get_tensor",
-           [](pd::Variable& self) -> pd::Tensor* {
-             return self.GetMutable<pd::Tensor>();
-           },
+           [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
           py::return_value_policy::reference)
      .def("get_net",
-           [](pd::Variable& self) -> pd::NetOp* {
-             return self.GetMutable<pd::NetOp>();
+           [](Variable &self) -> ops::NetOp * {
+             return self.GetMutable<ops::NetOp>();
           },
           py::return_value_policy::reference);

-  py::class_<pd::Scope>(m, "Scope", "")
+  py::class_<Scope>(m, "Scope", "")
      .def("new_var",
-           [](pd::Scope& self, const std::string& name) -> pd::Variable* {
+           [](Scope &self, const std::string &name) -> Variable * {
             return self.NewVar(name);
           },
           py::return_value_policy::reference)
-      .def("find_var", &pd::Scope::FindVar, py::return_value_policy::reference)
+      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
      .def(py::init<>())
-      .def("new_scope",
-           [](pd::Scope& self) -> pd::Scope* { return &self.NewScope(); },
+      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
           py::return_value_policy::reference)
-      .def("drop_kids", &pd::Scope::DropKids);
+      .def("drop_kids", &Scope::DropKids);

  //! @note: Be careful! PyBind will return std::string as an unicode, not
  //! Python str. If you want a str object, you should cast them in Python.
  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
-    auto& protos = pd::OpRegistry::protos();
+    auto &protos = OpRegistry::protos();
    std::vector<py::bytes> ret_values;
    for (auto it = protos.begin(); it != protos.end(); ++it) {
      PADDLE_ENFORCE(it->second.IsInitialized(),
@ -161,8 +163,8 @@ All parameter, weight, gradient are variables in Paddle.
  m.def_submodule(
       "var_names",
       "The module will return special predefined variable name in Paddle")
-      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
-      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+      .def("empty", []() { return kEmptyVarName; })
+      .def("temp", []() { return kTempVarName; });
  // clang-format off
  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
      .def_static("create",
@ -185,43 +187,45 @@ All parameter, weight, gradient are variables in Paddle.

  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());

-  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
+  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
      m, "Operator");

  operator_base.def_static("create", [](py::bytes protobin) {
-    pd::OpDesc desc;
+    OpDesc desc;
    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                   "Cannot parse user input to OpDesc");
    PADDLE_ENFORCE(desc.IsInitialized(),
                   "User OpDesc is not initialized, reason %s",
                   desc.InitializationErrorString());
-    return pd::OpRegistry::CreateOp(desc);
+    return OpRegistry::CreateOp(desc);
  });

  operator_base.def("backward",
-                    [](const pd::OperatorBase& forwardOp,
-                       const std::unordered_set<std::string>& no_grad_vars) {
-                      return pd::Backward(forwardOp, no_grad_vars);
+                    [](const OperatorBase &forwardOp,
+                       const std::unordered_set<std::string> &no_grad_vars) {
+                      return Backward(forwardOp, no_grad_vars);
                    });

  ExposeOperator(operator_base);

-  py::class_<pd::NetOp, std::shared_ptr<pd::NetOp>> net(m, "Net");
+  py::class_<ops::NetOp, std::shared_ptr<ops::NetOp>> net(m, "Net");

  net.def_static("create",
-                 []() -> std::shared_ptr<pd::NetOp> {
-                   auto retv = std::make_shared<pd::NetOp>();
+                 []() -> std::shared_ptr<ops::NetOp> {
+                   auto retv = std::make_shared<ops::NetOp>();
                   retv->type_ = "plain_net";
                   return retv;
                 })
-      .def("add_op", &pd::NetOp::AddOp)
-      .def("add_op",
-           [](pd::NetOp& self, const std::shared_ptr<pd::NetOp>& net) -> void {
-             self.AddOp(std::static_pointer_cast<pd::OperatorBase>(net));
-           })
-      .def("complete_add_op", &pd::NetOp::CompleteAddOp)
+      .def("add_op", &ops::NetOp::AddOp)
+      .def(
+          "add_op",
+          [](ops::NetOp &self, const std::shared_ptr<ops::NetOp> &net) -> void {
+            self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+          })
+      .def("complete_add_op", &ops::NetOp::CompleteAddOp)
      .def("complete_add_op",
-           [](std::shared_ptr<pd::NetOp>& self) { self->CompleteAddOp(); });
+           [](std::shared_ptr<ops::NetOp> &self) { self->CompleteAddOp(); });
+
  ExposeOperator(net);

  m.def("unique_integer", UniqueIntegerGenerator);
@ -230,3 +234,5 @@ All parameter, weight, gradient are variables in Paddle.

  return m.ptr();
 }
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@ -26,19 +26,17 @@ limitations under the License. */
 #include "unsupported/Eigen/CXX11/Tensor"

 namespace paddle {
-namespace pybind {
-namespace details {  // forward declare
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}  // namespace details
-}  // namespace pybind

 namespace framework {
+namespace details {
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}

 class Tensor {
 public:
  template <bool less, size_t i, typename... args>
-  friend struct paddle::pybind::details::CastToPyBufferImpl;
+  friend struct details::CastToPyBufferImpl;

  template <typename T, size_t D, int MajorType, typename IndexType>
  friend struct EigenTensor;
--- a/paddle/framework/tensor_py.h
+++ b/paddle/framework/tensor_py.h
@ -23,7 +23,7 @@ namespace py = pybind11;

 namespace paddle {

-namespace pybind {
+namespace framework {

 namespace details {

@ -63,11 +63,8 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
      }
      return py::buffer_info(
          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
-          sizeof(CUR_TYPE),
-          py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(dst_tensor.dims()),
-          dims_outside,
-          strides);
+          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@ -110,8 +107,8 @@ void PyCUDATensorSetFromArray(

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
-  paddle::platform::GpuMemcpySync(
-      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 #endif

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() {
  size_t numSequences = getGenBatchSize();

  resizeBootFrame(numSequences);
-  // We create only two sub-network in generation for alternate use.
-  // Thus, we can reduce total memory of output_ in layer forward.
+  // We create only two sub-networks in generation: one stores the states of
+  // all layers at the previous time step and the other stores the states at
+  // the current time step.
  resizeOrCreateFrames(2);

  // outFrameLines_.size() > 1UL
@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() {

  // init outArg
  size_t resultNum = generator_.config.num_results_per_sample();
-  IVector::resizeOrCreate(
-      generator_.outArg.ids,
-      generator_.config.max_num_frames() * numSequences * resultNum,
-      false);
+  size_t maxGenWordCount =
+      generator_.config.max_num_frames() * numSequences * resultNum;
+  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
  if (resultNum > 1) {
    CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
    Matrix::resizeOrCreate(generator_.outArg.in,
@ -1012,6 +1012,11 @@ void RecurrentGradientMachine::generateSequence() {
                           /* width */ resultNum,
                           false,
                           /* useGpu */ false);
+    Matrix::resizeOrCreate(generator_.outArg.value,
+                           /* height */ maxGenWordCount,
+                           /* width */ 1,
+                           false,
+                           /* useGpu */ false);
  }
  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                numSequences + 1,
@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
  starts[0] = 0;
  if (numResults > 1) {
    real* probs = generator_.outArg.in->getData();
+    real* idsProb = generator_.outArg.value->getData();
+    size_t curPos = 0;
    for (size_t i = 0; i < finalPaths_.size(); ++i) {
      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
        Path& path = finalPaths_[i][j];
-        generator_.ids.push_back(path.ids.size());  // sequence size
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
        generator_.ids.insert(
            generator_.ids.end(), path.ids.begin(), path.ids.end());
        generator_.ids.push_back(-1);  // end of sequence
+
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;
        probs[i * numResults + j] = path.logProb;

        if (!j && dataArgsSize_) {
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@ -189,6 +189,11 @@ public:
     */
    std::vector<int> ids;

+    /**
+     * @brief idsProb, log probability of each generated word.
+     */
+    std::vector<real> idsProb;
+
    /**
     * @brief logProb, current probability of path.
     */
@ -228,11 +233,13 @@ public:
     */
    Path(Path& old, int newId, real logProb, int machineId, int topIndex)
        : ids(old.ids),
+          idsProb(old.idsProb),
          logProb(old.logProb + logProb),
          machineId(machineId),
          topIndex(topIndex),
          seqId(old.seqId) {
      ids.push_back(newId);
+      idsProb.push_back(logProb);
      if (!old.probHistory.empty()) {
        this->probHistory = old.probHistory;
        // probHistory store current prob, not sum
@ -411,8 +418,9 @@ protected:

  struct Generator {
    GeneratorConfig config;
-    std::vector<int> ids;  // store generated sequences
-    Argument outArg;       // final output argument
+    std::vector<int> ids;       // store generated sequences
+    std::vector<real> idsProb;  // log probability of each generated word
+    Argument outArg;            // final output argument
  };
  bool generating_;
  Generator generator_;
--- a/Show More
+++ b/Show More