commit
483947c45d
@ -0,0 +1,48 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/framework/lod_rank_table.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace framework {
|
||||
void LoDRankTable::Reset(const LoD& lod, size_t level) {
|
||||
this->coarse_lod_.clear();
|
||||
this->items_.clear();
|
||||
PADDLE_ENFORCE(level < lod.size(),
|
||||
"Cannot rank lod since the level %d is less than lod size %d",
|
||||
level, lod.size());
|
||||
coarse_lod_.reserve(level);
|
||||
for (size_t i = 0; i < level; ++i) {
|
||||
coarse_lod_.push_back(lod[i]);
|
||||
}
|
||||
auto& vec = lod[level];
|
||||
for (size_t i = 0; i < vec.size() - 1; ++i) {
|
||||
TableItem item;
|
||||
item.index = i;
|
||||
item.length = vec[i + 1] - vec[i];
|
||||
items_.emplace_back(item);
|
||||
}
|
||||
// NOTE(yuyang18):
|
||||
//
|
||||
// The time complexity of stable_sort is O(N*log(N)) if additional memory is
|
||||
// available. It is easy to debug and unit test when using `stable_sort`
|
||||
// instead of `sort`. Also, the items of a rank table will not be too large.
|
||||
std::stable_sort(items_.begin(), items_.end(),
|
||||
[](const TableItem& a, const TableItem& b) {
|
||||
return a.length > b.length;
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace framework
|
||||
} // namespace paddle
|
@ -0,0 +1,55 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
#include "paddle/framework/lod_tensor.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace framework {
|
||||
|
||||
// LoD Rank Table stores the `level` of `lod` which is ordered by sequence
|
||||
// length in descending order. It is useful when implement dynamic RNN and is
|
||||
// shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice
|
||||
// output operators.
|
||||
//
|
||||
// The table item contains two element. The length of sequence and the index of
|
||||
// sequence in that level.
|
||||
//
|
||||
// LoDRankTable also stores the coarse_lod, which is the lod information whose
|
||||
// level is less than input level, in order to restore the output LoD
|
||||
// information.
|
||||
class LoDRankTable {
|
||||
public:
|
||||
struct TableItem {
|
||||
size_t index;
|
||||
size_t length;
|
||||
};
|
||||
|
||||
LoDRankTable() {}
|
||||
|
||||
void Reset(const LoD& lod, size_t level);
|
||||
|
||||
const std::vector<TableItem>& items() const { return this->items_; }
|
||||
|
||||
const LoD& coarse_lod() const { return this->coarse_lod_; }
|
||||
|
||||
size_t level() const { return coarse_lod_.size(); }
|
||||
|
||||
private:
|
||||
LoD coarse_lod_;
|
||||
std::vector<TableItem> items_;
|
||||
};
|
||||
|
||||
} // namespace framework
|
||||
} // namespace paddle
|
@ -0,0 +1,23 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include "paddle/framework/lod_tensor.h"
|
||||
|
||||
namespace paddle {
namespace framework {
// A LoDTensorArray is simply an ordered collection of LoDTensors.
using LoDTensorArray = std::vector<LoDTensor>;
}  // namespace framework
}  // namespace paddle
|
@ -0,0 +1,154 @@
|
||||
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "MKLDNNAddtoLayer.h"
|
||||
|
||||
using namespace mkldnn; // NOLINT
|
||||
|
||||
namespace paddle {
|
||||
|
||||
REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
|
||||
|
||||
// Initializes the layer: validates that every input has the same size as the
// output and creates the optional bias weight. Returns false if base-class
// init fails.
bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
                            const ParameterMap& parameterMap) {
  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
    return false;
  }

  // Element-wise addto requires every input size to equal the layer size;
  // layerSize_ is fixed here and rechecked in reshape().
  layerSize_ = getSize();
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal";
  }
  // Optional bias of shape (1, layerSize_). Note that resetFwd() currently
  // aborts with LOG(FATAL) when a bias is present.
  if (biasParameter_.get() != NULL) {
    biases_ =
        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
  }
  return true;
}
||||
|
||||
// Recomputes batch/spatial dimensions from the inputs and resizes the output.
// All dimension parameters except `oc` are in-out references filled here.
void MKLDNNAddtoLayer::reshape(
    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
  // The total layer size was fixed in init() and must never change.
  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
  reshapeInput(bs, ih, iw);
  // Derive channel count from the first input; the division must be exact.
  ic = inputLayers_[0]->getSize() / ih / iw;
  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
  // Every input must agree on batch size and total size.
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
  }

  // Element-wise addition: the output dimensions mirror the input dimensions.
  oc = ic;
  oh = ih;
  ow = iw;
  reshapeOutput(oh, ow);
  resizeOutput(bs, oc * oh * ow);
  printSizeInfo();
}
||||
|
||||
// Builds the forward pass: input/output buffers, the mkldnn sum primitive
// descriptor, and the execution pipeline. `wgt` and `bias` are unused here.
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& in,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
  // Bias addition is not implemented in this MKL-DNN layer yet.
  if (biases_) {
    LOG(FATAL) << "not implemented yet";
  }
  // Prepare one value buffer per input plus the output buffer.
  resetFwdBuffers(inVals_, out);
  // Only the first input is reported through `in`; the full list stays in
  // inVals_.
  in = inVals_[0];

  // Create the sum primitive descriptor over all inputs ...
  std::shared_ptr<sum::primitive_desc> fwdPD;
  resetFwdPD(fwdPD, inVals_, out);

  // ... and append the sum primitive to the pipeline.
  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
||||
|
||||
// Builds the backward pass. Addto's gradient w.r.t. each input equals the
// output gradient, so no primitive is added to the pipeline; the input grad
// buffers simply alias the output grad memory.
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& in,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
  resetBwdBuffers(inGrads_, out);
  // Only the first input grad is reported through `in`.
  in = inGrads_[0];

  // backward only need share output grad to input grad
  for (size_t i = 0; i < inGrads_.size(); i++) {
    if (inGrads_[i] != nullptr) {
      // Point both the mkldnn matrix and the layer's output grad at the
      // shared output-gradient data.
      inGrads_[i] = out;
      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
    }
  }
}
||||
|
||||
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
|
||||
if (biases_ && biases_->getWGrad()) {
|
||||
biases_->getParameterPtr()->incUpdate(callback);
|
||||
}
|
||||
}
|
||||
|
||||
// Prepares one mkldnn value buffer per input layer and the output buffer.
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                                       MKLDNNMatrixPtr& out) {
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
    resetInValue(inputs[i], nullptr, i);
    CHECK(inputs[i]);
    inputs[i]->downSpatial();
  }
  // The sum primitive requires all sources to share one memory layout.
  for (size_t i = 1; i < inputs.size(); i++) {
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
  }

  // The output inherits the common input layout.
  resetOutValue(out, inputs[0]->getPrimitiveDesc());
}
||||
|
||||
// Creates the mkldnn sum primitive descriptor over all input buffers.
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
                                  std::vector<MKLDNNMatrixPtr>& inputs,
                                  MKLDNNMatrixPtr out) {
  // Addto is a plain sum: every input contributes with coefficient 1.0.
  std::vector<double> scales(inputs.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
  for (size_t i = 0; i < inputs.size(); i++) {
    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
  }
  CHECK(out);
  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
  // The primitive's destination layout must match the prepared output buffer.
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
||||
|
||||
void MKLDNNAddtoLayer::resetFwdPipeline(
|
||||
std::vector<primitive>& pipeline,
|
||||
std::shared_ptr<sum::primitive_desc>& pd,
|
||||
std::vector<MKLDNNMatrixPtr>& inputs,
|
||||
MKLDNNMatrixPtr& out) {
|
||||
std::vector<primitive::at> srcs;
|
||||
for (size_t i = 0; i < inputs.size(); i++) {
|
||||
srcs.push_back(*(inputs[i]));
|
||||
}
|
||||
fwd_.reset(new sum(*pd, srcs, *out));
|
||||
pipeline.push_back(*fwd_);
|
||||
}
|
||||
|
||||
// Prepares the output-gradient buffer and one input-gradient buffer per
// input layer for the backward pass.
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                                       MKLDNNMatrixPtr& out) {
  // Forward must have run already; reuse the forward output's layout.
  CHECK(outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
  CHECK(out);

  // NOTE(review): input grads are created from inVal_'s primitive desc but
  // then checked against out's — this assumes the two layouts always match
  // here; confirm against MKLDNNLayer's buffer setup.
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }
}
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,110 @@
|
||||
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "MKLDNNLayer.h"
|
||||
#include "mkldnn.hpp"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
|
||||
* @brief A subclass of MKLDNNLayer Addto layer.
|
||||
*
|
||||
* The config file api is mkldnn_addto
|
||||
*/
|
||||
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
  // One value/grad buffer per input layer; populated by resetFwd/resetBwd.
  std::vector<MKLDNNMatrixPtr> inVals_;
  std::vector<MKLDNNMatrixPtr> inGrads_;

  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;

  // TODO(TJ): this part has not been optimized by MKL-DNN
  std::unique_ptr<Weight> biases_;

public:
  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}

  ~MKLDNNAddtoLayer() {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;

  void reshape(
      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;

  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
                MKLDNNMatrixPtr& in,
                MKLDNNMatrixPtr& wgt,
                MKLDNNMatrixPtr& bias,
                MKLDNNMatrixPtr& out) override;

  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
                MKLDNNMatrixPtr& in,
                MKLDNNMatrixPtr& wgt,
                MKLDNNMatrixPtr& bias,
                MKLDNNMatrixPtr& out) override;

  void updateWeights(const UpdateCallback& callback) override;

  // Logs the mkldnn memory format of each input value, the internal output
  // value, and the external output value (">>>" marks forward direction).
  void printValueFormat() override {
    for (size_t i = 0; i < inVals_.size(); ++i) {
      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
    }
    if (outVal_) {
      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
    }
    if (extOutVal_) {
      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
    }
  }

  // Logs gradient memory formats, mirroring printValueFormat but in the
  // backward ("<<<") direction.
  void printGradFormat() override {
    if (extOutGrad_) {
      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
    }
    if (outGrad_) {
      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
    }
    for (size_t i = 0; i < inGrads_.size(); ++i) {
      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
    }
  }

protected:
  /**
   * Forward functions: reset buffers(inputs, output, bias),
   * reset primitive descriptor,
   * reset pipeline.
   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
                  std::vector<MKLDNNMatrixPtr>& inputs,
                  MKLDNNMatrixPtr out);
  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
                        std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out);

  /**
   * Backward functions: reset buffers(inputs, output, bias)
   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                       MKLDNNMatrixPtr& out);
};
|
||||
|
||||
} // namespace paddle
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue