Merge branch 'develop' into sparse_vector

revert-4814-Add_sequence_project_op
Luo Tao 8 years ago
commit c2feab7ff1

@@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y \
git python-pip python-dev openssh-server bison \
git python-pip python-dev openssh-server bison libnccl-dev \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \

@@ -189,7 +189,7 @@ OpDesc {
inputs = {0} // the index of x in vars of BlockDesc above
outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
attrs {
"memories" : {1} // the index of h
"states" : {1} // the index of h
"step_net" : <above step net>
}
};

@@ -19,15 +19,15 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(framework_proto SRCS framework.proto)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc glog)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto)

@@ -21,6 +21,7 @@
#include "paddle/framework/block_desc.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/dynamic_recurrent_op.h"
#include "paddle/operators/net_op.h"
#include "paddle/operators/recurrent_op.h"
@@ -220,8 +221,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// process recurrent gradient op as a special operator.
if (forwardOp.Type() == "recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
// or
// this will result in infinite loop.
// or this will result in infinite loop.
const auto& rnnop =
*static_cast<const operators::RecurrentOp*>(&forwardOp);
auto rnn_grad_op =
@@ -231,6 +231,18 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// create stepnet's gradient op
rnn_grad_op->set_stepnet(
BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
} else if (forwardOp.Type() == "dynamic_recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
// or this will result in infinite loop.
const auto& rnnop =
*static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
auto rnn_grad_op =
static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
const auto& stepnet_op =
*static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
// create stepnet's gradient op
rnn_grad_op->rnn.SetStepUnit(
BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
}
if (net->ops_.empty()) { // Currently no aux op is added to the network

@@ -41,6 +41,19 @@ bool BlockDescBind::HasVar(const std::string &name) const {
return vars_.find(name) != vars_.end();
}
VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
auto it = vars_.find(name);
if (it == vars_.end()) {
return Parent() == kNoneBlockIndex ? nullptr
: ParentBlock()->FindVarRecursive(name);
}
return it->second.get();
}
bool BlockDescBind::HasVarRecursive(const std::string &name) const {
return FindVarRecursive(name) != nullptr;
}
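The lookup added here climbs the parent chain until the name is found or the root block is passed. For reference, a self-contained sketch of the same pattern (the Block type and its members below are illustrative stand-ins, not Paddle's API):

#include <map>
#include <string>

// Illustrative stand-in: a block defers to its parent when a name is not
// local, mirroring FindVarRecursive/HasVarRecursive above.
struct Block {
  const Block* parent = nullptr;  // nullptr plays the role of kNoneBlockIndex
  std::map<std::string, int> vars;  // name -> some descriptor

  const int* FindVarRecursive(const std::string& name) const {
    auto it = vars.find(name);
    if (it != vars.end()) return &it->second;
    return parent ? parent->FindVarRecursive(name) : nullptr;
  }
  bool HasVarRecursive(const std::string& name) const {
    return FindVarRecursive(name) != nullptr;
  }
};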
std::vector<VarDescBind *> BlockDescBind::AllVars() const {
std::vector<VarDescBind *> res;
for (const auto &p : vars_) {
@@ -97,7 +110,7 @@ void BlockDescBind::Flush() {
}
BlockDescBind *BlockDescBind::ParentBlock() const {
if (this->desc_->parent_idx() == -1) {
if (this->desc_->parent_idx() == kNoneBlockIndex) {
return nullptr;
}
return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));

@@ -21,6 +21,7 @@ limitations under the License. */
#include <vector>
#include "paddle/framework/op_desc.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/framework/var_desc.h"
#include "paddle/platform/macros.h"
@@ -56,6 +57,10 @@ class BlockDescBind {
bool HasVar(const std::string &var_name) const;
VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
bool HasVarRecursive(const std::string &var_name) const;
std::set<std::string> LocalVarNames() const {
std::set<std::string> var_names;
for (auto &var : vars_) {

@@ -68,6 +68,7 @@ message OpProto {
optional bool duplicable = 3 [ default = false ];
optional bool intermediate = 4 [ default = false ];
optional bool dispensable = 5 [ default = false ];
}
// AttrProto describes the C++ type Attribute.

@@ -25,31 +25,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
for (size_t i = level_begin; i < level_end; i++) {
new_lod.emplace_back(in.at(i));
}
// transform the lowest level to absolute offset.
LoD abs_offset_lod = ToAbsOffset(in);
new_lod.back() = abs_offset_lod[level_end - 1];
return new_lod;
}
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end) {
// slice the lod.
LoD new_lod;
new_lod.reserve(in.size() - level);
auto start = in.at(level)[elem_begin];
auto end = in.at(level)[elem_end];
for (auto it = in.begin() + level; it != in.end(); it++) {
auto it_begin = std::find(it->begin(), it->end(), start);
auto it_end = std::find(it_begin, it->end(), end);
PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
new_lod.emplace_back(it_begin, it_end + 1);
// reset offset if tensor is copied and sliced.
std::transform(new_lod.back().begin(), new_lod.back().end(),
new_lod.back().begin(),
[start](int v) { return v - start; });
PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
PADDLE_ENFORCE_LT(level, in.size());
PADDLE_ENFORCE_LT(elem_end, in[level].size());
LoD res;
res.resize(in.size() - level);
// copy the sliced portion of the requested level as the first level
res[0].assign(in[level].begin() + elem_begin,
in[level].begin() + elem_end + 1);
for (size_t lvl = 1; lvl < res.size(); lvl++) {
const auto& in_level = in[level + lvl];
const auto& above_level = res[lvl - 1];
auto& out_level = res[lvl];
out_level.assign(in_level.begin() + above_level.front(),
in_level.begin() + above_level.back() + 1);
}
PADDLE_ENFORCE_LE(new_lod.size(), in.size());
return new_lod;
for (size_t lvl = 0; lvl < res.size(); lvl++) {
// to make the first offset equal 0, subtract the first element from all
// elements in this level
size_t front = res[lvl].front();
for (auto& ele : res[lvl]) {
ele -= front;
}
}
return res;
}
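To trace the new slicing logic, here is a self-contained sketch with a worked example; plain std::vector stands in for Paddle's Vector, and asserts replace PADDLE_ENFORCE:

#include <cassert>
#include <cstddef>
#include <vector>

using SimpleLoD = std::vector<std::vector<size_t>>;

SimpleLoD SliceInLevelSketch(const SimpleLoD& in, size_t level,
                             size_t elem_begin, size_t elem_end) {
  assert(level < in.size());
  assert(elem_end < in[level].size());
  SimpleLoD res(in.size() - level);
  // keep offsets [elem_begin, elem_end] of the requested level
  res[0].assign(in[level].begin() + elem_begin,
                in[level].begin() + elem_end + 1);
  for (size_t lvl = 1; lvl < res.size(); lvl++) {
    const auto& above = res[lvl - 1];
    res[lvl].assign(in[level + lvl].begin() + above.front(),
                    in[level + lvl].begin() + above.back() + 1);
  }
  for (auto& lv : res) {  // renormalize so every level starts at 0
    size_t front = lv.front();
    for (auto& e : lv) e -= front;
  }
  return res;
}

int main() {
  SimpleLoD in = {{0, 2, 3}, {0, 2, 4, 7}, {0, 2, 5, 7, 10, 12, 15, 20}};
  // slicing element 1 of level 1 picks offsets {2, 4}, then {5, 7, 10} one
  // level down; renormalization shifts both to start at 0.
  assert((SliceInLevelSketch(in, 1, 1, 2) == SimpleLoD{{0, 2}, {0, 2, 5}}));
}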
LoD ToAbsOffset(const LoD& in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) {
for (auto& ele : result[level]) {
ele = result[level + 1][ele];
}
}
return result;
}
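And the companion pass: a worked example of the bottom-up transform in ToAbsOffset under the same stand-in types, showing how each upper-level entry is replaced by the value it indexes in the level below:

#include <cassert>
#include <cstddef>
#include <vector>

using SimpleLoD = std::vector<std::vector<size_t>>;

// Same algorithm as ToAbsOffset above, restated over plain std::vector.
SimpleLoD ToAbsOffsetSketch(SimpleLoD in) {
  if (in.size() <= 1) return in;
  for (int level = static_cast<int>(in.size()) - 2; level >= 0; level--) {
    for (auto& ele : in[level]) ele = in[level + 1][ele];
  }
  return in;
}

int main() {
  SimpleLoD rel = {{0, 2, 3}, {0, 2, 4, 7}, {0, 2, 5, 7, 10, 12, 15, 20}};
  SimpleLoD absolute = ToAbsOffsetSketch(rel);
  assert((absolute[1] == std::vector<size_t>{0, 5, 10, 20}));  // via level 2
  assert((absolute[0] == std::vector<size_t>{0, 10, 20}));  // via updated level 1
}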
bool operator==(const LoD& a, const LoD& b) {
@@ -75,17 +94,7 @@ bool operator==(const LoD& a, const LoD& b) {
size_t LoDTensor::NumElements(size_t level, size_t idx) const {
PADDLE_ENFORCE_LT(level, NumLevels());
PADDLE_ENFORCE_LT(idx, NumElements(level));
// the last level of LoD, just return number of records in Tensor
if (level == NumLevels() - 1) {
return lod_[level][idx + 1] - lod_[level][idx];
}
// high level of LoD, and there is another lower level, return number of
// lower-level elements
auto tmp = SliceInLevel(lod_, level, idx, idx + 1);
PADDLE_ENFORCE_GE(tmp.size(), 2);
// there is a 0 as a placeholder stored in LoD, so the number of elements
// equals lod.size() - 1
return tmp[1].size() - 1;
}
void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {

@@ -39,23 +39,36 @@ using Vector = thrust::host_vector<
#endif
/*
* 3-level LoD stores
*
* 0 10 20
* 0 5 10 15 20
* 0 2 5 7 10 12 15 20
* LoD is short for Level of Details.
*
* - in a level, each element indicates offset in the underlying Tensor
* - in a level, each element indicates relative offset of the lower level
* - the first element should be 0, indicating that this sequence starts
* from 0
* - each sequence's begin and end (non-inclusive) are level[id] and
* level[id+1]
*
* For example:
* 3-level LoD stores
*
* 0 2 3
* 0 2 4 7
* 0 2 5 7 10 12 15 20
*/
using LoD = std::vector<Vector<size_t>>;
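To make the relative-offset convention concrete: sequence id at a given level spans [level[id], level[id+1]), so its length in lower-level units is the difference. A hypothetical helper, not part of this header:

// Hypothetical helper: number of lower-level elements in sequence `id`.
inline size_t SeqLength(const LoD& lod, size_t level, size_t id) {
  return lod[level][id + 1] - lod[level][id];
}
// For the 3-level example above, SeqLength(lod, 0, 0) == 2 and
// SeqLength(lod, 1, 2) == 3: level-1 sequence 2 spans offsets [4, 7).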
/*
* Slice levels from a LoD.
* NOTE the lowest level should always store the absolute offsets into the
* underlying tensor instances. So if higher levels are sliced without the
* lowest level, the lowest level of the sliced LoD is transformed to
* absolute offsets.
*/
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end);
/*
* Transform an LoD from relative offsets to absolute offsets.
*/
LoD ToAbsOffset(const LoD& in);
bool operator==(const LoD& a, const LoD& b);

@@ -30,8 +30,8 @@ class LoDTensorTester : public ::testing::Test {
// 0 5 10 15 20
// 0 2 5 7 10 12 15 20
LoD lod;
lod.push_back(std::vector<size_t>{0, 10, 20});
lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
lod.push_back(std::vector<size_t>{0, 2, 3});
lod.push_back(std::vector<size_t>{0, 2, 5, 8});
lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
ASSERT_EQ(lod.size(), 3UL);
@@ -52,14 +52,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
TEST_F(LoDTensorTester, NumElements) {
ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(1), 4UL);
ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
}
TEST_F(LoDTensorTester, NumElements2) {
ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
}
TEST_F(LoDTensorTester, ShrinkLevels) {
@@ -68,17 +68,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
// shrink 2 level
for (size_t level = 0; level < 2UL; ++level) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 2);
// the lowest level's last element should be the tensor's batch_size.
ASSERT_EQ(new_lod_tensor.lod().back().back(),
lod_tensor_.lod().back().back());
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
ASSERT_EQ(new_lod_tensor.NumElements(1),
lod_tensor_.NumElements(level + 1));
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
}
@@ -86,19 +85,19 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
TEST_F(LoDTensorTester, ShrinkInLevel) {
size_t level = 0;
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 0, 2);
new_lod_tensor.ShrinkInLevel(level, 0, 1);
EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL);
EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL);
EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
level = 1;
new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 0, 2);
new_lod_tensor.ShrinkInLevel(level, 1, 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}

@@ -44,6 +44,11 @@ class OpProtoAndCheckerMaker {
var_->set_intermediate(true);
return *this;
}
VariableBuilder& AsDispensable() {
var_->set_dispensable(true);
return *this;
}
};
VariableBuilder AddInput(const std::string& name, const std::string& comment);
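A sketch of how the new flag reads at a call site; MyOpMaker and its inputs are hypothetical, and the constructor signature and header path are assumptions based on the surrounding framework code:

#include "paddle/framework/op_proto_maker.h"  // assumed header for the maker

namespace paddle {
namespace framework {

// Hypothetical maker: a dispensable input may be absent at runtime, so
// HasInput("Bias") can return false without raising an error.
class MyOpMaker : public OpProtoAndCheckerMaker {
 public:
  MyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The main input tensor.");
    AddInput("Bias", "Optional bias tensor.").AsDispensable();
  }
};

}  // namespace framework
}  // namespace paddle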

@@ -252,5 +252,20 @@ std::ostream& operator<<(std::ostream& os,
return os;
}
bool OpSupportGPU(const std::string& op_type) {
auto& all_kernels = OperatorWithKernel::AllOpKernels();
auto it = all_kernels.find(op_type);
if (it == all_kernels.end()) {
// All control operators must support GPU
return true;
}
for (auto& kern_pair : it->second) {
if (platform::is_gpu_place(kern_pair.first.place_)) {
return true;
}
}
return false;
}
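A sketch of the intended use of OpSupportGPU; ChoosePlace is hypothetical, and the place types and header paths are assumed from paddle/platform:

#include <string>

#include "paddle/framework/operator.h"  // declares OpSupportGPU (this diff)
#include "paddle/platform/place.h"      // assumed location of the place types

namespace paddle {

// Hypothetical call site: prefer the GPU only when a GPU kernel is registered.
platform::Place ChoosePlace(const std::string& op_type, bool use_gpu) {
  if (use_gpu && framework::OpSupportGPU(op_type)) {
    return platform::GPUPlace(0);  // device 0, for illustration
  }
  return platform::CPUPlace();
}

}  // namespace paddle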
} // namespace framework
} // namespace paddle

@@ -327,37 +327,47 @@ class CompileTimeInferShapeContext : public InferShapeContext {
bool HasInput(const std::string& name) const override {
const std::vector<std::string>& input_names = op_.Input(name);
auto length = input_names.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have only one value, "
"but it have %d now",
name, length);
return block_.HasVar(input_names[0]);
return block_.HasVarRecursive(input_names[0]);
}
bool HasOutput(const std::string& name) const override {
const std::vector<std::string>& output_names = op_.Output(name);
auto length = output_names.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Output(%s) should have only one value, "
"but it have %d now",
name, length);
return block_.HasVar(output_names[0]);
return block_.HasVarRecursive(output_names[0]);
}
bool HasInputs(const std::string& name) const override {
const std::vector<std::string>& input_names = op_.Input(name);
PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name);
if (input_names.empty()) {
return false;
}
for (auto& input : input_names) {
if (!block_.HasVar(input)) return false;
if (!block_.HasVarRecursive(input)) return false;
}
return true;
}
bool HasOutputs(const std::string& name) const override {
const std::vector<std::string>& output_names = op_.Output(name);
PADDLE_ENFORCE(!output_names.empty(), "Inputs(%s) length is 0", name);
if (output_names.empty()) {
return false;
}
for (auto& output : output_names) {
if (!block_.HasVar(output)) return false;
if (!block_.HasVarRecursive(output)) return false;
}
return true;
}
@@ -404,11 +414,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
private:
DDim GetDim(const std::string& name) const override {
return framework::make_ddim(block_.FindVar(name)->Shape());
return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
}
void SetDim(const std::string& name, const DDim& dim) override {
block_.FindVar(name)->SetShape(framework::vectorize(dim));
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
}
const OpDescBind& op_;
@@ -421,13 +431,27 @@ class RuntimeInferShapeContext : public InferShapeContext {
: op_(op), scope_(scope) {}
bool HasInput(const std::string& name) const override {
auto ipt = op_.Input(name);
auto& ins = Inputs(name);
size_t length = ins.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
name);
auto ipt = ins[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
}
bool HasOutput(const std::string& name) const override {
auto ipt = op_.Output(name);
auto& outs = Outputs(name);
size_t length = outs.size();
if (length == 0) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
name);
auto ipt = outs[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
}
@@ -649,5 +673,7 @@ class OperatorWithKernel : public OperatorBase {
std::ostream& operator<<(std::ostream& os,
const OperatorWithKernel::OpKernelKey& kernel_key);
extern bool OpSupportGPU(const std::string& op_type);
} // namespace framework
} // namespace paddle

@@ -35,8 +35,8 @@ ProgramDesc *ProgramDescBind::Proto() {
ProgramDescBind::ProgramDescBind() {
auto *block = prog_.mutable_blocks()->Add();
block->set_idx(0);
block->set_parent_idx(-1);
block->set_idx(kRootBlockIndex);
block->set_parent_idx(kNoneBlockIndex);
blocks_.emplace_back(new BlockDescBind(this, block));
}

@@ -17,6 +17,7 @@ limitations under the License. */
#include <memory>
#include <vector>
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h"
namespace paddle {

@@ -0,0 +1,26 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace framework {
// The index of the first block in a program, also called the root block.
constexpr int kRootBlockIndex = 0;
// The parent index of the root block; such a parent block does not exist.
constexpr int kNoneBlockIndex = -1;
} // namespace framework
} // namespace paddle

@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
copyInVal_ = nullptr;
if (act.grad && algo == algorithm::eltwise_tanh) {
// tanh needs to save the src input for backward
inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
CHECK(copyInVal_) << "should not be empty";
pipelineFwd_.push_back(*copyInVal_);
@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
algorithm algo = getAlgo(this->getName());
float alpha = getBwdAlpha();
float beta = getBeta();
grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
auto eng = CPUEngine::Instance().getEngine();
auto bwdDesc = eltwise_bwd::desc(
algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
int ic = cnt_ / bs / ih / iw;
CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
val_ = MKLDNNMatrix::create(
act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
{bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
CHECK(val_);
val_->downSpatial();
}

@@ -21,8 +21,8 @@ namespace paddle {
typedef enum {
MKLDNN_BASE = 1, // basic info of MKLDNN
MKLDNN_TESTS = 1, // gtest info of MKLDNN
MKLDNN_SIZES = 2, // size info of MKLDNN
MKLDNN_FMTS = 3, // format info of MKLDNN
MKLDNN_FMTS = 2, // format info of MKLDNN
MKLDNN_SIZES = 3, // size info of MKLDNN
MKLDNN_ALL = 4, // show all info of MKLDNN
} MKLDNN_LOG_LEVEL;
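These levels gate glog's VLOG verbosity, so after this swap -v 2 prints format flow and -v 3 additionally prints sizes. Illustrative fragments only (wgt, ic, ih, iw stand for the layer members used in the real code):

VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
VLOG(MKLDNN_SIZES) << "ic: " << ic << ", ih: " << ih << ", iw: " << iw;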

File diff suppressed because it is too large.

@@ -48,17 +48,6 @@ protected:
// save forward primitive_desc, which can be used backward
std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuInVal_;
MKLDNNMatrixPtr cpuInGrad_;
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtInVal_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// whether the weight has been init
bool hasInitedWgt_;
@@ -94,8 +83,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
@@ -109,26 +96,6 @@ public:
<< ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
}
void printValueFormatFlow() override {
if (cpuInVal_) {
VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
}
MKLDNNLayer::printValueFormatFlow();
if (cpuOutVal_) {
VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
}
}
void printGradFormatFlow() override {
if (cpuInGrad_) {
VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
}
MKLDNNLayer::printGradFormatFlow();
if (cpuOutGrad_) {
VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
}
}
protected:
/**
* load the dims settings of this conv
@@ -162,23 +129,6 @@ protected:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of input value
*/
void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in);
/**
* reset MKLDNNMatrix of weight and bias value
*/
void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias);
/**
* reset MKLDNNMatrix of output value
*/
void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& out);
/**
* reset the backward weight primitive descriptor.
*/
@@ -207,22 +157,6 @@ protected:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of output grad
*/
void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of weight and bias grad
*/
void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias);
/**
* reset MKLDNNMatrix of input grad
*/
void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in);
/**
* reset MKLDNNMatrix of weight value for backward data
* since the primitive_desc would be different from wgtVal_

@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}
@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}
@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdPD(fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
printValueFormatFlow();
}
void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdDataPD(bwdDataPD, in, out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
printGradFormatFlow();
}
void MKLDNNFcLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetWgtBiasValue(wgt, bias);
resetOutValue(out);
}
void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
CHECK(in);
in->downSpatial();
}
void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
auto outPD =
MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
resetOutValue(out, outPD);
format wgtFmt = format::oihw;
if (inVal_->getFormat() == format::nChw8c) {
if (in->getFormat() == format::nChw8c) {
wgtFmt = format::oIhw8i;
} else if (inVal_->getFormat() == format::nChw16c) {
} else if (in->getFormat() == format::nChw16c) {
wgtFmt = format::oIhw16i;
}
wgt = MKLDNNMatrix::create(
weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
auto wgtPD =
MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
resetWithMatrix(wgt, weight_->getW(), wgtPD);
wgt->downSpatial();
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
bias = (biases_ && biases_->getW())
? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
: nullptr;
}
void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
if (!outputIsOnlyMKLDNN()) {
// fc cpu output value do not need create convert, just share data
getOutput(CPU_DEVICE).value->setData(out->getData());
if (biases_ && biases_->getW()) {
auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
resetWithMatrix(bias, biases_->getW(), biasPD);
} else {
bias = nullptr;
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
@@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
} else {
fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
}
pipeline.push_back(*fwd_);
}
@@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetWgtBiasGrad(wgt, bias);
resetInGrad(in);
}
void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
output_.grad->setData(cpuOut->getData());
out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
}
}
CHECK(inVal_ && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
CHECK(wgtVal_);
wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
if (biasVal_) {
resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
} else {
bias = nullptr;
if (biasVal_ == nullptr) {
return;
}
bias =
MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetBwdWgtPD(

@@ -66,8 +66,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
@@ -84,9 +82,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr wgt,
@@ -109,9 +104,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdPD(fwdPD_, in, out);
resetFwdPipeline(pipeline, fwdPD_, in, out);
printValueFormatFlow();
}
void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdPD(pd, in, out);
resetBwdPipeline(pipeline, pd, in, out);
printGradFormatFlow();
}
void MKLDNNPoolLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetOutValue(out);
}
void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
}
void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
CHECK(inVal_) << "Should reset input value first";
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
out = MKLDNNMatrix::create(
output_.value, outDims, inVal_->getFormat(), engine_);
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be empty";
} else {
cpuOut->setData(output_.value->getData());
cpuOutVal_ = out;
}
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
CHECK(in);
auto outPD =
MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
resetOutValue(out, outPD);
}
void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out) {
memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
@@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline(
? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
: std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
pipeline.push_back(*fwd_);
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetInGrad(in);
}
void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
// always share the same grad data of CPU output
// then the activation can get the right grad from output_.grad
output_.grad->setData(cpuOut->getData());
cpuOutGrad_ = MKLDNNMatrix::create(
cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_) << "should not be empty";
} else {
out = cpuOutGrad_;
}
}
}
void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
CHECK(inVal_ && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pd = nullptr;
if (in == nullptr) {
return;
}
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
memory::dims padR = getPaddingR();
CHECK(in);
CHECK(out);
auto bwdDesc = pool_bwd::desc(poolAlgo_,
in->getMemoryDesc(),
@@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
if (pd == nullptr) {
return;
}
bwdData_ =

Some files were not shown because too many files have changed in this diff.
