From c1914543b0eaef98450314a1b56f4f918aa36ce2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 14:34:44 +0800 Subject: [PATCH 1/5] refine mkldnn logic, move reset buffers into MKLDNNLayer --- paddle/gserver/layers/MKLDNNConvLayer.cpp | 233 +++------------- paddle/gserver/layers/MKLDNNConvLayer.h | 66 ----- paddle/gserver/layers/MKLDNNFcLayer.cpp | 101 ++----- paddle/gserver/layers/MKLDNNFcLayer.h | 8 - paddle/gserver/layers/MKLDNNLayer.h | 324 ++++++++++++++++++---- paddle/gserver/layers/MKLDNNPoolLayer.cpp | 103 +------ paddle/gserver/layers/MKLDNNPoolLayer.h | 13 - paddle/math/MKLDNNMatrix.cpp | 2 +- paddle/math/MKLDNNMatrix.h | 14 +- 9 files changed, 358 insertions(+), 506 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 26810a6483..463e6ad0ed 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector& pipeline, resetFwdBuffers(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNConvLayer::resetBwd(std::vector& pipeline, @@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector& pipeline, resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNConvLayer::updateInputData() { - cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { @@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(pd); - resetInValue(pd, in); + resetInValue( + in, std::make_shared(pd->src_primitive_desc())); + + resetOutValue(out, pd->dst_primitive_desc()); - resetWgtBiasValue(pd, wgt, bias); + resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - resetOutValue(pd, out); + bias = nullptr; + if (biases_ == nullptr || biases_->getW() == nullptr) { + return; + } + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); } void MKLDNNConvLayer::resetFwdPipeline( @@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtInVal_) { - pipeline.push_back(*cvtInVal_); - } - if (bias) { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); } else { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); } pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } -} - -void MKLDNNConvLayer::resetInValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& in) { - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc()); - - // create buffer and reorder if input value do not match - cpuInVal_ = nullptr; - cvtInVal_ = nullptr; - - MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast(inMat); - CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr); - if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - if (dnnIn) { - if (dnnIn->getFormat() == format::nc) { - CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format"; - // create a new one with nchw format and same data - memory::dims inDims = memory::dims{bs_, ic_, 1, 1}; - dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - } - if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - 
cpuInVal_ = dnnIn; - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) { - // create new mkldnn matrix - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - in = cpuInVal_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasValue( - std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc()); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc()) - : nullptr; -} - -void MKLDNNConvLayer::resetOutValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc()); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) { - out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be empty"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - // when output is cpu device, change the mkldnn output value and make them - // share the same data. Then if next layer use inputlayer->getOuputValue() - // to achieve the input value, it will get the right data. 
- output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(out); } void MKLDNNConvLayer::resetBwdWgtPD( @@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD( loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); // create backward weight using input, output and weight value memory desc - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; CHECK(wgtVal_) << "Should have weight value"; algorithm algo = algorithm::convolution_direct; padding_kind padKind = padding_kind::zero; @@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD( memory::dims wgtDims, biasDims, strides, dilations, padL, padR; loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; // create backward data using input and output value memory desc // but using weight memory desc with any format auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, @@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(wgtPD); - resetOutGrad(wgtPD, out); + resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); - resetWgtBiasGrad(wgtPD, wgt, bias); + resetWithMatrix( + wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); + CHECK(wgtVal_ != nullptr && + wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) + << "primitive desc of weight grad and value should be equal"; - resetInGrad(dataPD, in); + bias = nullptr; + if (biases_ && biases_->getWGrad()) { + resetWithMatrix( + bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); + CHECK(bias && biasVal_ && + bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) + << "primitive desc of bias grad should equal the bias value"; + } + if (dataPD == nullptr) { + return; + } + resetInGrad(in, dataPD->diff_src_primitive_desc()); resetWgtValBwdData(dataPD, wgtValBwdData_); } @@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); - } - + CHECK(inVal_); // add bwdWgt handle if (bias) { bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias)); @@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline( if (dataPD == nullptr) { return; } - if (cvtWgtVal_) { pipeline.push_back(*cvtWgtVal_); } - // add bwdData handle CHECK(wgtValBwdData_) << "Should have weight memory"; bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); pipeline.push_back(*bwdData_); - - if (cvtInGrad_) { - pipeline.push_back(*cvtInGrad_); - } -} - -void MKLDNNConvLayer::resetOutGrad( - std::shared_ptr& wgtPD, MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_ != nullptr && - outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc()) - << "primitive desc of out grad and value should be equal"; - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); 
- // same PrimitiveDesc with cpuInVal_ - CHECK(cpuOutVal_); - cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc()); - // create reorder if primitive desc does not match - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_); - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasGrad( - std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getWGrad(), - wgtPD->diff_weights_primitive_desc()); - CHECK(nullptr != wgtVal_ && - wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) - << "primitive desc of weight grad and value should be equal"; - VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat(); - - bias = nullptr; - if (biasVal_ == nullptr) { - return; - } - bias = MKLDNNMatrix::create(biases_->getWGrad(), - wgtPD->diff_bias_primitive_desc()); - CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) - << "primitive desc of bias grad should equal the bias value"; -} - -void MKLDNNConvLayer::resetInGrad( - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in) { - in = nullptr; - cpuInGrad_ = nullptr; - cvtInGrad_ = nullptr; - if (dataPD == nullptr) { - return; - } - - if (inputIsOnlyMKLDNN()) { - MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc()); - CHECK(nullptr != inVal_ && - in->getPrimitiveDesc() == inVal_->getPrimitiveDesc()) - << "primitive desc of input grad and value should be equal"; - } else { - const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE); - // same PrimitiveDesc with cpuInVal_ - CHECK(cpuInVal_); - cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc()); - in = cpuInGrad_; - // create reorder if PrimitiveDesc does not match - if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) { - in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE), - dataPD->diff_src_primitive_desc()); - cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_); - CHECK(cvtInGrad_); - } - } } void MKLDNNConvLayer::resetWgtValBwdData( diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h index f84f2f737c..1fed0e1c65 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.h +++ b/paddle/gserver/layers/MKLDNNConvLayer.h @@ -48,17 +48,6 @@ protected: // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuInVal_; - MKLDNNMatrixPtr cpuInGrad_; - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtInVal_; - std::shared_ptr cvtInGrad_; - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // whether the weight has been init bool hasInitedWgt_; @@ -94,8 +83,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -109,26 +96,6 @@ public: << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_; } - void printValueFormatFlow() override { - if (cpuInVal_) { - VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>"; - } - MKLDNNLayer::printValueFormatFlow(); - if (cpuOutVal_) { - VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat(); - } - } - - void printGradFormatFlow() override { - if 
(cpuInGrad_) { - VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<"; - } - MKLDNNLayer::printGradFormatFlow(); - if (cpuOutGrad_) { - VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat(); - } - } - protected: /** * load the dims settings of this conv @@ -162,23 +129,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of input value - */ - void resetInValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& in); - /** - * reset MKLDNNMatrix of weight and bias value - */ - void resetWgtBiasValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of output value - */ - void resetOutValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& out); - /** * reset the backward weight primitive descriptor. */ @@ -207,22 +157,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of output grad - */ - void resetOutGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of weight and bias grad - */ - void resetWgtBiasGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of input grad - */ - void resetInGrad(std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in); /** * reset MKLDNNMatrix of weight value for backward data * since the primitive_desc would be different with wgtVal_ diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index cf19a15568..9f82a3b747 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo; wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); hasInitedWgt_ = true; } @@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto dstFmt = hasNoSpatial_ ? 
format::io : format::ihwo; wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); } @@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNFcLayer::resetBwd(std::vector& pipeline, @@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector& pipeline, resetBwdDataPD(bwdDataPD, in, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNFcLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { @@ -139,51 +131,33 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { resetInValue(in); + CHECK(in); + in->downSpatial(); - resetWgtBiasValue(wgt, bias); - - resetOutValue(out); -} + // if (extInVal_) { + // extInVal_->downSpatial(); + // } -void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } - in->downSpatial(); -} + auto outPD = + MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); + resetOutValue(out, outPD); -void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { format wgtFmt = format::oihw; - if (inVal_->getFormat() == format::nChw8c) { + if (in->getFormat() == format::nChw8c) { wgtFmt = format::oIhw8i; - } else if (inVal_->getFormat() == format::nChw16c) { + } else if (in->getFormat() == format::nChw16c) { wgtFmt = format::oIhw16i; } - wgt = MKLDNNMatrix::create( - weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_); + auto wgtPD = + MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_); + resetWithMatrix(wgt, weight_->getW(), wgtPD); wgt->downSpatial(); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? 
MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_) - : nullptr; -} -void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_); - if (!outputIsOnlyMKLDNN()) { - // fc cpu output value do not need create convert, just share data - getOutput(CPU_DEVICE).value->setData(out->getData()); + if (biases_ == nullptr || biases_->getW() == nullptr) { + return; } - output_.value = std::dynamic_pointer_cast(out); + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); } void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, @@ -219,7 +193,6 @@ void MKLDNNFcLayer::resetFwdPipeline( } else { fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out)); } - pipeline.push_back(*fwd_); } @@ -227,44 +200,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetWgtBiasGrad(wgt, bias); - - resetInGrad(in); -} - -void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - output_.grad->setData(cpuOut->getData()); - out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc()); - } -} + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); -void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { CHECK(wgtVal_); - wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); + resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); bias = nullptr; if (biasVal_ == nullptr) { return; } - bias = - MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc()); -} - -void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); } void MKLDNNFcLayer::resetBwdWgtPD( diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h index c76878aafa..ee861763ff 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.h +++ b/paddle/gserver/layers/MKLDNNFcLayer.h @@ -66,8 +66,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -84,9 +82,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr wgt, @@ -109,9 +104,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdWgtPD(std::shared_ptr& pd, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 4e2753eba2..ab59357ad0 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ 
b/paddle/gserver/layers/MKLDNNLayer.h
@@ -58,11 +58,30 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  // MKLDNNMatrixPtr with internal format
+  /// value and grad are separate as internal and external buffers.
+  /// each MKLDNNLayer must init or reset internal buffer at least,
+  /// and the external buffer format is always nchw or nc (when h==w==1),
+  /// which is the same format as paddle.
+  /// When mixed with cpu device, the output_.value and output_.grad
+  /// always save the external data.
+  /// When all layers are mkldnn layers, they could be internal data.
+  /// below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  MKLDNNMatrixPtr extInVal_;
+  MKLDNNMatrixPtr extInGrad_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::shared_ptr<mkldnn::reorder> cvtInVal_;
+  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
   MKLDNNMatrixPtr wgtVal_;
   MKLDNNMatrixPtr wgtGrad_;
   MKLDNNMatrixPtr biasVal_;
@@ -91,6 +110,7 @@ public:
         oh_(0),
         ow_(0),
         needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
         stream_(nullptr),
         fwd_(nullptr),
@@ -128,20 +148,39 @@ public:
       REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
       CHECK(!inputLayers_.empty());
       copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
+      size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
       if (inputElemenCnt_ != elemenCnt) {
         VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
         // reset when input total sizes changed, not only the batchsize
         inputElemenCnt_ = elemenCnt;
         pipelineFwd_.clear();
         reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+        // all cpu device output grad or value share output's
+        shareCPUDevice();
         resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+        // MKLDNNLayer output value should be MKLDNNMatrix,
+        // so external output value is necessary.
+        // then external input value is not necessary,
+        // since input may be mkldnn internal buffer.
+        CHECK(extOutVal_) << "external output value is necessary";
+        output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+        CHECK(inVal_ && outVal_) << "internal memories are necessary";
+        if (cvtInVal_) {
+          pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
+        }
+        if (cvtOutVal_) {
+          pipelineFwd_.push_back(*cvtOutVal_);
+        }
         convertWeightsFromPaddle();
+        printValueFormat();
         needResetBwd_ = true;
       }
 
       if (inputLayers_[0]->getType() == "data") {
-        updateInputData();
+        // Update input value data when input layer is "data" type,
+        // since the input value data address might be changed.
+        CHECK(extInVal_);
+        extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
       }
 
       if (!outputOnlyMKLDNN_) {
@@ -149,8 +188,7 @@
       }
       stream_->submit(pipelineFwd_);
     }
-
-    /* activation */ {
+    {
       REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
       forwardActivation();
     }
@@ -163,6 +201,16 @@
       pipelineMergeGrad_.clear();
       mergeGrad_ = nullptr;
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+      // external output grad is not necessary,
+      // since output may be mkldnn internal buffer or merged directly.
+      CHECK(outGrad_) << "internal output grad is necessary";
+      if (cvtOutGrad_) {
+        pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
+      }
+      if (cvtInGrad_) {
+        pipelineBwd_.push_back(*cvtInGrad_);
+      }
+      printGradFormat();
       needResetBwd_ = false;
     }
@@ -179,7 +227,6 @@
       REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
       stream_->submit(pipelineBwd_);
     }
-
     {
       REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
       updateWeights(callback);
@@ -195,7 +242,7 @@
               int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
 
   /**
-   * reset the mkldnn forward primitve and memory
+   * reset the mkldnn forward primitive and memories
    * only would be called when input size changes
    */
   virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
-   * reset the mkldnn backward primitve and memory for mkldnn fc
+   * reset the mkldnn backward primitive and memories
    * only would be called when needed
    */
   virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
-  /**
-   * Update input value data when input layer is "data" type.
-   * Since the input value data address might be changed.
-   */
-  virtual void updateInputData() {}
-
   /**
    * Update weights and biases if necessary.
    */
@@ -272,21 +313,167 @@
   }
 
   /**
-   * reset the output grad matrix from primitive desc.
-   * and reset the merge grad primitive if needed.
-   * note: when this layer has serval outputs,
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * reset nullptr if matrix or primitive desc is empty
+   */
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd) {
+    dnn = nullptr;
+    if (mat == nullptr) {
+      return;
+    }
+    dnn = MKLDNNMatrix::create(mat, pd);
+  }
+
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr) {
+    cvtInVal_ = nullptr;
+    extInVal_ = nullptr;
+    in = nullptr;
+    CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+    auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+        {bs_, ic_, ih_, iw_}, mkldnn::memory::format::nchw, engine_);
+    const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+    CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
+    if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) {
+      in = MKLDNNMatrix::create(inMat, extPD);
+    }
+    extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
+    if (in->getFormat() == mkldnn::memory::format::nc) {
+      CHECK(ih_ == 1 && iw_ == 1);
+    }
+    if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+      return;
+    }
+    // need create reorder
+    in = MKLDNNMatrix::create(nullptr, *intPD);
+    extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(inMat, extPD);
+    cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+    CHECK(cvtInVal_) << "should not be empty";
+  }
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD) {
+    cvtOutVal_ = nullptr;
+    out = MKLDNNMatrix::create(output_.value, intPD);
+    extOutVal_ = out;
+    if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+    extOutVal_ = MKLDNNMatrix::create(output_.value,
+                                      {bs_, oc_, oh_, ow_},
+                                      mkldnn::memory::format::nchw,
+                                      engine_);
+    out = MKLDNNMatrix::create(nullptr, intPD);
+    cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+    CHECK(cvtOutVal_) << "should not be empty";
+  }
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD) {
+    cvtInGrad_ = nullptr;
+    extInGrad_ = nullptr;
+    in = nullptr;
+    LayerPtr& input = inputLayers_[0];
+    if (input->getOutputGrad() == nullptr) {
+      // no need input grad
+      return;
+    }
+    CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+        << "only support input is MKLDNN layer or only have one output layer";
+    // when input is a mkldnn branch node,
+    // this layer will save input grad to an internal buffer,
+    // and the mkldnn input layer will merge them to actual prev->output_.grad
+    const MatrixPtr& inMat =
+        input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+    in = MKLDNNMatrix::create(inMat, intPD);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+        << "should have internal input value and primitive desc must equal";
+    if (inputIsOnlyMKLDNN()) {
+      return;
+    }
+
+    extInGrad_ = in;
+    if (isPaddleFormat(extInGrad_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+        << "should have external input value and the format must be nchw(nc)";
+    extInGrad_ = MKLDNNMatrix::create(inMat, extInVal_->getPrimitiveDesc());
+    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+        << "should have internal input value and primitive desc must equal";
+    in = MKLDNNMatrix::create(nullptr, intPD);
+    cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+    CHECK(cvtInGrad_);
+  }
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
    * it could not be mixed with cpu device,
    * since it can not get memory desc from cpu device.
    */
-  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
-                            mkldnn::memory::primitive_desc pd) {
-    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+  void resetOutGrad(MKLDNNMatrixPtr& out,
+                    mkldnn::memory::primitive_desc intPD) {
+    cvtOutGrad_ = nullptr;
+    extOutGrad_ = nullptr;
+    out = nullptr;
+    MatrixPtr& outMat = output_.grad;
+    out = MKLDNNMatrix::create(outMat, intPD);
+    resetMergeGrad(out);
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+    extOutGrad_ = out;
+    if (isPaddleFormat(extOutGrad_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+        << "should have external output value and the format must be nchw(nc)";
+    extOutGrad_ = MKLDNNMatrix::create(outMat, extOutVal_->getPrimitiveDesc());
+    CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+        << "should have internal output value and primitive desc must equal";
+    out = MKLDNNMatrix::create(nullptr, intPD);
+    cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+    CHECK(cvtOutGrad_);
+  }
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: do not support grads mixed with cpu device,
+   * since it can not get memory desc from cpu device.
+   */
+  virtual void resetMergeGrad(MKLDNNMatrixPtr& out) {
     mergeGrad_ = nullptr;
     pipelineMergeGrad_.clear();
-    out = MKLDNNMatrix::create(output_.grad, pd);
-    if (outputMap_.size() <= 1) {
+    if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+      // do not merge when output is not all MKLDNN or only one output
       return;
     }
+    CHECK(out) << "should have reset internal output grad";
    std::vector<float> scales(outputMap_.size(), 1.0);
    std::vector<mkldnn::memory::primitive_desc> srcPDs;
    std::vector<mkldnn::primitive::at> srcs;
@@ -309,15 +496,13 @@
    for (size_t i = 1; i < srcPDs.size(); ++i) {
      CHECK(srcPDs[0] == srcPDs[i]);
    }
-    tmpOutGrad_ = nullptr;
+    tmpOutGrad_ = out;
    tmpCvt_ = nullptr;
    if (out->getPrimitiveDesc() != srcPDs[0]) {
      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
      CHECK(tmpCvt_);
      pipelineMergeGrad_.push_back(*tmpCvt_);
-    } else {
-      tmpOutGrad_ = out;
    }
 
    auto sumPD = mkldnn::sum::primitive_desc(
        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
  }
 
-  /**
-   * reset input grad from primitive desc.
-   * this function is avaiable for input is only mkldnn
-   * or input do not care cpu device
-   */
-  virtual void resetInGrad(MKLDNNMatrixPtr& in,
-                           mkldnn::memory::primitive_desc pd) {
-    LayerPtr& input = inputLayers_[0];
-    const MatrixPtr& grad =
-        input->getOutputMapSize() > 1 ?
nullptr : input->getOutput().grad; - in = MKLDNNMatrix::create(grad, pd); - Argument& arg = input->getOutput(this->getName()); - arg.grad = std::dynamic_pointer_cast(in); - } - /** * print info about sizes */ @@ -351,22 +521,50 @@ protected: } /** - * Print the mkldnn memory format flow of value + * print the mkldnn memory format of value */ - virtual void printValueFormatFlow() { - if (inVal_ && outVal_) { - VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> " - << outVal_->getFormat(); + virtual void printValueFormat() { + if (extInVal_) { + VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> "; + } + if (inVal_) { + VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>"; + } + if (outVal_) { + VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "; + } + if (extOutVal_) { + VLOG(MKLDNN_FMTS) << extOutVal_->getFormat(); + } + if (wgtVal_) { + VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); + } + if (biasVal_) { + VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat(); } } /** - * Print the mkldnn memory format flow of grad + * print the mkldnn memory format of grad */ - virtual void printGradFormatFlow() { - if (inGrad_ && outGrad_) { - VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< " - << outGrad_->getFormat(); + virtual void printGradFormat() { + if (extInGrad_) { + VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; + } + if (inGrad_) { + VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + } + if (outGrad_) { + VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; + } + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + } + if (wgtGrad_) { + VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); + } + if (biasGrad_) { + VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat(); } } @@ -405,6 +603,19 @@ protected: void setDevice(int id) { deviceId_ = id; } private: + /** + * check the format is nchw or nc, + * which is supported by Paddle default memory layout + */ + bool isPaddleFormat(mkldnn::memory::format fmt) { + if (fmt == mkldnn::memory::format::nchw || + fmt == mkldnn::memory::format::nc) { + return true; + } else { + return false; + } + } + /** * clear all grad */ @@ -449,6 +660,19 @@ private: } } + /** + * if have cpu device, share value and grad data with output_ + */ + void shareCPUDevice() { + if (outputIsOnlyMKLDNN()) { + return; + } + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].value = output_.value; + outputOtherDevice_[i].grad = output_.grad; + } + } + /** * Check the cpu device number of outputOtherDevice_. * should have only one at most. 
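A minimal sketch (not part of the patch) of how a derived layer drives the reset helpers that this commit hoists into MKLDNNLayer, mirroring the MKLDNNConvLayer::resetFwdBuffers hunk above. The layer name MKLDNNSomeLayer and its members (fwdPD_, weight_, biases_) are hypothetical stand-ins for whatever the concrete layer defines:

    // The derived layer only chooses the internal primitive descriptors.
    // resetInValue/resetOutValue allocate the external nchw/nc buffers and
    // the cvtInVal_/cvtOutVal_ reorders when the formats differ; forward()
    // then submits the pipeline as: [cvtInVal_] -> fwd_ -> [cvtOutVal_].
    void MKLDNNSomeLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                          MKLDNNMatrixPtr& wgt,
                                          MKLDNNMatrixPtr& bias,
                                          MKLDNNMatrixPtr& out) {
      CHECK(fwdPD_);  // forward primitive desc, prepared beforehand
      resetInValue(in,
                   std::make_shared<mkldnn::memory::primitive_desc>(
                       fwdPD_->src_primitive_desc()));
      resetOutValue(out, fwdPD_->dst_primitive_desc());
      // weights and bias stay internal: wrap the Paddle parameter matrices
      resetWithMatrix(wgt, weight_->getW(), fwdPD_->weights_primitive_desc());
      bias = nullptr;
      if (biases_ && biases_->getW()) {
        resetWithMatrix(bias, biases_->getW(), fwdPD_->bias_primitive_desc());
      }
    }

The design choice visible in the hunks above: the cpu-side buffers and converters used to be duplicated per layer (cpuInVal_, cvtInVal_, and so on in the conv and pool layers); after this commit a derived layer never touches them directly.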
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp index 0e53e2d1b7..6e89260f49 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, out); resetFwdPipeline(pipeline, fwdPD_, in, out); - - printValueFormatFlow(); } void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, @@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, resetBwdPD(pd, in, out); resetBwdPipeline(pipeline, pd, in, out); - - printGradFormatFlow(); -} - -void MKLDNNPoolLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { resetInValue(in); - resetOutValue(out); -} - -void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } -} - -void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) { - CHECK(inVal_) << "Should reset input value first"; memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - out = MKLDNNMatrix::create( - output_.value, outDims, inVal_->getFormat(), engine_); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be emptry"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(outVal_); + CHECK(in); + auto outPD = + MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); + resetOutValue(out, outPD); } void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out) { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; @@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline( ? 
std::make_shared(pool_fwd(*pd, *in, *out, *workspace_)) : std::make_shared(pool_fwd(*pd, *in, *out)); pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } } void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetInGrad(in); -} -void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); - cpuOutGrad_ = MKLDNNMatrix::create( - cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_); - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_) << "should not be emptry"; - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); } void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; memory::dims padR = getPaddingR(); - CHECK(in); CHECK(out); auto bwdDesc = pool_bwd::desc(poolAlgo_, in->getMemoryDesc(), @@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline( std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); + if (pd == nullptr) { + return; } bwdData_ = diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h index 891e15a7ef..c5ec87828b 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.h +++ b/paddle/gserver/layers/MKLDNNPoolLayer.h @@ -38,13 +38,6 @@ protected: // pooling_avg or pooling_max mkldnn::algorithm poolAlgo_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ @@ -74,8 +67,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void printSizeInfo() override { MKLDNNLayer::printSizeInfo(); VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_ @@ -90,8 +81,6 @@ protected: * reset pipeline. */ void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out); @@ -106,8 +95,6 @@ protected: * reset pipeline. 
*/ void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 0778bb63b7..c606560473 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -46,7 +46,7 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::format fmt, engine& eg, mkldnn::memory::data_type dtype) { - return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg)); + return create(m, createPrimitiveDesc(dims, fmt, eg, dtype)); } std::shared_ptr MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src, diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index c843115eb9..9e3f29eb57 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -52,12 +52,24 @@ public: mkldnn::engine& eg, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + /** + * Create primitive descriptor. + * default with f32 dtype + */ + static mkldnn::memory::primitive_desc createPrimitiveDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt, + const mkldnn::engine& eg, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); + } + /** * Create Memory descriptor. * default with any format and f32 dtype */ static mkldnn::memory::desc createMemoryDesc( - const mkldnn::memory::dims& dims, + const mkldnn::memory::dims dims, const mkldnn::memory::format& fmt = mkldnn::memory::format::any, const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { return mkldnn::memory::desc(dims, dtype, fmt); From 9e38dafa29acc59347a5aee33424be7bb8bcd168 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 15:18:34 +0800 Subject: [PATCH 2/5] change MKLDNNMatrix create interface since MatrixPtr is not always required --- .../gserver/activations/MKLDNNActivation.cpp | 6 ++-- paddle/gserver/layers/MKLDNNConvLayer.cpp | 3 +- paddle/gserver/layers/MKLDNNLayer.h | 32 +++++++++---------- paddle/math/MKLDNNMatrix.cpp | 8 ++--- paddle/math/MKLDNNMatrix.h | 5 +-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp index 18c5638100..f3ccd68160 100644 --- a/paddle/gserver/activations/MKLDNNActivation.cpp +++ b/paddle/gserver/activations/MKLDNNActivation.cpp @@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) { copyInVal_ = nullptr; if (act.grad && algo == algorithm::eltwise_tanh) { // tanh need save src input for backward - inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc()); + inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc()); copyInVal_ = std::make_shared(*val_, *inVal_); CHECK(copyInVal_) << "should not be emptry"; pipelineFwd_.push_back(*copyInVal_); @@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) { algorithm algo = getAlgo(this->getName()); float alpha = getBwdAlpha(); float beta = getBeta(); - grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc()); + grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad); auto eng = CPUEngine::Instance().getEngine(); auto bwdDesc = eltwise_bwd::desc( algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta); @@ -230,7 +230,7 @@ void 
MKLDNNActivation::resetFwd(Argument& act) { int ic = cnt_ / bs / ih / iw; CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw); val_ = MKLDNNMatrix::create( - act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_); + {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value); CHECK(val_); val_->downSpatial(); } diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 463e6ad0ed..3fbfb1ab1f 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -370,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData( // since the primitive_desc would be different with wgtVal_ CHECK(wgtVal_) << "should have weight value"; if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) { - wgtValBwdData_ = - MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc()); + wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc()); cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_); CHECK(cvtWgtVal_); } else { diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index ab59357ad0..80c67529da 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -323,7 +323,7 @@ protected: if (mat == nullptr) { return; } - dnn = MKLDNNMatrix::create(mat, pd); + dnn = MKLDNNMatrix::create(pd, mat); } /** @@ -343,7 +343,7 @@ protected: in = std::dynamic_pointer_cast(inMat); CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) { - in = MKLDNNMatrix::create(inMat, extPD); + in = MKLDNNMatrix::create(extPD, inMat); } extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr; if (in->getFormat() == mkldnn::memory::format::nc) { @@ -353,8 +353,8 @@ protected: return; } // need create reorder - in = MKLDNNMatrix::create(nullptr, *intPD); - extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(inMat, extPD); + in = MKLDNNMatrix::create(*intPD); + extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat); cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); CHECK(cvtInVal_) << "should not be emptry"; } @@ -366,18 +366,18 @@ protected: void resetOutValue(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD) { cvtOutVal_ = nullptr; - out = MKLDNNMatrix::create(output_.value, intPD); + out = MKLDNNMatrix::create(intPD, output_.value); extOutVal_ = out; if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) { return; } // need create reorder CHECK_GT(bs_ * oc_ * oh_ * ow_, 0); - extOutVal_ = MKLDNNMatrix::create(output_.value, - {bs_, oc_, oh_, ow_}, + extOutVal_ = MKLDNNMatrix::create(mkldnn::memory::dims{bs_, oc_, oh_, ow_}, mkldnn::memory::format::nchw, - engine_); - out = MKLDNNMatrix::create(nullptr, intPD); + engine_, + output_.value); + out = MKLDNNMatrix::create(intPD); cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_); CHECK(cvtOutVal_) << "should not be empty"; } @@ -402,7 +402,7 @@ protected: // and the mkldnn input layer will merge them to actual prev->output_.grad const MatrixPtr& inMat = input->getOutputMapSize() <= 1 ? 
input->getOutputGrad() : nullptr; - in = MKLDNNMatrix::create(inMat, intPD); + in = MKLDNNMatrix::create(intPD, inMat); Argument& arg = input->getOutput(this->getName()); arg.grad = std::dynamic_pointer_cast(in); CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) @@ -418,10 +418,10 @@ protected: // need create reorder CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) << "should have external input value and the format must be nchw(nc)"; - extInGrad_ = MKLDNNMatrix::create(inMat, extInVal_->getPrimitiveDesc()); + extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) << "should have internal input value and primitive desc must equal"; - in = MKLDNNMatrix::create(nullptr, intPD); + in = MKLDNNMatrix::create(intPD); cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_); CHECK(cvtInGrad_); } @@ -440,7 +440,7 @@ protected: extOutGrad_ = nullptr; out = nullptr; MatrixPtr& outMat = output_.grad; - out = MKLDNNMatrix::create(outMat, intPD); + out = MKLDNNMatrix::create(intPD, outMat); resetMergeGrad(out); if (outputIsOnlyMKLDNN()) { return; @@ -453,10 +453,10 @@ protected: // need create reorder CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat())) << "should have external output value and the format must be nchw(nc)"; - extOutGrad_ = MKLDNNMatrix::create(outMat, extOutVal_->getPrimitiveDesc()); + extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat); CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD) << "should have internal output value and primitive desc must equal"; - out = MKLDNNMatrix::create(nullptr, intPD); + out = MKLDNNMatrix::create(intPD); cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out); CHECK(cvtOutGrad_); } @@ -499,7 +499,7 @@ protected: tmpOutGrad_ = out; tmpCvt_ = nullptr; if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]); + tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); CHECK(tmpCvt_); pipelineMergeGrad_.push_back(*tmpCvt_); diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index c606560473..21a8f73c3e 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -18,7 +18,7 @@ using namespace mkldnn; // NOLINT namespace paddle { -MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { +MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) { memory::desc md = pd.desc(); size_t ndims = md.data.ndims; int* dims = md.data.dims; @@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { return std::make_shared(cpuMatrix, pd); } -MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, - memory::dims dims, +MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims, memory::format fmt, engine& eg, + MatrixPtr m, mkldnn::memory::data_type dtype) { - return create(m, createPrimitiveDesc(dims, fmt, eg, dtype)); + return create(createPrimitiveDesc(dims, fmt, eg, dtype), m); } std::shared_ptr MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src, diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index 9e3f29eb57..fe755d096d 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -40,16 +40,17 @@ public: /** * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc */ - static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc 
pd); + static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, + MatrixPtr m = nullptr); /** * Create MKLDNNMatrix from a MatrixPtr and memory details info */ static MKLDNNMatrixPtr create( - MatrixPtr m, mkldnn::memory::dims dims, mkldnn::memory::format fmt, mkldnn::engine& eg, + MatrixPtr m = nullptr, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); /** From 94e442d4b14c66ba68d8e64c0f51f5bc849437dd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 16:32:11 +0800 Subject: [PATCH 3/5] add cpp file of MKLDNNLayer --- paddle/gserver/layers/MKLDNNLayer.cpp | 327 ++++++++++++++++++++++ paddle/gserver/layers/MKLDNNLayer.h | 386 ++++---------------------- 2 files changed, 379 insertions(+), 334 deletions(-) create mode 100644 paddle/gserver/layers/MKLDNNLayer.cpp diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp new file mode 100644 index 0000000000..91f0ff5bd3 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -0,0 +1,327 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLayer.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +bool MKLDNNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." + << "Please set WITH_MKLDNN=ON " + << "and set use_mkldnn=True"; + CHECK(!useGpu_) << "Do not support GPU yet"; + + // set device id before Layer::init + setDevice(MKLDNN_DEVICE); + // change param device to MKLDNN device + setParamsDevice(MKLDNN_DEVICE, parameterMap); + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + setOutputMap(); + checkCPUOutputsNumber(); + + stream_.reset(new MKLDNNStream()); + engine_ = CPUEngine::Instance().getEngine(); + return true; +} + +void MKLDNNLayer::forward(PassType passType) { + passType_ = passType; + + { + REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); + CHECK(!inputLayers_.empty()); + copySeqInfoToOutputs(); + size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt(); + if (inputElemenCnt_ != elemenCnt) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; + // reset when input total sizes changed, not only the batchsize + inputElemenCnt_ = elemenCnt; + pipelineFwd_.clear(); + reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); + // all cpu device output grad or value share output's + shareCPUDevice(); + resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); + // MKLDNNLayer output value should be MKLDNNMatrix + // so external output value is necessary. + // then external input value is not necessary, + // since input may be mkldnn internal buffer. 
+ CHECK(extOutVal_) << "external output value is necessary"; + output_.value = std::dynamic_pointer_cast(extOutVal_); + CHECK(inVal_ && outVal_) << "internal memories are necessary"; + if (cvtInVal_) { + pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_); + } + if (cvtOutVal_) { + pipelineFwd_.push_back(*cvtOutVal_); + } + convertWeightsFromPaddle(); + printSizeInfo(); + printValueFormat(); + needResetBwd_ = true; + } + + if (inputLayers_[0]->getType() == "data") { + // Update input value data when input layer is "data" type, + // since the input value data address might be changed. + CHECK(extInVal_); + extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); + } + + if (!outputOnlyMKLDNN_) { + clearGrads(); + } + stream_->submit(pipelineFwd_); + } + { + REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); + forwardActivation(); + } +} + +void MKLDNNLayer::backward(const UpdateCallback& callback) { + if (needResetBwd_) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; + pipelineBwd_.clear(); + pipelineMergeGrad_.clear(); + mergeGrad_ = nullptr; + resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); + // external output grad is not necessary + // since output may be mkldnn internal buffer or merge them directly. + CHECK(outGrad_) << "internal output grad is necessary"; + if (cvtOutGrad_) { + pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); + } + if (cvtInGrad_) { + pipelineBwd_.push_back(*cvtInGrad_); + } + printGradFormat(); + needResetBwd_ = false; + } + + // merge grad must before backward activation + if (mergeGrad_) { + REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); + stream_->submit(pipelineMergeGrad_); + } + { + REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); + backwardActivation(); + } + { + REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); + stream_->submit(pipelineBwd_); + } + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + updateWeights(callback); + } +} + +void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { + const Argument& input = inputLayers_[0]->getOutput(); + batchsize = input.getBatchSize(); + int h = input.getFrameHeight(); + int w = input.getFrameWidth(); + if (h != 0) { + height = h; + } + if (w != 0) { + width = w; + } +} + +void MKLDNNLayer::reshapeOutput(size_t height, size_t width) { + output_.setFrameHeight(height); + output_.setFrameWidth(width); + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].setFrameHeight(height); + outputOtherDevice_[i].setFrameWidth(width); + } +} + +void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn, + const MatrixPtr& mat, + memory::primitive_desc pd) { + dnn = nullptr; + if (mat == nullptr) { + return; + } + dnn = MKLDNNMatrix::create(pd, mat); +} + +void MKLDNNLayer::resetInValue( + MKLDNNMatrixPtr& in, const std::shared_ptr& intPD) { + cvtInVal_ = nullptr; + extInVal_ = nullptr; + in = nullptr; + CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); + auto extPD = MKLDNNMatrix::createPrimitiveDesc( + {bs_, ic_, ih_, iw_}, format::nchw, engine_); + const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); + in = std::dynamic_pointer_cast(inMat); + CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); + if (in == nullptr || in->getFormat() == format::nc) { + in = MKLDNNMatrix::create(extPD, inMat); + } + extInVal_ = isPaddleFormat(in->getFormat()) ? 
in : nullptr;
+  if (in->getFormat() == format::nc) {
+    CHECK(ih_ == 1 && iw_ == 1);
+  }
+  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+    return;
+  }
+  // need create reorder
+  in = MKLDNNMatrix::create(*intPD);
+  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
+  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+  CHECK(cvtInVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
+                                memory::primitive_desc intPD) {
+  cvtOutVal_ = nullptr;
+  out = MKLDNNMatrix::create(intPD, output_.value);
+  extOutVal_ = out;
+  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+  extOutVal_ = MKLDNNMatrix::create(
+      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+  CHECK(cvtOutVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
+                              memory::primitive_desc intPD) {
+  cvtInGrad_ = nullptr;
+  extInGrad_ = nullptr;
+  in = nullptr;
+  LayerPtr& input = inputLayers_[0];
+  if (input->getOutputGrad() == nullptr) {
+    // no need for input grad
+    return;
+  }
+  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+      << "only support input is MKLDNN layer or only have one output layer";
+  // when input is a mkldnn branch node,
+  // this layer will save input grad to an internal buffer,
+  // and the mkldnn input layer will merge them to actual prev->output_.grad
+  const MatrixPtr& inMat =
+      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+  in = MKLDNNMatrix::create(intPD, inMat);
+  Argument& arg = input->getOutput(this->getName());
+  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  extInGrad_ = in;
+  if (isPaddleFormat(extInGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+      << "should have external input value and the format must be nchw(nc)";
+  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  in = MKLDNNMatrix::create(intPD);
+  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+  CHECK(cvtInGrad_);
+}
+
+void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
+                               memory::primitive_desc intPD) {
+  cvtOutGrad_ = nullptr;
+  extOutGrad_ = nullptr;
+  out = nullptr;
+  MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(intPD, outMat);
+  resetMergeGrad(out);
+  if (outputIsOnlyMKLDNN()) {
+    return;
+  }
+  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+  extOutGrad_ = out;
+  if (isPaddleFormat(extOutGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+      << "should have external output value and the format must be nchw(nc)";
+  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
+  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+      << "should have internal output value and primitive desc must equal";
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+  CHECK(cvtOutGrad_);
+}
+
+void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
+  mergeGrad_ = nullptr;
+  pipelineMergeGrad_.clear();
+  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+    // do not merge when output is not all MKLDNN or only one output
+    return;
+  }
+  CHECK(out) << "should have reset internal output grad";
+  std::vector<float> scales(outputMap_.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  std::vector<primitive::at> srcs;
+  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+    MKLDNNMatrixPtr src =
+        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
+    CHECK(src) << "should be MKLDNNMatrix";
+    auto srcDims = src->getDims();
+    auto dstDims = out->getDims();
+    CHECK_EQ(srcDims.size(), dstDims.size());
+    for (size_t i = 0; i < srcDims.size(); ++i) {
+      CHECK_EQ(srcDims[i], dstDims[i]);
+    }
+    srcPDs.push_back(src->getPrimitiveDesc());
+    srcs.push_back(*src);
+  }
+
+  // TODO(TJ): remove me when mkldnn sum supports different formats
+  for (size_t i = 1; i < srcPDs.size(); ++i) {
+    CHECK(srcPDs[0] == srcPDs[i]);
+  }
+  tmpOutGrad_ = out;
+  tmpCvt_ = nullptr;
+  if (out->getPrimitiveDesc() != srcPDs[0]) {
+    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
+    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+    CHECK(tmpCvt_);
+    pipelineMergeGrad_.push_back(*tmpCvt_);
+  }
+
+  auto sumPD =
+      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 80c67529da..faad434526 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -119,119 +119,9 @@ public:
 
   ~MKLDNNLayer() {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
-    CHECK(!useGpu_) << "Do not support GPU yet";
-
-    // set device id before Layer::init
-    setDevice(MKLDNN_DEVICE);
-    // change param device to MKLDNN device
-    setParamsDevice(MKLDNN_DEVICE, parameterMap);
-    if (!Layer::init(layerMap, parameterMap)) {
-      return false;
-    }
-    setOutputMap();
-    checkCPUOutputsNumber();
-
-    stream_.reset(new MKLDNNStream());
-    engine_ = CPUEngine::Instance().getEngine();
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    passType_ = passType;
-
-    {
-      REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-      CHECK(!inputLayers_.empty());
-      copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
-      if (inputElemenCnt_ != elemenCnt) {
-        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-        // reset when input total sizes changed, not only the batchsize
-        inputElemenCnt_ = elemenCnt;
-        pipelineFwd_.clear();
-        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-        // all cpu device output grad or value share output's
-        shareCPUDevice();
-        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-        // MKLDNNLayer output value should be MKLDNNMatrix
-        // so external output value is necessary.
-        // then external input value is not necessary,
-        // since input may be mkldnn internal buffer.
- CHECK(extOutVal_) << "external output value is necessary"; - output_.value = std::dynamic_pointer_cast(extOutVal_); - CHECK(inVal_ && outVal_) << "internal memories are necessary"; - if (cvtInVal_) { - pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_); - } - if (cvtOutVal_) { - pipelineFwd_.push_back(*cvtOutVal_); - } - convertWeightsFromPaddle(); - printValueFormat(); - needResetBwd_ = true; - } - - if (inputLayers_[0]->getType() == "data") { - // Update input value data when input layer is "data" type, - // since the input value data address might be changed. - CHECK(extInVal_); - extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); - } - - if (!outputOnlyMKLDNN_) { - clearGrads(); - } - stream_->submit(pipelineFwd_); - } - { - REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); - forwardActivation(); - } - } - - void backward(const UpdateCallback& callback) override { - if (needResetBwd_) { - VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; - pipelineBwd_.clear(); - pipelineMergeGrad_.clear(); - mergeGrad_ = nullptr; - resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); - // external output grad is not necessary - // since output may be mkldnn internal buffer or merge them directly. - CHECK(outGrad_) << "internal output grad is necessary"; - if (cvtOutGrad_) { - pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); - } - if (cvtInGrad_) { - pipelineBwd_.push_back(*cvtInGrad_); - } - printGradFormat(); - needResetBwd_ = false; - } - - // merge grad must before backward activation - if (mergeGrad_) { - REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); - stream_->submit(pipelineMergeGrad_); - } - { - REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); - backwardActivation(); - } - { - REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); - stream_->submit(pipelineBwd_); - } - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - updateWeights(callback); - } - } + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; /** * reshape the input image sizes @@ -287,30 +177,12 @@ protected: /** * reshape the input image sizes and input batchsize */ - virtual void reshapeInput(int& batchsize, int& height, int& width) { - const Argument& input = inputLayers_[0]->getOutput(); - batchsize = input.getBatchSize(); - int h = input.getFrameHeight(); - int w = input.getFrameWidth(); - if (h != 0) { - height = h; - } - if (w != 0) { - width = w; - } - } + void reshapeInput(int& batchsize, int& height, int& width); /** * reshape output image sizes */ - virtual void reshapeOutput(size_t height, size_t width) { - output_.setFrameHeight(height); - output_.setFrameWidth(width); - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].setFrameHeight(height); - outputOtherDevice_[i].setFrameWidth(width); - } - } + void reshapeOutput(size_t height, size_t width); /** * reset MKLDNNMatrix from Matrix and internal primitive desc. @@ -318,13 +190,7 @@ protected: */ void resetWithMatrix(MKLDNNMatrixPtr& dnn, const MatrixPtr& mat, - mkldnn::memory::primitive_desc pd) { - dnn = nullptr; - if (mat == nullptr) { - return; - } - dnn = MKLDNNMatrix::create(pd, mat); - } + mkldnn::memory::primitive_desc pd); /** * reset input value from input MKLDNNMatrix and internal primitive desc. 
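Note: the buffer-reset helpers gathered in this header (resetInValue/resetOutValue and the grad variants) all follow one recurring pattern: keep an external buffer in Paddle's plain nchw (or nc) layout, keep an internal buffer in whatever layout the MKLDNN primitive prefers, and submit a reorder primitive only when the two primitive descriptors differ. The standalone sketch below shows just that pattern against the pre-1.0 MKLDNN C++ API this code builds on; the nChw8c internal format and the buffer names are illustrative assumptions, not taken from this patch.

    // Sketch: external(nchw) <-> internal(blocked) buffers with an optional reorder.
    #include <vector>
    #include "mkldnn.hpp"

    int main() {
      using namespace mkldnn;
      const int n = 2, c = 8, h = 4, w = 4;
      engine eng(engine::cpu, 0);

      // External buffer: Paddle's default nchw layout (plays the role of extInVal_).
      std::vector<float> extData(n * c * h * w, 1.f);
      memory::desc extMd({n, c, h, w}, memory::data_type::f32, memory::format::nchw);
      memory ext(memory::primitive_desc(extMd, eng), extData.data());

      // Internal buffer: a blocked layout a primitive might prefer (plays inVal_).
      // nChw8c is only an example; the real layout comes from the primitive desc.
      memory::desc intMd({n, c, h, w}, memory::data_type::f32, memory::format::nChw8c);
      memory internal(memory::primitive_desc(intMd, eng));

      std::vector<primitive> pipeline;
      if (internal.get_primitive_desc() != ext.get_primitive_desc()) {
        // The cvtInVal_ analogue: reorder only when the layouts differ.
        pipeline.push_back(reorder(ext, internal));
      }
      stream(stream::kind::eager).submit(pipeline).wait();
      return 0;
    }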
@@ -332,99 +198,20 @@ protected: */ void resetInValue( MKLDNNMatrixPtr& in, - const std::shared_ptr& intPD = nullptr) { - cvtInVal_ = nullptr; - extInVal_ = nullptr; - in = nullptr; - CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); - auto extPD = MKLDNNMatrix::createPrimitiveDesc( - {bs_, ic_, ih_, iw_}, mkldnn::memory::format::nchw, engine_); - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - in = std::dynamic_pointer_cast(inMat); - CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); - if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) { - in = MKLDNNMatrix::create(extPD, inMat); - } - extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr; - if (in->getFormat() == mkldnn::memory::format::nc) { - CHECK(ih_ == 1 && iw_ == 1); - } - if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) { - return; - } - // need create reorder - in = MKLDNNMatrix::create(*intPD); - extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat); - cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } + const std::shared_ptr& intPD = nullptr); /** * reset output value from internal primitive desc. * reset both internal and external buffer and create reorder if necessary. */ void resetOutValue(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc intPD) { - cvtOutVal_ = nullptr; - out = MKLDNNMatrix::create(intPD, output_.value); - extOutVal_ = out; - if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) { - return; - } - // need create reorder - CHECK_GT(bs_ * oc_ * oh_ * ow_, 0); - extOutVal_ = MKLDNNMatrix::create(mkldnn::memory::dims{bs_, oc_, oh_, ow_}, - mkldnn::memory::format::nchw, - engine_, - output_.value); - out = MKLDNNMatrix::create(intPD); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_); - CHECK(cvtOutVal_) << "should not be empty"; - } + mkldnn::memory::primitive_desc intPD); /** * reset input grad from internal primitive desc. * reset both internal and external buffer and create reorder if necessary. */ - void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD) { - cvtInGrad_ = nullptr; - extInGrad_ = nullptr; - in = nullptr; - LayerPtr& input = inputLayers_[0]; - if (input->getOutputGrad() == nullptr) { - // no need input grad - return; - } - CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1) - << "only support input is MKLDNN layer or only have one output layer"; - // when input is a mkldnn branch node, - // this layer will save input grad to a internal buffer, - // and the mkldnn input layer will merge them to actual prev->output_.grad - const MatrixPtr& inMat = - input->getOutputMapSize() <= 1 ? 
input->getOutputGrad() : nullptr; - in = MKLDNNMatrix::create(intPD, inMat); - Argument& arg = input->getOutput(this->getName()); - arg.grad = std::dynamic_pointer_cast(in); - CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) - << "should have internal input value and primitive desc must equal"; - if (inputIsOnlyMKLDNN()) { - return; - } - - extInGrad_ = in; - if (isPaddleFormat(extInGrad_->getFormat())) { - return; - } - // need create reorder - CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) - << "should have external input value and the format must be nchw(nc)"; - extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); - CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) - << "should have internal input value and primitive desc must equal"; - in = MKLDNNMatrix::create(intPD); - cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_); - CHECK(cvtInGrad_); - } + void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD); /** * reset output grad from internal primitive desc. @@ -434,81 +221,59 @@ protected: * it could not be mixed with cpu device, * since it can not get memory desc from cpu device. */ - void resetOutGrad(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc intPD) { - cvtOutGrad_ = nullptr; - extOutGrad_ = nullptr; - out = nullptr; - MatrixPtr& outMat = output_.grad; - out = MKLDNNMatrix::create(intPD, outMat); - resetMergeGrad(out); - if (outputIsOnlyMKLDNN()) { - return; - } - CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device"; - extOutGrad_ = out; - if (isPaddleFormat(extOutGrad_->getFormat())) { - return; - } - // need create reorder - CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat())) - << "should have external output value and the format must be nchw(nc)"; - extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat); - CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD) - << "should have internal output value and primitive desc must equal"; - out = MKLDNNMatrix::create(intPD); - cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out); - CHECK(cvtOutGrad_); - } + void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD); /** * reset the merge grad primitive if necessary. * note: do not support the grads are mixed with cpu device, * since it can not get memory desc from cpu device. */ - virtual void resetMergeGrad(MKLDNNMatrixPtr& out) { - mergeGrad_ = nullptr; - pipelineMergeGrad_.clear(); - if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) { - // do not merge when output is not all MKLDNN or only one output - return; - } - CHECK(out) << "should have reset internal ouput grad"; - std::vector scales(outputMap_.size(), 1.0); - std::vector srcPDs; - std::vector srcs; - for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { - MKLDNNMatrixPtr src = - std::dynamic_pointer_cast(it->second->grad); - VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first; - CHECK(src) << "should be MKLDNNMatrix"; - auto srcDims = src->getDims(); - auto dstDims = out->getDims(); - CHECK_EQ(srcDims.size(), dstDims.size()); - for (size_t i = 0; i < srcDims.size(); ++i) { - CHECK_EQ(srcDims[i], dstDims[i]); - } - srcPDs.push_back(src->getPrimitiveDesc()); - srcs.push_back(*src); - } + void resetMergeGrad(MKLDNNMatrixPtr& out); + +protected: + /** + * Set deviceId of this layer. 
+ */ + void setDevice(int id) { deviceId_ = id; } - // TODO(TJ): remove me when mkldnn sum support different formats - for (size_t i = 1; i < srcPDs.size(); ++i) { - CHECK(srcPDs[0] == srcPDs[i]); + /** + * check the format is nchw or nc, + * which is supported by Paddle default memory layout + */ + bool isPaddleFormat(mkldnn::memory::format fmt) { + if (fmt == mkldnn::memory::format::nchw || + fmt == mkldnn::memory::format::nc) { + return true; + } else { + return false; } - tmpOutGrad_ = out; - tmpCvt_ = nullptr; - if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); - tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); - CHECK(tmpCvt_); - pipelineMergeGrad_.push_back(*tmpCvt_); + } + + /** + * If input only has MKLDNN device. + * Otherwise, only support the previous layer using CPU device. + */ + bool inputIsOnlyMKLDNN(int index = 0) { + int prevDevice = getPrev(index)->getDeviceId(); + if (prevDevice == MKLDNN_DEVICE) { + return true; + } else { + CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; + return false; } + } - auto sumPD = mkldnn::sum::primitive_desc( - tmpOutGrad_->getMemoryDesc(), scales, srcPDs); - mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_)); - pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); + /** + * If output only has MKLDNN device. + * Otherwise, other devices should only using CPU device. + */ + bool outputIsOnlyMKLDNN() { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) + << "Only support other device is CPU yet"; + } + outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; + return outputOnlyMKLDNN_; } /** @@ -568,54 +333,7 @@ protected: } } -protected: - /** - * If input only has MKLDNN device. - * Otherwise, only support the previous layer using CPU device. - */ - bool inputIsOnlyMKLDNN(int index = 0) { - int prevDevice = getPrev(index)->getDeviceId(); - if (prevDevice == MKLDNN_DEVICE) { - return true; - } else { - // do not support GPU yet - CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; - return false; - } - } - - /** - * If output only has MKLDNN device. - * Otherwise, other devices should only using CPU device. - */ - bool outputIsOnlyMKLDNN() { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) - << "Only support other device is CPU yet"; - } - outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; - return outputOnlyMKLDNN_; - } - - /** - * Set deviceId of this layer. 
- */ - void setDevice(int id) { deviceId_ = id; } - private: - /** - * check the format is nchw or nc, - * which is supported by Paddle default memory layout - */ - bool isPaddleFormat(mkldnn::memory::format fmt) { - if (fmt == mkldnn::memory::format::nchw || - fmt == mkldnn::memory::format::nc) { - return true; - } else { - return false; - } - } - /** * clear all grad */ From d75b00c2210c247bcf626bf3239fd6f7dc115e49 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 17:18:22 +0800 Subject: [PATCH 4/5] refine the gtest log info and vlog order, and change the size of test to make unit test faster refine comment and log of mkldnnlayer --- paddle/gserver/layers/MKLDNNBase.h | 4 +- paddle/gserver/layers/MKLDNNLayer.cpp | 7 ++- paddle/gserver/layers/MKLDNNLayer.h | 8 ++-- paddle/gserver/tests/MKLDNNTester.cpp | 44 ++++++++++--------- paddle/gserver/tests/MKLDNNTester.h | 8 +--- .../sample_trainer_config_branch_net.conf | 26 +++++------ .../sample_trainer_config_simple_net.conf | 2 +- 7 files changed, 52 insertions(+), 47 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h index 4c0234e7b3..af02a37cad 100644 --- a/paddle/gserver/layers/MKLDNNBase.h +++ b/paddle/gserver/layers/MKLDNNBase.h @@ -21,8 +21,8 @@ namespace paddle { typedef enum { MKLDNN_BASE = 1, // basical info of MKLDNN MKLDNN_TESTS = 1, // gtest info of MKLDNN - MKLDNN_SIZES = 2, // size info of MKLDNN - MKLDNN_FMTS = 3, // format info of MKLDNN + MKLDNN_FMTS = 2, // format info of MKLDNN + MKLDNN_SIZES = 3, // size info of MKLDNN MKLDNN_ALL = 4, // show all info of MKLDNN } MKLDNN_LOG_LEVEL; diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index 91f0ff5bd3..f4968c4af3 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -105,6 +105,10 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { // external output grad is not necessary // since output may be mkldnn internal buffer or merge them directly. CHECK(outGrad_) << "internal output grad is necessary"; + if (extOutGrad_) { + CHECK_EQ(extOutGrad_->getData(), output_.grad->getData()) + << "the external buffer should share the same data with output_.grad"; + } if (cvtOutGrad_) { pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); } @@ -293,7 +297,6 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { MKLDNNMatrixPtr src = std::dynamic_pointer_cast(it->second->grad); - VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first; CHECK(src) << "should be MKLDNNMatrix"; auto srcDims = src->getDims(); auto dstDims = out->getDims(); @@ -301,6 +304,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { for (size_t i = 0; i < srcDims.size(); ++i) { CHECK_EQ(srcDims[i], dstDims[i]); } + VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first + << ", format " << src->getFormat(); srcPDs.push_back(src->getPrimitiveDesc()); srcs.push_back(*src); } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index faad434526..656b5ee2d7 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -58,13 +58,13 @@ protected: std::vector pipelineFwd_; std::vector pipelineBwd_; - /// value and grad are seperate as internal and external buffers. + /// value and grad are seperated as internal and external buffers. 
 /// each MKLDNNLayer must init or reset internal buffer at least,
 /// and the external buffer format is always nchw of nc(when h==w==1),
 /// which is the same format as paddle.
-  /// When mixed with cpu device, the output_.value and output_.grad
-  /// always save the external data.
-  /// When all layers are all mkldnn layers, they could be internal data.
+  /// The output_.value and output_.grad always save the external data,
+  /// when mixed with cpu device.
+  /// When all layers are mkldnn layers, they could save internal data.
   /// below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 3bf6a9e176..0a19fe2333 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
     parameters_[REF][i]->randomize();
     dnnValue->copyFrom(*refValue);
 
-    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
     printVector(dnnValue);
   }
 }
@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
     dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
     dataLayers_[DNN][i]->getOutputValue()->copyFrom(
         *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(lvl_) << "Input " << i << " data:";
+    VLOG(MKLDNN_TESTS) << "Random Forward, InputValue " << i;
     printMatrix(dataLayers_[REF][i]->getOutputValue());
   }
 }
@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
   refLayer_->getOutputGrad()->randomizeUniform();
   dnnLayer_->getOutput(CPU_DEVICE)
       .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(lvl_) << "Random Backward Input, TopDiff: ";
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
   printMatrix(refLayer_->getOutputGrad());
 }
 
 void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_ALL) << "Check Forward";
+  VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
       compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
 }
 
 void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_ALL) << "Check Backward Data";
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
   // TODO(TJ): uncomment me when batch norm ready
   // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
   for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
     const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
     const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
     printMatrix(dnnDiff);
-    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
     double delta = compareMatrix(dnnDiff, refDiff);
@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
 }
 
 void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_ALL) << "Check Backward Weight";
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   vector<VectorPtr> dnnWgts;  // used to temporarily save mkldnn weights
   saveWgt(parameters_[DNN], dnnWgts);
@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
   for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
     const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
     const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value "
+                     << parameters_[DNN][i]->getName();
     printVector(dnn);
-    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
     printVector(ref);
 
     double delta = compareVector(dnn, ref);
@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
   }
 
   for (int n = 0; n < NUM; ++n) {
-    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
     printMatrix(testLayers_[n]->getOutputValue());
   }
 }
@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
 
   std::ostringstream ostr;
   m->print(ostr);
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 void MKLDNNTester::printVector(const VectorPtr& v) {
@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
 
   std::ostringstream ostr;
   v->print(ostr, v->getSize());
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 double MKLDNNTester::getDelta(const real* d1,
@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
   UpdateCallback updateCallback = [](Parameter* para) {
     auto& grad = para->getBuf(PARAMETER_GRADIENT);
     auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-3;
+    real lr = 1e-2;
     value->add(*grad, lr);
     grad->zeroMem();
   };
@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
                        size_t batchSize,
                        size_t inputImgH,
                        size_t inputImgW,
+                       bool printDetails,
                        size_t iter,
-                       float epsilon,
-                       bool log,
-                       int level) {
+                       float epsilon) {
   CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
         dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
       << "should be MKLDNN layer or MKLDNN activation";
@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
 
   ih_ = inputImgH;
   iw_ = inputImgW;
+  log_ = printDetails;
   iter_ = iter;
   eps_ = epsilon;
-  log_ = log;
-  lvl_ = level;
 
   // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
 
 void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
   CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.outValues.size(); i++) {
     EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
   }
+  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.paraValues.size();
   for (size_t i = 0; i < ref.paraValues.size(); i++) {
     EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
   }
@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
                                    float eps) {
   DataIn in;
   initArgument(in, configPath, iter);
-
   DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "running cpu network";
   getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "running mkldnn network";
   getOutResult(configPath, in, outDnn, true, iter);
 
   compareResult(outCpu, outDnn, eps);
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 51abfcb67e..c385d1c727 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -58,8 +58,6 @@ protected:
   size_t iter_;
   /// whether to print out the details
   bool log_;
-  /// vlog level to print the matrix details
datas - int lvl_; /// epsilon float eps_; /// input image size, default 1 @@ -70,7 +68,6 @@ public: iter_ = iter; eps_ = epsilon; log_ = false; - lvl_ = MKLDNN_ALL; } ~MKLDNNTester() {} @@ -81,10 +78,9 @@ public: size_t batchSize, size_t inputImgH = 1, size_t inputImgW = 1, + bool printDetails = false, size_t iter = 3, - float epsilon = 1e-4, - bool log = false, - int level = MKLDNN_ALL); + float epsilon = 1e-4); static void runBranchesTest(const std::string& configPath, size_t iter = 3, float eps = 1e-4); diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf index c2594bc13c..a073708a18 100644 --- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf +++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf @@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -settings(batch_size = 256, +settings(batch_size = 128, learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### data = data_layer(name ="input", size=784) @@ -44,10 +44,11 @@ a2 = img_conv_layer(input=tmp, shared_biases=True, act=ReluActivation()) -tmp = concat_layer(input=[a1, a2]) +tmp = addto_layer(input=[a1, a2], + act=ReluActivation(), + bias_attr=False) tmp = img_pool_layer(input=tmp, - num_channels=64, pool_size=3, stride=2, padding=1, @@ -55,35 +56,34 @@ tmp = img_pool_layer(input=tmp, b1 = img_conv_layer(input=tmp, filter_size=3, - num_filters=64, + num_filters=32, padding=1, shared_biases=True, act=ReluActivation()) b1 = img_pool_layer(input=b1, pool_size=3, - stride=1, - padding=1, + stride=2, + padding=0, pool_type=MaxPooling()) b2 = img_conv_layer(input=tmp, - filter_size=5, + filter_size=3, num_filters=64, - padding=2, + padding=1, shared_biases=True, act=ReluActivation()) b2 = img_pool_layer(input=b2, pool_size=5, - stride=1, - padding=2, + stride=2, + padding=1, pool_type=MaxPooling()) -tmp = addto_layer(input=[b1, b2], - act=ReluActivation(), - bias_attr=False) +tmp = concat_layer(input=[b1, b2]) tmp = img_pool_layer(input=tmp, + num_channels=96, pool_size=3, stride=2, padding=1, diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf index 77f7816153..2ba71884d0 100644 --- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf +++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf @@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -settings(batch_size = 1000, +settings(batch_size = 128, learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### data = data_layer(name ="input", size=784) From 5c892db64ca22200e9d245da3ff72d1dfca3738d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 20 Oct 2017 21:43:54 +0800 Subject: [PATCH 5/5] remove unused code refine comments and bias fix typo and todo --- 
paddle/gserver/layers/MKLDNNConvLayer.cpp | 8 ++--- paddle/gserver/layers/MKLDNNFcLayer.cpp | 21 ++++++------ paddle/gserver/layers/MKLDNNLayer.cpp | 7 ++-- paddle/gserver/layers/MKLDNNLayer.h | 39 ++++++++++++----------- 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 3fbfb1ab1f..83f4e4e615 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -210,11 +210,11 @@ void MKLDNNConvLayer::resetFwdBuffers( resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - bias = nullptr; - if (biases_ == nullptr || biases_->getW() == nullptr) { - return; + if (biases_ && biases_->getW()) { + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); + } else { + bias = nullptr; } - resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); } void MKLDNNConvLayer::resetFwdPipeline( diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index 9f82a3b747..d82063a713 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -134,10 +134,6 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, CHECK(in); in->downSpatial(); - // if (extInVal_) { - // extInVal_->downSpatial(); - // } - auto outPD = MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); resetOutValue(out, outPD); @@ -153,11 +149,12 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, resetWithMatrix(wgt, weight_->getW(), wgtPD); wgt->downSpatial(); - if (biases_ == nullptr || biases_->getW() == nullptr) { - return; + if (biases_ && biases_->getW()) { + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); + } else { + bias = nullptr; } - auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); - resetWithMatrix(bias, biases_->getW(), biasPD); } void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, @@ -207,11 +204,11 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, CHECK(wgtVal_); resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); - bias = nullptr; - if (biasVal_ == nullptr) { - return; + if (biasVal_) { + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); + } else { + bias = nullptr; } - resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); } void MKLDNNFcLayer::resetBwdWgtPD( diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index f4968c4af3..6bb19976b5 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -60,7 +60,7 @@ void MKLDNNLayer::forward(PassType passType) { resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); // MKLDNNLayer output value should be MKLDNNMatrix // so external output value is necessary. - // then external input value is not necessary, + // Then external input value is not necessary, // since input may be mkldnn internal buffer. 
 CHECK(extOutVal_) << "external output value is necessary";
   output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
@@ -235,8 +235,8 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
   in = MKLDNNMatrix::create(intPD, inMat);
   Argument& arg = input->getOutput(this->getName());
   arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
-      << "should have internal input value and primitive desc must equal";
+  CHECK(inVal_);
+  CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must be equal";
   if (inputIsOnlyMKLDNN()) {
     return;
   }
@@ -246,6 +246,7 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
     return;
   }
   // need create reorder
+  // TODO(TJ): add macro definition to simplify it
   CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
   extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 656b5ee2d7..9b54c95b55 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -58,14 +58,15 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  /// value and grad are seperated as internal and external buffers.
-  /// each MKLDNNLayer must init or reset internal buffer at least,
-  /// and the external buffer format is always nchw of nc(when h==w==1),
-  /// which is the same format as paddle.
-  /// The output_.value and output_.grad always save the external data,
-  /// when mixed with cpu device.
-  /// When all layers are mkldnn layers, they could save internal data.
-  /// below MKLDNNMatrix buffers are all internal buffers
+  /* Value and grad are separated as internal and external buffers.
+   * Each MKLDNNLayer must init or reset internal buffer at least,
+   * and the external buffer format is always nchw or nc(when h==w==1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data,
+   * when mixed with cpu device.
+   * When all layers are mkldnn layers, they could save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
@@ -120,8 +121,8 @@ public:
   ~MKLDNNLayer() {}
 
   virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
 
   /**
    * reshape the input image sizes
@@ -217,7 +218,7 @@ protected:
    * reset output grad from internal primitive desc.
    * merge grad if necessary.
    * reset both internal and external buffer and create reorder if necessary.
-   * note: about merge grad, when this layer has serval outputs,
+   * note: about merge grad, when this layer has several outputs,
    * it could not be mixed with cpu device,
    * since it can not get memory desc from cpu device.
    */
@@ -225,7 +226,7 @@
   /**
    * reset the merge grad primitive if necessary.
-   * note: do not support the grads are mixed with cpu device,
+   * note: do not support the grads mixed with cpu device,
    * since it can not get memory desc from cpu device.
*/ void resetMergeGrad(MKLDNNMatrixPtr& out); @@ -313,17 +314,17 @@ protected: * print the mkldnn memory format of grad */ virtual void printGradFormat() { - if (extInGrad_) { - VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; - } - if (inGrad_) { - VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); } if (outGrad_) { VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; } - if (extOutGrad_) { - VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + if (inGrad_) { + VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + } + if (extInGrad_) { + VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; } if (wgtGrad_) { VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
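Note: resetMergeGrad() above accumulates the output grads of several branch layers with MKLDNN's sum primitive, first reordering into a temporary buffer when the destination desc differs from the sources. The following standalone sketch shows just the sum step against the pre-1.0 MKLDNN C++ API; the two-source setup is an illustration, and the scale vector's element type is an assumption (it changed across pre-1.0 releases).

    // Sketch: merge two branch gradients with the sum primitive (resetMergeGrad analogue).
    #include <vector>
    #include "mkldnn.hpp"

    int main() {
      using namespace mkldnn;
      const int n = 2, c = 8, h = 4, w = 4;
      engine eng(engine::cpu, 0);
      memory::desc md({n, c, h, w}, memory::data_type::f32, memory::format::nchw);
      memory::primitive_desc pd(md, eng);

      // Two branch gradients and the merged destination (the outGrad_ analogue).
      std::vector<float> g0(n * c * h * w, 1.f), g1(n * c * h * w, 2.f);
      std::vector<float> dst(n * c * h * w, 0.f);
      memory src0(pd, g0.data()), src1(pd, g1.data()), out(pd, dst.data());

      std::vector<float> scales(2, 1.f);  // plain addition; element type assumed
      std::vector<memory::primitive_desc> srcPDs(2, pd);
      auto sumPD = sum::primitive_desc(md, scales, srcPDs);

      std::vector<primitive::at> inputs;
      inputs.push_back(src0);
      inputs.push_back(src1);
      sum mergeGrad(sumPD, inputs, out);

      std::vector<primitive> pipeline(1, mergeGrad);
      stream(stream::kind::eager).submit(pipeline).wait();
      // dst now holds the elementwise total g0 + g1.
      return 0;
    }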