Merge pull request #4953 from tensor-tang/merge_grad_gtest

refine the mkldnn logic
Tao Luo 7 years ago committed by GitHub
commit abce9eb750

@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
   copyInVal_ = nullptr;
   if (act.grad && algo == algorithm::eltwise_tanh) {
     // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
     copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
     CHECK(copyInVal_) << "should not be emptry";
     pipelineFwd_.push_back(*copyInVal_);
@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
   algorithm algo = getAlgo(this->getName());
   float alpha = getBwdAlpha();
   float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
   auto eng = CPUEngine::Instance().getEngine();
   auto bwdDesc = eltwise_bwd::desc(
       algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
   int ic = cnt_ / bs / ih / iw;
   CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
   val_ = MKLDNNMatrix::create(
-      act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+      {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
   CHECK(val_);
   val_->downSpatial();
 }
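The three hunks above follow from the reordered MKLDNNMatrix::create() signature (see the MKLDNNMatrix.h hunk later in this diff): the MatrixPtr argument moves to the end and defaults to nullptr. A minimal usage sketch of the two call styles, assuming a primitive_desc `pd` and a CPU MatrixPtr `cpuMat` are already in scope (both names are placeholders, not from the diff):

    // no CPU matrix attached; the MKLDNNMatrix manages its own buffer
    MKLDNNMatrixPtr tmp = MKLDNNMatrix::create(pd);
    // wrap an existing MatrixPtr with the same primitive_desc
    MKLDNNMatrixPtr wrapped = MKLDNNMatrix::create(pd, cpuMat);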

@@ -21,8 +21,8 @@ namespace paddle {
 typedef enum {
   MKLDNN_BASE = 1,   // basical info of MKLDNN
   MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_SIZES = 2,  // size info of MKLDNN
-  MKLDNN_FMTS = 3,   // format info of MKLDNN
+  MKLDNN_FMTS = 2,   // format info of MKLDNN
+  MKLDNN_SIZES = 3,  // size info of MKLDNN
   MKLDNN_ALL = 4,    // show all info of MKLDNN
 } MKLDNN_LOG_LEVEL;
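Swapping the two levels means format messages now surface at a lower verbosity than size messages. A hedged usage sketch (the log statements are illustrative, not taken from this diff):

    VLOG(MKLDNN_FMTS) << "weight format: " << wgtVal_->getFormat();  // visible at glog --v >= 2 after this change
    VLOG(MKLDNN_SIZES) << "oc: " << oc_ << ", ic: " << ic_;          // now needs --v >= 3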

(File diff suppressed because it is too large.)

@@ -48,17 +48,6 @@ protected:
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
 
-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuInVal_;
-  MKLDNNMatrixPtr cpuInGrad_;
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // whether the weight has been init
   bool hasInitedWgt_;
@@ -94,8 +83,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;
 
   void convertWeightsFromPaddle() override;
@@ -109,26 +96,6 @@ public:
             << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
-  void printValueFormatFlow() override {
-    if (cpuInVal_) {
-      VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
-    }
-    MKLDNNLayer::printValueFormatFlow();
-    if (cpuOutVal_) {
-      VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormatFlow() override {
-    if (cpuInGrad_) {
-      VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
-    }
-    MKLDNNLayer::printGradFormatFlow();
-    if (cpuOutGrad_) {
-      VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
-    }
-  }
-
 protected:
   /**
    * load the dims settings of this conv
@@ -162,23 +129,6 @@ protected:
                           MKLDNNMatrixPtr& bias,
                           MKLDNNMatrixPtr& out);
 
-  /**
-   * reset MKLDNNMatrix of input value
-   */
-  void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                    MKLDNNMatrixPtr& in);
-  /**
-   * reset MKLDNNMatrix of weight and bias value
-   */
-  void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                         MKLDNNMatrixPtr& wgt,
-                         MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of output value
-   */
-  void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& out);
-
   /**
    * reset the backward weight primitive descriptor.
    */
@@ -207,22 +157,6 @@ protected:
                            MKLDNNMatrixPtr& bias,
                            MKLDNNMatrixPtr& out);
 
-  /**
-   * reset MKLDNNMatrix of output grad
-   */
-  void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                    MKLDNNMatrixPtr& out);
-  /**
-   * reset MKLDNNMatrix of weight and bias grad
-   */
-  void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of input grad
-   */
-  void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                   MKLDNNMatrixPtr& in);
-
   /**
    * reset MKLDNNMatrix of weight value for backward data
    * since the primitive_desc would be different with wgtVal_

@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, wgt, bias, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdDataPD(bwdDataPD, in, out);
 
   resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNFcLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
   resetInValue(in);
+  CHECK(in);
-
-  resetWgtBiasValue(wgt, bias);
-
-  resetOutValue(out);
-}
-
-void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
   in->downSpatial();
-}
 
-void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
-                                      MKLDNNMatrixPtr& bias) {
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
+  resetOutValue(out, outPD);
+
   format wgtFmt = format::oihw;
-  if (inVal_->getFormat() == format::nChw8c) {
+  if (in->getFormat() == format::nChw8c) {
     wgtFmt = format::oIhw8i;
-  } else if (inVal_->getFormat() == format::nChw16c) {
+  } else if (in->getFormat() == format::nChw16c) {
     wgtFmt = format::oIhw16i;
   }
-  wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  auto wgtPD =
+      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  resetWithMatrix(wgt, weight_->getW(), wgtPD);
   wgt->downSpatial();
-  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
 
-  bias = (biases_ && biases_->getW())
-             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
-             : nullptr;
-}
-
-void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
-  if (!outputIsOnlyMKLDNN()) {
-    // fc cpu output value do not need create convert, just share data
-    getOutput(CPU_DEVICE).value->setData(out->getData());
+  if (biases_ && biases_->getW()) {
+    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    resetWithMatrix(bias, biases_->getW(), biasPD);
+  } else {
+    bias = nullptr;
   }
-  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }
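The per-layer reset helpers give way to base-class calls: resetInValue/resetOutValue/resetOutGrad/resetInGrad now take an explicit primitive_desc, and resetWithMatrix binds a Paddle MatrixPtr to one. The MKLDNNLayer diff itself is suppressed above, so the helper's exact body is not shown here; a minimal sketch of what these call sites require, with the nullptr handling assumed rather than confirmed:

    void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
                                      const MatrixPtr& mat,
                                      mkldnn::memory::primitive_desc pd) {
      dnn = nullptr;
      if (mat == nullptr) {
        return;  // assumed: callers may pass an empty weight/bias gradient
      }
      dnn = MKLDNNMatrix::create(pd, mat);  // new argument order: pd first, MatrixPtr second
    }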
 void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,

@@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
   } else {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
   }
-
   pipeline.push_back(*fwd_);
 }
@@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& wgt,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetWgtBiasGrad(wgt, bias);
-
-  resetInGrad(in);
-}
-
-void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    output_.grad->setData(cpuOut->getData());
-    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
-  }
-}
-
-void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
-                                     MKLDNNMatrixPtr& bias) {
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
+
   CHECK(wgtVal_);
-  wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
-
-  bias = nullptr;
-  if (biasVal_ == nullptr) {
-    return;
-  }
-  bias =
-      MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-}
-
-void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
+  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+  if (biasVal_) {
+    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+  } else {
+    bias = nullptr;
   }
-  CHECK(inVal_);
-  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNFcLayer::resetBwdWgtPD(

@@ -66,8 +66,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;
 
   void convertWeightsFromPaddle() override;
@@ -84,9 +82,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr wgt,
@@ -109,9 +104,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
                      MKLDNNMatrixPtr& wgt,
                      MKLDNNMatrixPtr& bias,

(File diff suppressed because it is too large.)

(File diff suppressed because it is too large.)

@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdPD(pd, in, out);
 
   resetBwdPipeline(pipeline, pd, in, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNPoolLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetInValue(in);
-
-  resetOutValue(out);
-}
-
-void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
-}
-
-void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  CHECK(inVal_) << "Should reset input value first";
+
   memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  out = MKLDNNMatrix::create(
-      output_.value, outDims, inVal_->getFormat(), engine_);
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutVal_ = nullptr;
-  cvtOutVal_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
-    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
-      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
-    } else {
-      cpuOut->setData(output_.value->getData());
-      cpuOutVal_ = out;
-    }
-    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
-    return;
-  }
-  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  CHECK(in);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
 }
 
 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr in,
                                  MKLDNNMatrixPtr out) {
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
@@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline(
           ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
           : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
   pipeline.push_back(*fwd_);
-
-  if (cvtOutVal_) {
-    pipeline.push_back(*cvtOutVal_);
-  }
 }
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetInGrad(in);
-}
-
-void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  cpuOutGrad_ = nullptr;
-  cvtOutGrad_ = nullptr;
-  CHECK(outVal_);
-  if (outputIsOnlyMKLDNN()) {
-    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
-  } else {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    // always share the same grad data of CPU output
-    // then the activation can get the right grad from output_.grad
-    output_.grad->setData(cpuOut->getData());
-    cpuOutGrad_ = MKLDNNMatrix::create(
-        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
-      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
-      CHECK(cvtOutGrad_) << "should not be emptry";
-    } else {
-      out = cpuOutGrad_;
-    }
-  }
-}
-
-void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-  CHECK(inVal_);
-  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr& in,
                                  MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
   memory::dims padR = getPaddingR();
+  CHECK(in);
   CHECK(out);
   auto bwdDesc = pool_bwd::desc(poolAlgo_,
                                 in->getMemoryDesc(),
@@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  if (cvtOutGrad_) {
-    pipeline.push_back(*cvtOutGrad_);
+  if (pd == nullptr) {
+    return;
   }
 
   bwdData_ =

@@ -38,13 +38,6 @@ protected:
   // pooling_avg or pooling_max
   mkldnn::algorithm poolAlgo_;
 
-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
   // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
@@ -74,8 +67,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
@@ -90,8 +81,6 @@ protected:
    * reset pipeline.
    */
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr out);
@@ -106,8 +95,6 @@ protected:
    * reset pipeline.
    */
   void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr& in,
                   MKLDNNMatrixPtr& out);

@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
     parameters_[REF][i]->randomize();
     dnnValue->copyFrom(*refValue);
 
-    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
     printVector(dnnValue);
   }
 }
@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
     dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
     dataLayers_[DNN][i]->getOutputValue()->copyFrom(
         *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(lvl_) << "Input " << i << " data:";
+    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
     printMatrix(dataLayers_[REF][i]->getOutputValue());
   }
 }
@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
   refLayer_->getOutputGrad()->randomizeUniform();
   dnnLayer_->getOutput(CPU_DEVICE)
       .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(lvl_) << "Random Backward Input, TopDiff: ";
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
   printMatrix(refLayer_->getOutputGrad());
 }
 
 void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_ALL) << "Check Forward";
+  VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
       compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
 }
 
 void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_ALL) << "Check Backward Data";
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
   // TODO(TJ): uncomment me when batch norm ready
   // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
   for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
     const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
     const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
     printMatrix(dnnDiff);
-    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
     double delta = compareMatrix(dnnDiff, refDiff);
@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
 }
 
 void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_ALL) << "Check Backward Weight";
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
   saveWgt(parameters_[DNN], dnnWgts);
@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
   for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
     const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
     const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
+                     << parameters_[DNN][i]->getName();
     printVector(dnn);
-    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
     printVector(ref);
 
     double delta = compareVector(dnn, ref);
@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
   }
 
   for (int n = 0; n < NUM; ++n) {
-    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
     printMatrix(testLayers_[n]->getOutputValue());
   }
 }
@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
 
   std::ostringstream ostr;
   m->print(ostr);
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 void MKLDNNTester::printVector(const VectorPtr& v) {
@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
 
   std::ostringstream ostr;
   v->print(ostr, v->getSize());
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 double MKLDNNTester::getDelta(const real* d1,
@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
   UpdateCallback updateCallback = [](Parameter* para) {
     auto& grad = para->getBuf(PARAMETER_GRADIENT);
     auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-3;
+    real lr = 1e-2;
     value->add(*grad, lr);
     grad->zeroMem();
   };
@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
                        size_t batchSize,
                        size_t inputImgH,
                        size_t inputImgW,
+                       bool printDetails,
                        size_t iter,
-                       float epsilon,
-                       bool log,
-                       int level) {
+                       float epsilon) {
   CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
         dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
       << "should be MKLDNN layer or MKLDNN activation";
@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
   ih_ = inputImgH;
   iw_ = inputImgW;
 
+  log_ = printDetails;
   iter_ = iter;
   eps_ = epsilon;
-  log_ = log;
-  lvl_ = level;
 
   // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
 void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
   CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.outValues.size(); i++) {
     EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
   }
+  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.paraValues.size(); i++) {
     EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
   }
@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
                                    float eps) {
   DataIn in;
   initArgument(in, configPath, iter);
-
   DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "runing cpu network";
   getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
   getOutResult(configPath, in, outDnn, true, iter);
 
   compareResult(outCpu, outDnn, eps);

@@ -58,8 +58,6 @@ protected:
   size_t iter_;
   /// whether to print out the details
   bool log_;
-  /// vlog level to print the matrix details datas
-  int lvl_;
   /// epsilon
   float eps_;
   /// input image size, default 1
@@ -70,7 +68,6 @@ public:
     iter_ = iter;
     eps_ = epsilon;
     log_ = false;
-    lvl_ = MKLDNN_ALL;
   }
 
   ~MKLDNNTester() {}
@@ -81,10 +78,9 @@ public:
                   size_t batchSize,
                   size_t inputImgH = 1,
                   size_t inputImgW = 1,
+                  bool printDetails = false,
                   size_t iter = 3,
-                  float epsilon = 1e-4,
-                  bool log = false,
-                  int level = MKLDNN_ALL);
+                  float epsilon = 1e-4);
 
   static void runBranchesTest(const std::string& configPath,
                               size_t iter = 3,
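The separate log/level pair collapses into a single printDetails switch, so a test that previously tuned verbosity now just toggles detailed printing. A hedged call sketch; the config objects and sizes below are placeholders, not taken from the diff:

    MKLDNNTester tester;
    // before: tester.run(dnnCfg, refCfg, 64, 16, 16, 3, 1e-4, true, MKLDNN_ALL);
    tester.run(dnnCfg, refCfg, /*batchSize=*/64, /*inputImgH=*/16, /*inputImgW=*/16,
               /*printDetails=*/true, /*iter=*/3, /*epsilon=*/1e-4);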

@@ -18,7 +18,7 @@ using namespace mkldnn;  // NOLINT
 
 namespace paddle {
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
   memory::desc md = pd.desc();
   size_t ndims = md.data.ndims;
   int* dims = md.data.dims;
@@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
-                                     memory::dims dims,
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
                                      memory::format fmt,
                                      engine& eg,
+                                     MatrixPtr m,
                                      mkldnn::memory::data_type dtype) {
-  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
 }
 
 std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,

@@ -40,24 +40,37 @@ public:
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
    */
-  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
+                                MatrixPtr m = nullptr);
 
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory details info
    */
   static MKLDNNMatrixPtr create(
-      MatrixPtr m,
       mkldnn::memory::dims dims,
       mkldnn::memory::format fmt,
       mkldnn::engine& eg,
+      MatrixPtr m = nullptr,
       mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
 
+  /**
+   * Create primitive descriptor.
+   * default with f32 dtype
+   */
+  static mkldnn::memory::primitive_desc createPrimitiveDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt,
+      const mkldnn::engine& eg,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
+  }
+
   /**
    * Create Memory descriptor.
    * default with any format and f32 dtype
    */
   static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims& dims,
+      const mkldnn::memory::dims dims,
       const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
       const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
     return mkldnn::memory::desc(dims, dtype, fmt);
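With the MatrixPtr now optional and last, an MKLDNNMatrix can be created from a primitive_desc alone or bound to an existing Paddle matrix, and createPrimitiveDesc builds the descriptor in one call. A short usage sketch; eng, cpuMat, and the bs/ic/ih/iw dims are placeholder names, and the "allocates its own buffer" behavior is assumed from how create(pd) replaces the old create(nullptr, pd) calls above:

    auto pd = MKLDNNMatrix::createPrimitiveDesc({bs, ic, ih, iw},
                                                mkldnn::memory::format::nchw, eng);
    MKLDNNMatrixPtr internal = MKLDNNMatrix::create(pd);        // no MatrixPtr: assumed to allocate its own buffer
    MKLDNNMatrixPtr shared = MKLDNNMatrix::create(pd, cpuMat);  // wraps cpuMat's data
    // equivalent one-step form using the dims-based overload:
    MKLDNNMatrixPtr same = MKLDNNMatrix::create({bs, ic, ih, iw},
                                                mkldnn::memory::format::nchw, eng, cpuMat);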

@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
 ################################### Data Configuration ###################################
 TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-settings(batch_size = 256,
+settings(batch_size = 128,
          learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
 data = data_layer(name ="input", size=784)
@@ -44,10 +44,11 @@ a2 = img_conv_layer(input=tmp,
                     shared_biases=True,
                     act=ReluActivation())
 
-tmp = concat_layer(input=[a1, a2])
+tmp = addto_layer(input=[a1, a2],
+                  act=ReluActivation(),
+                  bias_attr=False)
 
 tmp = img_pool_layer(input=tmp,
-                     num_channels=64,
                      pool_size=3,
                      stride=2,
                      padding=1,
@@ -55,35 +56,34 @@ tmp = img_pool_layer(input=tmp,
 
 b1 = img_conv_layer(input=tmp,
                     filter_size=3,
-                    num_filters=64,
+                    num_filters=32,
                     padding=1,
                     shared_biases=True,
                     act=ReluActivation())
 
 b1 = img_pool_layer(input=b1,
                     pool_size=3,
-                    stride=1,
-                    padding=1,
+                    stride=2,
+                    padding=0,
                     pool_type=MaxPooling())
 
 b2 = img_conv_layer(input=tmp,
-                    filter_size=5,
+                    filter_size=3,
                     num_filters=64,
-                    padding=2,
+                    padding=1,
                     shared_biases=True,
                     act=ReluActivation())
 
 b2 = img_pool_layer(input=b2,
                     pool_size=5,
-                    stride=1,
-                    padding=2,
+                    stride=2,
+                    padding=1,
                     pool_type=MaxPooling())
 
-tmp = addto_layer(input=[b1, b2],
-                  act=ReluActivation(),
-                  bias_attr=False)
+tmp = concat_layer(input=[b1, b2])
 
 tmp = img_pool_layer(input=tmp,
+                     num_channels=96,
                      pool_size=3,
                      stride=2,
                      padding=1,
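The addto_layer/concat_layer swap is symmetric with the earlier hunk: the a1/a2 branch becomes an element-wise add (which preserves the channel count, so the following pool drops num_channels), while b1/b2 are now concatenated, so the pool after them needs an explicit num_channels=96 (32 filters from b1 plus 64 from b2).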

@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
 ################################### Data Configuration ###################################
 TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
+settings(batch_size = 128,
          learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
 data = data_layer(name ="input", size=784)
