From c1914543b0eaef98450314a1b56f4f918aa36ce2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 14:34:44 +0800 Subject: [PATCH 1/5] refine mkldnn logic, move reset buffers into MKLDNNLayer --- paddle/gserver/layers/MKLDNNConvLayer.cpp | 233 +++------------- paddle/gserver/layers/MKLDNNConvLayer.h | 66 ----- paddle/gserver/layers/MKLDNNFcLayer.cpp | 101 ++----- paddle/gserver/layers/MKLDNNFcLayer.h | 8 - paddle/gserver/layers/MKLDNNLayer.h | 324 ++++++++++++++++++---- paddle/gserver/layers/MKLDNNPoolLayer.cpp | 103 +------ paddle/gserver/layers/MKLDNNPoolLayer.h | 13 - paddle/math/MKLDNNMatrix.cpp | 2 +- paddle/math/MKLDNNMatrix.h | 14 +- 9 files changed, 358 insertions(+), 506 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 26810a6483..463e6ad0ed 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector& pipeline, resetFwdBuffers(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNConvLayer::resetBwd(std::vector& pipeline, @@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector& pipeline, resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNConvLayer::updateInputData() { - cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { @@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(pd); - resetInValue(pd, in); + resetInValue( + in, std::make_shared(pd->src_primitive_desc())); + + resetOutValue(out, pd->dst_primitive_desc()); - resetWgtBiasValue(pd, wgt, bias); + resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - resetOutValue(pd, out); + bias = nullptr; + if (biases_ == nullptr || biases_->getW() == nullptr) { + return; + } + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); } void MKLDNNConvLayer::resetFwdPipeline( @@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtInVal_) { - pipeline.push_back(*cvtInVal_); - } - if (bias) { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); } else { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); } pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } -} - -void MKLDNNConvLayer::resetInValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& in) { - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc()); - - // create buffer and reorder if input value do not match - cpuInVal_ = nullptr; - cvtInVal_ = nullptr; - - MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast(inMat); - CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr); - if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - if (dnnIn) { - if (dnnIn->getFormat() == format::nc) { - CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format"; - // create a new one with nchw format and same data - memory::dims inDims = memory::dims{bs_, ic_, 1, 1}; - dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - } - if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - 
cpuInVal_ = dnnIn; - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) { - // create new mkldnn matrix - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - in = cpuInVal_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasValue( - std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc()); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc()) - : nullptr; -} - -void MKLDNNConvLayer::resetOutValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc()); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) { - out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be empty"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - // when output is cpu device, change the mkldnn output value and make them - // share the same data. Then if next layer use inputlayer->getOuputValue() - // to achieve the input value, it will get the right data. 
- output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(out); } void MKLDNNConvLayer::resetBwdWgtPD( @@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD( loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); // create backward weight using input, output and weight value memory desc - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; CHECK(wgtVal_) << "Should have weight value"; algorithm algo = algorithm::convolution_direct; padding_kind padKind = padding_kind::zero; @@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD( memory::dims wgtDims, biasDims, strides, dilations, padL, padR; loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; // create backward data using input and output value memory desc // but using weight memory desc with any format auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, @@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(wgtPD); - resetOutGrad(wgtPD, out); + resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); - resetWgtBiasGrad(wgtPD, wgt, bias); + resetWithMatrix( + wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); + CHECK(wgtVal_ != nullptr && + wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) + << "primitive desc of weight grad and value should be equal"; - resetInGrad(dataPD, in); + bias = nullptr; + if (biases_ && biases_->getWGrad()) { + resetWithMatrix( + bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); + CHECK(bias && biasVal_ && + bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) + << "primitive desc of bias grad should equal the bias value"; + } + if (dataPD == nullptr) { + return; + } + resetInGrad(in, dataPD->diff_src_primitive_desc()); resetWgtValBwdData(dataPD, wgtValBwdData_); } @@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); - } - + CHECK(inVal_); // add bwdWgt handle if (bias) { bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias)); @@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline( if (dataPD == nullptr) { return; } - if (cvtWgtVal_) { pipeline.push_back(*cvtWgtVal_); } - // add bwdData handle CHECK(wgtValBwdData_) << "Should have weight memory"; bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); pipeline.push_back(*bwdData_); - - if (cvtInGrad_) { - pipeline.push_back(*cvtInGrad_); - } -} - -void MKLDNNConvLayer::resetOutGrad( - std::shared_ptr& wgtPD, MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_ != nullptr && - outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc()) - << "primitive desc of out grad and value should be equal"; - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); 
- // same PrimitiveDesc with cpuInVal_ - CHECK(cpuOutVal_); - cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc()); - // create reorder if primitive desc does not match - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_); - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasGrad( - std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getWGrad(), - wgtPD->diff_weights_primitive_desc()); - CHECK(nullptr != wgtVal_ && - wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) - << "primitive desc of weight grad and value should be equal"; - VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat(); - - bias = nullptr; - if (biasVal_ == nullptr) { - return; - } - bias = MKLDNNMatrix::create(biases_->getWGrad(), - wgtPD->diff_bias_primitive_desc()); - CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) - << "primitive desc of bias grad should equal the bias value"; -} - -void MKLDNNConvLayer::resetInGrad( - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in) { - in = nullptr; - cpuInGrad_ = nullptr; - cvtInGrad_ = nullptr; - if (dataPD == nullptr) { - return; - } - - if (inputIsOnlyMKLDNN()) { - MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc()); - CHECK(nullptr != inVal_ && - in->getPrimitiveDesc() == inVal_->getPrimitiveDesc()) - << "primitive desc of input grad and value should be equal"; - } else { - const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE); - // same PrimitiveDesc with cpuInVal_ - CHECK(cpuInVal_); - cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc()); - in = cpuInGrad_; - // create reorder if PrimitiveDesc does not match - if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) { - in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE), - dataPD->diff_src_primitive_desc()); - cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_); - CHECK(cvtInGrad_); - } - } } void MKLDNNConvLayer::resetWgtValBwdData( diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h index f84f2f737c..1fed0e1c65 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.h +++ b/paddle/gserver/layers/MKLDNNConvLayer.h @@ -48,17 +48,6 @@ protected: // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuInVal_; - MKLDNNMatrixPtr cpuInGrad_; - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtInVal_; - std::shared_ptr cvtInGrad_; - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // whether the weight has been init bool hasInitedWgt_; @@ -94,8 +83,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -109,26 +96,6 @@ public: << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_; } - void printValueFormatFlow() override { - if (cpuInVal_) { - VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>"; - } - MKLDNNLayer::printValueFormatFlow(); - if (cpuOutVal_) { - VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat(); - } - } - - void printGradFormatFlow() override { - if 
(cpuInGrad_) { - VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<"; - } - MKLDNNLayer::printGradFormatFlow(); - if (cpuOutGrad_) { - VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat(); - } - } - protected: /** * load the dims settings of this conv @@ -162,23 +129,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of input value - */ - void resetInValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& in); - /** - * reset MKLDNNMatrix of weight and bias value - */ - void resetWgtBiasValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of output value - */ - void resetOutValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& out); - /** * reset the backward weight primitive descriptor. */ @@ -207,22 +157,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of output grad - */ - void resetOutGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of weight and bias grad - */ - void resetWgtBiasGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of input grad - */ - void resetInGrad(std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in); /** * reset MKLDNNMatrix of weight value for backward data * since the primitive_desc would be different with wgtVal_ diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index cf19a15568..9f82a3b747 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo; wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); hasInitedWgt_ = true; } @@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto dstFmt = hasNoSpatial_ ? 
format::io : format::ihwo; wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); } @@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNFcLayer::resetBwd(std::vector& pipeline, @@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector& pipeline, resetBwdDataPD(bwdDataPD, in, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNFcLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { @@ -139,51 +131,33 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { resetInValue(in); + CHECK(in); + in->downSpatial(); - resetWgtBiasValue(wgt, bias); - - resetOutValue(out); -} + // if (extInVal_) { + // extInVal_->downSpatial(); + // } -void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } - in->downSpatial(); -} + auto outPD = + MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); + resetOutValue(out, outPD); -void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { format wgtFmt = format::oihw; - if (inVal_->getFormat() == format::nChw8c) { + if (in->getFormat() == format::nChw8c) { wgtFmt = format::oIhw8i; - } else if (inVal_->getFormat() == format::nChw16c) { + } else if (in->getFormat() == format::nChw16c) { wgtFmt = format::oIhw16i; } - wgt = MKLDNNMatrix::create( - weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_); + auto wgtPD = + MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_); + resetWithMatrix(wgt, weight_->getW(), wgtPD); wgt->downSpatial(); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? 
MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_) - : nullptr; -} -void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_); - if (!outputIsOnlyMKLDNN()) { - // fc cpu output value do not need create convert, just share data - getOutput(CPU_DEVICE).value->setData(out->getData()); + if (biases_ == nullptr || biases_->getW() == nullptr) { + return; } - output_.value = std::dynamic_pointer_cast(out); + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); } void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, @@ -219,7 +193,6 @@ void MKLDNNFcLayer::resetFwdPipeline( } else { fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out)); } - pipeline.push_back(*fwd_); } @@ -227,44 +200,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetWgtBiasGrad(wgt, bias); - - resetInGrad(in); -} - -void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - output_.grad->setData(cpuOut->getData()); - out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc()); - } -} + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); -void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { CHECK(wgtVal_); - wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); + resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); bias = nullptr; if (biasVal_ == nullptr) { return; } - bias = - MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc()); -} - -void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); } void MKLDNNFcLayer::resetBwdWgtPD( diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h index c76878aafa..ee861763ff 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.h +++ b/paddle/gserver/layers/MKLDNNFcLayer.h @@ -66,8 +66,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -84,9 +82,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr wgt, @@ -109,9 +104,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdWgtPD(std::shared_ptr& pd, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 4e2753eba2..ab59357ad0 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ 
b/paddle/gserver/layers/MKLDNNLayer.h
@@ -58,11 +58,30 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  // MKLDNNMatrixPtr with internal format
+  /// value and grad are separate as internal and external buffers.
+  /// each MKLDNNLayer must init or reset internal buffer at least,
+  /// and the external buffer format is always nchw or nc (when h==w==1),
+  /// which is the same format as paddle.
+  /// When mixed with cpu device, the output_.value and output_.grad
+  /// always save the external data.
+  /// When all layers are mkldnn layers, they could be internal data.
+  /// below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  MKLDNNMatrixPtr extInVal_;
+  MKLDNNMatrixPtr extInGrad_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::shared_ptr<mkldnn::reorder> cvtInVal_;
+  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
   MKLDNNMatrixPtr wgtVal_;
   MKLDNNMatrixPtr wgtGrad_;
   MKLDNNMatrixPtr biasVal_;
@@ -91,6 +110,7 @@ public:
         oh_(0),
         ow_(0),
         needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
         stream_(nullptr),
         fwd_(nullptr),
@@ -128,20 +148,39 @@ public:
       REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
       CHECK(!inputLayers_.empty());
       copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
+      size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
       if (inputElemenCnt_ != elemenCnt) {
         VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
         // reset when input total sizes changed, not only the batchsize
         inputElemenCnt_ = elemenCnt;
         pipelineFwd_.clear();
         reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+        // all cpu device output grad or value share output's
+        shareCPUDevice();
         resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+        // MKLDNNLayer output value should be MKLDNNMatrix,
+        // so external output value is necessary.
+        // then external input value is not necessary,
+        // since input may be mkldnn internal buffer.
+        CHECK(extOutVal_) << "external output value is necessary";
+        output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+        CHECK(inVal_ && outVal_) << "internal memories are necessary";
+        if (cvtInVal_) {
+          pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
+        }
+        if (cvtOutVal_) {
+          pipelineFwd_.push_back(*cvtOutVal_);
+        }
         convertWeightsFromPaddle();
+        printValueFormat();
         needResetBwd_ = true;
       }
 
       if (inputLayers_[0]->getType() == "data") {
-        updateInputData();
+        // Update input value data when input layer is "data" type,
+        // since the input value data address might be changed.
+        CHECK(extInVal_);
+        extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
       }
 
       if (!outputOnlyMKLDNN_) {
@@ -149,8 +188,7 @@
       }
       stream_->submit(pipelineFwd_);
     }
-
-    /* activation */ {
+    {
       REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
       forwardActivation();
     }
@@ -163,6 +201,16 @@
       pipelineMergeGrad_.clear();
       mergeGrad_ = nullptr;
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+      // external output grad is not necessary,
+      // since output may be mkldnn internal buffer or merged directly.
+      CHECK(outGrad_) << "internal output grad is necessary";
+      if (cvtOutGrad_) {
+        pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
+      }
+      if (cvtInGrad_) {
+        pipelineBwd_.push_back(*cvtInGrad_);
+      }
+      printGradFormat();
       needResetBwd_ = false;
     }
@@ -179,7 +227,6 @@
       REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
       stream_->submit(pipelineBwd_);
     }
-
     {
       REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
       updateWeights(callback);
@@ -195,7 +242,7 @@
               int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
 
   /**
-   * reset the mkldnn forward primitve and memory
+   * reset the mkldnn forward primitive and memories
    * only would be called when input size changes
    */
   virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
-   * reset the mkldnn backward primitve and memory for mkldnn fc
+   * reset the mkldnn backward primitive and memories
    * only would be called when needed
    */
   virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
-  /**
-   * Update input value data when input layer is "data" type.
-   * Since the input value data address might be changed.
-   */
-  virtual void updateInputData() {}
-
   /**
    * Update weights and biases if necessary.
    */
@@ -272,21 +313,167 @@
   }
 
   /**
-   * reset the output grad matrix from primitive desc.
-   * and reset the merge grad primitive if needed.
-   * note: when this layer has serval outputs,
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * reset nullptr if matrix or primitive desc is empty
+   */
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd) {
+    dnn = nullptr;
+    if (mat == nullptr) {
+      return;
+    }
+    dnn = MKLDNNMatrix::create(mat, pd);
+  }
+
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr) {
+    cvtInVal_ = nullptr;
+    extInVal_ = nullptr;
+    in = nullptr;
+    CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+    auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+        {bs_, ic_, ih_, iw_}, mkldnn::memory::format::nchw, engine_);
+    const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+    CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
+    if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) {
+      in = MKLDNNMatrix::create(inMat, extPD);
+    }
+    extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
+    if (in->getFormat() == mkldnn::memory::format::nc) {
+      CHECK(ih_ == 1 && iw_ == 1);
+    }
+    if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+      return;
+    }
+    // need create reorder
+    in = MKLDNNMatrix::create(nullptr, *intPD);
+    extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(inMat, extPD);
+    cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+    CHECK(cvtInVal_) << "should not be empty";
+  }
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD) {
+    cvtOutVal_ = nullptr;
+    out = MKLDNNMatrix::create(output_.value, intPD);
+    extOutVal_ = out;
+    if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+    extOutVal_ = MKLDNNMatrix::create(output_.value,
+                                      {bs_, oc_, oh_, ow_},
+                                      mkldnn::memory::format::nchw,
+                                      engine_);
+    out = MKLDNNMatrix::create(nullptr, intPD);
+    cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+    CHECK(cvtOutVal_) << "should not be empty";
+  }
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD) {
+    cvtInGrad_ = nullptr;
+    extInGrad_ = nullptr;
+    in = nullptr;
+    LayerPtr& input = inputLayers_[0];
+    if (input->getOutputGrad() == nullptr) {
+      // no need input grad
+      return;
+    }
+    CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+        << "only support input is MKLDNN layer or only have one output layer";
+    // when input is a mkldnn branch node,
+    // this layer will save input grad to an internal buffer,
+    // and the mkldnn input layer will merge them to actual prev->output_.grad
+    const MatrixPtr& inMat =
+        input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+    in = MKLDNNMatrix::create(inMat, intPD);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+        << "should have internal input value and primitive desc must equal";
+    if (inputIsOnlyMKLDNN()) {
+      return;
+    }
+
+    extInGrad_ = in;
+    if (isPaddleFormat(extInGrad_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+        << "should have external input value and the format must be nchw(nc)";
+    extInGrad_ = MKLDNNMatrix::create(inMat, extInVal_->getPrimitiveDesc());
+    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+        << "should have internal input value and primitive desc must equal";
+    in = MKLDNNMatrix::create(nullptr, intPD);
+    cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+    CHECK(cvtInGrad_);
+  }
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
    * it could not be mixed with cpu device,
    * since it can not get memory desc from cpu device.
    */
-  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
-                            mkldnn::memory::primitive_desc pd) {
-    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+  void resetOutGrad(MKLDNNMatrixPtr& out,
+                    mkldnn::memory::primitive_desc intPD) {
+    cvtOutGrad_ = nullptr;
+    extOutGrad_ = nullptr;
+    out = nullptr;
+    MatrixPtr& outMat = output_.grad;
+    out = MKLDNNMatrix::create(outMat, intPD);
+    resetMergeGrad(out);
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+    extOutGrad_ = out;
+    if (isPaddleFormat(extOutGrad_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+        << "should have external output value and the format must be nchw(nc)";
+    extOutGrad_ = MKLDNNMatrix::create(outMat, extOutVal_->getPrimitiveDesc());
+    CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+        << "should have internal output value and primitive desc must equal";
+    out = MKLDNNMatrix::create(nullptr, intPD);
+    cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+    CHECK(cvtOutGrad_);
+  }
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: do not support grads mixed with cpu device,
+   * since it can not get memory desc from cpu device.
+   */
+  virtual void resetMergeGrad(MKLDNNMatrixPtr& out) {
     mergeGrad_ = nullptr;
     pipelineMergeGrad_.clear();
-    out = MKLDNNMatrix::create(output_.grad, pd);
-    if (outputMap_.size() <= 1) {
+    if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+      // do not merge when output is not all MKLDNN or only one output
       return;
     }
+    CHECK(out) << "should have reset internal output grad";
    std::vector<float> scales(outputMap_.size(), 1.0);
    std::vector<mkldnn::memory::primitive_desc> srcPDs;
    std::vector<mkldnn::primitive::at> srcs;
@@ -309,15 +496,13 @@
    for (size_t i = 1; i < srcPDs.size(); ++i) {
      CHECK(srcPDs[0] == srcPDs[i]);
    }
-    tmpOutGrad_ = nullptr;
+    tmpOutGrad_ = out;
    tmpCvt_ = nullptr;
    if (out->getPrimitiveDesc() != srcPDs[0]) {
      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
      CHECK(tmpCvt_);
      pipelineMergeGrad_.push_back(*tmpCvt_);
-    } else {
-      tmpOutGrad_ = out;
    }
 
    auto sumPD = mkldnn::sum::primitive_desc(
        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
  }
 
-  /**
-   * reset input grad from primitive desc.
-   * this function is avaiable for input is only mkldnn
-   * or input do not care cpu device
-   */
-  virtual void resetInGrad(MKLDNNMatrixPtr& in,
-                           mkldnn::memory::primitive_desc pd) {
-    LayerPtr& input = inputLayers_[0];
-    const MatrixPtr& grad =
-        input->getOutputMapSize() > 1 ?
nullptr : input->getOutput().grad; - in = MKLDNNMatrix::create(grad, pd); - Argument& arg = input->getOutput(this->getName()); - arg.grad = std::dynamic_pointer_cast(in); - } - /** * print info about sizes */ @@ -351,22 +521,50 @@ protected: } /** - * Print the mkldnn memory format flow of value + * print the mkldnn memory format of value */ - virtual void printValueFormatFlow() { - if (inVal_ && outVal_) { - VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> " - << outVal_->getFormat(); + virtual void printValueFormat() { + if (extInVal_) { + VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> "; + } + if (inVal_) { + VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>"; + } + if (outVal_) { + VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "; + } + if (extOutVal_) { + VLOG(MKLDNN_FMTS) << extOutVal_->getFormat(); + } + if (wgtVal_) { + VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); + } + if (biasVal_) { + VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat(); } } /** - * Print the mkldnn memory format flow of grad + * print the mkldnn memory format of grad */ - virtual void printGradFormatFlow() { - if (inGrad_ && outGrad_) { - VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< " - << outGrad_->getFormat(); + virtual void printGradFormat() { + if (extInGrad_) { + VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; + } + if (inGrad_) { + VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + } + if (outGrad_) { + VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; + } + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + } + if (wgtGrad_) { + VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); + } + if (biasGrad_) { + VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat(); } } @@ -405,6 +603,19 @@ protected: void setDevice(int id) { deviceId_ = id; } private: + /** + * check the format is nchw or nc, + * which is supported by Paddle default memory layout + */ + bool isPaddleFormat(mkldnn::memory::format fmt) { + if (fmt == mkldnn::memory::format::nchw || + fmt == mkldnn::memory::format::nc) { + return true; + } else { + return false; + } + } + /** * clear all grad */ @@ -449,6 +660,19 @@ private: } } + /** + * if have cpu device, share value and grad data with output_ + */ + void shareCPUDevice() { + if (outputIsOnlyMKLDNN()) { + return; + } + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].value = output_.value; + outputOtherDevice_[i].grad = output_.grad; + } + } + /** * Check the cpu device number of outputOtherDevice_. * should have only one at most. 
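A minimal sketch (not part of the patch) of how a derived layer drives the reset helpers that this commit hoists into MKLDNNLayer, mirroring the MKLDNNConvLayer::resetFwdBuffers hunk above. The layer name MKLDNNSomeLayer and its members (fwdPD_, weight_, biases_) are hypothetical stand-ins for whatever the concrete layer defines:

    // The derived layer only chooses the internal primitive descriptors.
    // resetInValue/resetOutValue allocate the external nchw/nc buffers and
    // the cvtInVal_/cvtOutVal_ reorders when the formats differ; forward()
    // then submits the pipeline as: [cvtInVal_] -> fwd_ -> [cvtOutVal_].
    void MKLDNNSomeLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                          MKLDNNMatrixPtr& wgt,
                                          MKLDNNMatrixPtr& bias,
                                          MKLDNNMatrixPtr& out) {
      CHECK(fwdPD_);  // forward primitive desc, prepared beforehand
      resetInValue(in,
                   std::make_shared<mkldnn::memory::primitive_desc>(
                       fwdPD_->src_primitive_desc()));
      resetOutValue(out, fwdPD_->dst_primitive_desc());
      // weights and bias stay internal: wrap the Paddle parameter matrices
      resetWithMatrix(wgt, weight_->getW(), fwdPD_->weights_primitive_desc());
      bias = nullptr;
      if (biases_ && biases_->getW()) {
        resetWithMatrix(bias, biases_->getW(), fwdPD_->bias_primitive_desc());
      }
    }

The design choice visible in the hunks above: the cpu-side buffers and converters used to be duplicated per layer (cpuInVal_, cvtInVal_, and so on in the conv and pool layers); after this commit a derived layer never touches them directly.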
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp index 0e53e2d1b7..6e89260f49 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, out); resetFwdPipeline(pipeline, fwdPD_, in, out); - - printValueFormatFlow(); } void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, @@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, resetBwdPD(pd, in, out); resetBwdPipeline(pipeline, pd, in, out); - - printGradFormatFlow(); -} - -void MKLDNNPoolLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { resetInValue(in); - resetOutValue(out); -} - -void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } -} - -void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) { - CHECK(inVal_) << "Should reset input value first"; memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - out = MKLDNNMatrix::create( - output_.value, outDims, inVal_->getFormat(), engine_); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be emptry"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(outVal_); + CHECK(in); + auto outPD = + MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); + resetOutValue(out, outPD); } void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out) { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; @@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline( ? 
std::make_shared(pool_fwd(*pd, *in, *out, *workspace_)) : std::make_shared(pool_fwd(*pd, *in, *out)); pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } } void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetInGrad(in); -} -void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); - cpuOutGrad_ = MKLDNNMatrix::create( - cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_); - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_) << "should not be emptry"; - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); } void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; memory::dims padR = getPaddingR(); - CHECK(in); CHECK(out); auto bwdDesc = pool_bwd::desc(poolAlgo_, in->getMemoryDesc(), @@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline( std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); + if (pd == nullptr) { + return; } bwdData_ = diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h index 891e15a7ef..c5ec87828b 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.h +++ b/paddle/gserver/layers/MKLDNNPoolLayer.h @@ -38,13 +38,6 @@ protected: // pooling_avg or pooling_max mkldnn::algorithm poolAlgo_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ @@ -74,8 +67,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void printSizeInfo() override { MKLDNNLayer::printSizeInfo(); VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_ @@ -90,8 +81,6 @@ protected: * reset pipeline. */ void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out); @@ -106,8 +95,6 @@ protected: * reset pipeline. 
*/ void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 0778bb63b7..c606560473 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -46,7 +46,7 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::format fmt, engine& eg, mkldnn::memory::data_type dtype) { - return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg)); + return create(m, createPrimitiveDesc(dims, fmt, eg, dtype)); } std::shared_ptr MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src, diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index c843115eb9..9e3f29eb57 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -52,12 +52,24 @@ public: mkldnn::engine& eg, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + /** + * Create primitive descriptor. + * default with f32 dtype + */ + static mkldnn::memory::primitive_desc createPrimitiveDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt, + const mkldnn::engine& eg, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); + } + /** * Create Memory descriptor. * default with any format and f32 dtype */ static mkldnn::memory::desc createMemoryDesc( - const mkldnn::memory::dims& dims, + const mkldnn::memory::dims dims, const mkldnn::memory::format& fmt = mkldnn::memory::format::any, const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { return mkldnn::memory::desc(dims, dtype, fmt); From 9e38dafa29acc59347a5aee33424be7bb8bcd168 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 15:18:34 +0800 Subject: [PATCH 2/5] change MKLDNNMatrix create interface since MatrixPtr is not always required --- .../gserver/activations/MKLDNNActivation.cpp | 6 ++-- paddle/gserver/layers/MKLDNNConvLayer.cpp | 3 +- paddle/gserver/layers/MKLDNNLayer.h | 32 +++++++++---------- paddle/math/MKLDNNMatrix.cpp | 8 ++--- paddle/math/MKLDNNMatrix.h | 5 +-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp index 18c5638100..f3ccd68160 100644 --- a/paddle/gserver/activations/MKLDNNActivation.cpp +++ b/paddle/gserver/activations/MKLDNNActivation.cpp @@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) { copyInVal_ = nullptr; if (act.grad && algo == algorithm::eltwise_tanh) { // tanh need save src input for backward - inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc()); + inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc()); copyInVal_ = std::make_shared(*val_, *inVal_); CHECK(copyInVal_) << "should not be emptry"; pipelineFwd_.push_back(*copyInVal_); @@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) { algorithm algo = getAlgo(this->getName()); float alpha = getBwdAlpha(); float beta = getBeta(); - grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc()); + grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad); auto eng = CPUEngine::Instance().getEngine(); auto bwdDesc = eltwise_bwd::desc( algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta); @@ -230,7 +230,7 @@ void 
MKLDNNActivation::resetFwd(Argument& act) { int ic = cnt_ / bs / ih / iw; CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw); val_ = MKLDNNMatrix::create( - act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_); + {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value); CHECK(val_); val_->downSpatial(); } diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 463e6ad0ed..3fbfb1ab1f 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -370,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData( // since the primitive_desc would be different with wgtVal_ CHECK(wgtVal_) << "should have weight value"; if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) { - wgtValBwdData_ = - MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc()); + wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc()); cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_); CHECK(cvtWgtVal_); } else { diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index ab59357ad0..80c67529da 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -323,7 +323,7 @@ protected: if (mat == nullptr) { return; } - dnn = MKLDNNMatrix::create(mat, pd); + dnn = MKLDNNMatrix::create(pd, mat); } /** @@ -343,7 +343,7 @@ protected: in = std::dynamic_pointer_cast(inMat); CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) { - in = MKLDNNMatrix::create(inMat, extPD); + in = MKLDNNMatrix::create(extPD, inMat); } extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr; if (in->getFormat() == mkldnn::memory::format::nc) { @@ -353,8 +353,8 @@ protected: return; } // need create reorder - in = MKLDNNMatrix::create(nullptr, *intPD); - extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(inMat, extPD); + in = MKLDNNMatrix::create(*intPD); + extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat); cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); CHECK(cvtInVal_) << "should not be emptry"; } @@ -366,18 +366,18 @@ protected: void resetOutValue(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD) { cvtOutVal_ = nullptr; - out = MKLDNNMatrix::create(output_.value, intPD); + out = MKLDNNMatrix::create(intPD, output_.value); extOutVal_ = out; if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) { return; } // need create reorder CHECK_GT(bs_ * oc_ * oh_ * ow_, 0); - extOutVal_ = MKLDNNMatrix::create(output_.value, - {bs_, oc_, oh_, ow_}, + extOutVal_ = MKLDNNMatrix::create(mkldnn::memory::dims{bs_, oc_, oh_, ow_}, mkldnn::memory::format::nchw, - engine_); - out = MKLDNNMatrix::create(nullptr, intPD); + engine_, + output_.value); + out = MKLDNNMatrix::create(intPD); cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_); CHECK(cvtOutVal_) << "should not be empty"; } @@ -402,7 +402,7 @@ protected: // and the mkldnn input layer will merge them to actual prev->output_.grad const MatrixPtr& inMat = input->getOutputMapSize() <= 1 ? 
input->getOutputGrad() : nullptr; - in = MKLDNNMatrix::create(inMat, intPD); + in = MKLDNNMatrix::create(intPD, inMat); Argument& arg = input->getOutput(this->getName()); arg.grad = std::dynamic_pointer_cast(in); CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) @@ -418,10 +418,10 @@ protected: // need create reorder CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) << "should have external input value and the format must be nchw(nc)"; - extInGrad_ = MKLDNNMatrix::create(inMat, extInVal_->getPrimitiveDesc()); + extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) << "should have internal input value and primitive desc must equal"; - in = MKLDNNMatrix::create(nullptr, intPD); + in = MKLDNNMatrix::create(intPD); cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_); CHECK(cvtInGrad_); } @@ -440,7 +440,7 @@ protected: extOutGrad_ = nullptr; out = nullptr; MatrixPtr& outMat = output_.grad; - out = MKLDNNMatrix::create(outMat, intPD); + out = MKLDNNMatrix::create(intPD, outMat); resetMergeGrad(out); if (outputIsOnlyMKLDNN()) { return; @@ -453,10 +453,10 @@ protected: // need create reorder CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat())) << "should have external output value and the format must be nchw(nc)"; - extOutGrad_ = MKLDNNMatrix::create(outMat, extOutVal_->getPrimitiveDesc()); + extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat); CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD) << "should have internal output value and primitive desc must equal"; - out = MKLDNNMatrix::create(nullptr, intPD); + out = MKLDNNMatrix::create(intPD); cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out); CHECK(cvtOutGrad_); } @@ -499,7 +499,7 @@ protected: tmpOutGrad_ = out; tmpCvt_ = nullptr; if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]); + tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); CHECK(tmpCvt_); pipelineMergeGrad_.push_back(*tmpCvt_); diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index c606560473..21a8f73c3e 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -18,7 +18,7 @@ using namespace mkldnn; // NOLINT namespace paddle { -MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { +MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) { memory::desc md = pd.desc(); size_t ndims = md.data.ndims; int* dims = md.data.dims; @@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { return std::make_shared(cpuMatrix, pd); } -MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, - memory::dims dims, +MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims, memory::format fmt, engine& eg, + MatrixPtr m, mkldnn::memory::data_type dtype) { - return create(m, createPrimitiveDesc(dims, fmt, eg, dtype)); + return create(createPrimitiveDesc(dims, fmt, eg, dtype), m); } std::shared_ptr MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src, diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index 9e3f29eb57..fe755d096d 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -40,16 +40,17 @@ public: /** * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc */ - static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc 
pd); + static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, + MatrixPtr m = nullptr); /** * Create MKLDNNMatrix from a MatrixPtr and memory details info */ static MKLDNNMatrixPtr create( - MatrixPtr m, mkldnn::memory::dims dims, mkldnn::memory::format fmt, mkldnn::engine& eg, + MatrixPtr m = nullptr, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); /** From 94e442d4b14c66ba68d8e64c0f51f5bc849437dd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 16:32:11 +0800 Subject: [PATCH 3/5] add cpp file of MKLDNNLayer --- paddle/gserver/layers/MKLDNNLayer.cpp | 327 ++++++++++++++++++++++ paddle/gserver/layers/MKLDNNLayer.h | 386 ++++---------------------- 2 files changed, 379 insertions(+), 334 deletions(-) create mode 100644 paddle/gserver/layers/MKLDNNLayer.cpp diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp new file mode 100644 index 0000000000..91f0ff5bd3 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -0,0 +1,327 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLayer.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +bool MKLDNNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." + << "Please set WITH_MKLDNN=ON " + << "and set use_mkldnn=True"; + CHECK(!useGpu_) << "Do not support GPU yet"; + + // set device id before Layer::init + setDevice(MKLDNN_DEVICE); + // change param device to MKLDNN device + setParamsDevice(MKLDNN_DEVICE, parameterMap); + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + setOutputMap(); + checkCPUOutputsNumber(); + + stream_.reset(new MKLDNNStream()); + engine_ = CPUEngine::Instance().getEngine(); + return true; +} + +void MKLDNNLayer::forward(PassType passType) { + passType_ = passType; + + { + REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); + CHECK(!inputLayers_.empty()); + copySeqInfoToOutputs(); + size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt(); + if (inputElemenCnt_ != elemenCnt) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; + // reset when input total sizes changed, not only the batchsize + inputElemenCnt_ = elemenCnt; + pipelineFwd_.clear(); + reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); + // all cpu device output grad or value share output's + shareCPUDevice(); + resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); + // MKLDNNLayer output value should be MKLDNNMatrix + // so external output value is necessary. + // then external input value is not necessary, + // since input may be mkldnn internal buffer. 
+ CHECK(extOutVal_) << "external output value is necessary"; + output_.value = std::dynamic_pointer_cast(extOutVal_); + CHECK(inVal_ && outVal_) << "internal memories are necessary"; + if (cvtInVal_) { + pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_); + } + if (cvtOutVal_) { + pipelineFwd_.push_back(*cvtOutVal_); + } + convertWeightsFromPaddle(); + printSizeInfo(); + printValueFormat(); + needResetBwd_ = true; + } + + if (inputLayers_[0]->getType() == "data") { + // Update input value data when input layer is "data" type, + // since the input value data address might be changed. + CHECK(extInVal_); + extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); + } + + if (!outputOnlyMKLDNN_) { + clearGrads(); + } + stream_->submit(pipelineFwd_); + } + { + REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); + forwardActivation(); + } +} + +void MKLDNNLayer::backward(const UpdateCallback& callback) { + if (needResetBwd_) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; + pipelineBwd_.clear(); + pipelineMergeGrad_.clear(); + mergeGrad_ = nullptr; + resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); + // external output grad is not necessary + // since output may be mkldnn internal buffer or merge them directly. + CHECK(outGrad_) << "internal output grad is necessary"; + if (cvtOutGrad_) { + pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); + } + if (cvtInGrad_) { + pipelineBwd_.push_back(*cvtInGrad_); + } + printGradFormat(); + needResetBwd_ = false; + } + + // merge grad must before backward activation + if (mergeGrad_) { + REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); + stream_->submit(pipelineMergeGrad_); + } + { + REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); + backwardActivation(); + } + { + REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); + stream_->submit(pipelineBwd_); + } + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + updateWeights(callback); + } +} + +void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { + const Argument& input = inputLayers_[0]->getOutput(); + batchsize = input.getBatchSize(); + int h = input.getFrameHeight(); + int w = input.getFrameWidth(); + if (h != 0) { + height = h; + } + if (w != 0) { + width = w; + } +} + +void MKLDNNLayer::reshapeOutput(size_t height, size_t width) { + output_.setFrameHeight(height); + output_.setFrameWidth(width); + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].setFrameHeight(height); + outputOtherDevice_[i].setFrameWidth(width); + } +} + +void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn, + const MatrixPtr& mat, + memory::primitive_desc pd) { + dnn = nullptr; + if (mat == nullptr) { + return; + } + dnn = MKLDNNMatrix::create(pd, mat); +} + +void MKLDNNLayer::resetInValue( + MKLDNNMatrixPtr& in, const std::shared_ptr& intPD) { + cvtInVal_ = nullptr; + extInVal_ = nullptr; + in = nullptr; + CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); + auto extPD = MKLDNNMatrix::createPrimitiveDesc( + {bs_, ic_, ih_, iw_}, format::nchw, engine_); + const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); + in = std::dynamic_pointer_cast(inMat); + CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); + if (in == nullptr || in->getFormat() == format::nc) { + in = MKLDNNMatrix::create(extPD, inMat); + } + extInVal_ = isPaddleFormat(in->getFormat()) ? 
in : nullptr;
+  if (in->getFormat() == format::nc) {
+    CHECK(ih_ == 1 && iw_ == 1);
+  }
+  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+    return;
+  }
+  // need create reorder
+  in = MKLDNNMatrix::create(*intPD);
+  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
+  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+  CHECK(cvtInVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
+                                memory::primitive_desc intPD) {
+  cvtOutVal_ = nullptr;
+  out = MKLDNNMatrix::create(intPD, output_.value);
+  extOutVal_ = out;
+  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+  extOutVal_ = MKLDNNMatrix::create(
+      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+  CHECK(cvtOutVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
+                              memory::primitive_desc intPD) {
+  cvtInGrad_ = nullptr;
+  extInGrad_ = nullptr;
+  in = nullptr;
+  LayerPtr& input = inputLayers_[0];
+  if (input->getOutputGrad() == nullptr) {
+    // no need for input grad
+    return;
+  }
+  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+      << "only support input is MKLDNN layer or only have one output layer";
+  // when input is a mkldnn branch node,
+  // this layer will save input grad to an internal buffer,
+  // and the mkldnn input layer will merge them to actual prev->output_.grad
+  const MatrixPtr& inMat =
+      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+  in = MKLDNNMatrix::create(intPD, inMat);
+  Argument& arg = input->getOutput(this->getName());
+  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  extInGrad_ = in;
+  if (isPaddleFormat(extInGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+      << "should have external input value and the format must be nchw(nc)";
+  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  in = MKLDNNMatrix::create(intPD);
+  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+  CHECK(cvtInGrad_);
+}
+
+void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
+                               memory::primitive_desc intPD) {
+  cvtOutGrad_ = nullptr;
+  extOutGrad_ = nullptr;
+  out = nullptr;
+  MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(intPD, outMat);
+  resetMergeGrad(out);
+  if (outputIsOnlyMKLDNN()) {
+    return;
+  }
+  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+  extOutGrad_ = out;
+  if (isPaddleFormat(extOutGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+      << "should have external output value and the format must be nchw(nc)";
+  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
+  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+      << "should have internal output value and primitive desc must equal";
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+  CHECK(cvtOutGrad_);
+}
+
+void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
+  mergeGrad_ = nullptr;
+  pipelineMergeGrad_.clear();
+  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+    // do not merge when output is not all MKLDNN or only one output
+    return;
+  }
+  CHECK(out) << "should have reset internal output grad";
+  std::vector<float> scales(outputMap_.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  std::vector<primitive::at> srcs;
+  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+    MKLDNNMatrixPtr src =
+        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
+    CHECK(src) << "should be MKLDNNMatrix";
+    auto srcDims = src->getDims();
+    auto dstDims = out->getDims();
+    CHECK_EQ(srcDims.size(), dstDims.size());
+    for (size_t i = 0; i < srcDims.size(); ++i) {
+      CHECK_EQ(srcDims[i], dstDims[i]);
+    }
+    srcPDs.push_back(src->getPrimitiveDesc());
+    srcs.push_back(*src);
+  }
+
+  // TODO(TJ): remove me when mkldnn sum supports different formats
+  for (size_t i = 1; i < srcPDs.size(); ++i) {
+    CHECK(srcPDs[0] == srcPDs[i]);
+  }
+  tmpOutGrad_ = out;
+  tmpCvt_ = nullptr;
+  if (out->getPrimitiveDesc() != srcPDs[0]) {
+    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
+    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+    CHECK(tmpCvt_);
+    pipelineMergeGrad_.push_back(*tmpCvt_);
+  }
+
+  auto sumPD =
+      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 80c67529da..faad434526 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -119,119 +119,9 @@ public:
 
   ~MKLDNNLayer() {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
-    CHECK(!useGpu_) << "Do not support GPU yet";
-
-    // set device id before Layer::init
-    setDevice(MKLDNN_DEVICE);
-    // change param device to MKLDNN device
-    setParamsDevice(MKLDNN_DEVICE, parameterMap);
-    if (!Layer::init(layerMap, parameterMap)) {
-      return false;
-    }
-    setOutputMap();
-    checkCPUOutputsNumber();
-
-    stream_.reset(new MKLDNNStream());
-    engine_ = CPUEngine::Instance().getEngine();
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    passType_ = passType;
-
-    {
-      REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-      CHECK(!inputLayers_.empty());
-      copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
-      if (inputElemenCnt_ != elemenCnt) {
-        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-        // reset when input total sizes changed, not only the batchsize
-        inputElemenCnt_ = elemenCnt;
-        pipelineFwd_.clear();
-        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-        // all cpu device output grad or value share output's
-        shareCPUDevice();
-        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-        // MKLDNNLayer output value should be MKLDNNMatrix
-        // so external output value is necessary.
-        // then external input value is not necessary,
-        // since input may be mkldnn internal buffer.
- CHECK(extOutVal_) << "external output value is necessary"; - output_.value = std::dynamic_pointer_cast(extOutVal_); - CHECK(inVal_ && outVal_) << "internal memories are necessary"; - if (cvtInVal_) { - pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_); - } - if (cvtOutVal_) { - pipelineFwd_.push_back(*cvtOutVal_); - } - convertWeightsFromPaddle(); - printValueFormat(); - needResetBwd_ = true; - } - - if (inputLayers_[0]->getType() == "data") { - // Update input value data when input layer is "data" type, - // since the input value data address might be changed. - CHECK(extInVal_); - extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); - } - - if (!outputOnlyMKLDNN_) { - clearGrads(); - } - stream_->submit(pipelineFwd_); - } - { - REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); - forwardActivation(); - } - } - - void backward(const UpdateCallback& callback) override { - if (needResetBwd_) { - VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; - pipelineBwd_.clear(); - pipelineMergeGrad_.clear(); - mergeGrad_ = nullptr; - resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); - // external output grad is not necessary - // since output may be mkldnn internal buffer or merge them directly. - CHECK(outGrad_) << "internal output grad is necessary"; - if (cvtOutGrad_) { - pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); - } - if (cvtInGrad_) { - pipelineBwd_.push_back(*cvtInGrad_); - } - printGradFormat(); - needResetBwd_ = false; - } - - // merge grad must before backward activation - if (mergeGrad_) { - REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); - stream_->submit(pipelineMergeGrad_); - } - { - REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); - backwardActivation(); - } - { - REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); - stream_->submit(pipelineBwd_); - } - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - updateWeights(callback); - } - } + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; /** * reshape the input image sizes @@ -287,30 +177,12 @@ protected: /** * reshape the input image sizes and input batchsize */ - virtual void reshapeInput(int& batchsize, int& height, int& width) { - const Argument& input = inputLayers_[0]->getOutput(); - batchsize = input.getBatchSize(); - int h = input.getFrameHeight(); - int w = input.getFrameWidth(); - if (h != 0) { - height = h; - } - if (w != 0) { - width = w; - } - } + void reshapeInput(int& batchsize, int& height, int& width); /** * reshape output image sizes */ - virtual void reshapeOutput(size_t height, size_t width) { - output_.setFrameHeight(height); - output_.setFrameWidth(width); - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].setFrameHeight(height); - outputOtherDevice_[i].setFrameWidth(width); - } - } + void reshapeOutput(size_t height, size_t width); /** * reset MKLDNNMatrix from Matrix and internal primitive desc. @@ -318,13 +190,7 @@ protected: */ void resetWithMatrix(MKLDNNMatrixPtr& dnn, const MatrixPtr& mat, - mkldnn::memory::primitive_desc pd) { - dnn = nullptr; - if (mat == nullptr) { - return; - } - dnn = MKLDNNMatrix::create(pd, mat); - } + mkldnn::memory::primitive_desc pd); /** * reset input value from input MKLDNNMatrix and internal primitive desc. 
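Note: the buffer-reset helpers gathered in this header (resetInValue/resetOutValue and the grad variants) all follow one recurring pattern: keep an external buffer in Paddle's plain nchw (or nc) layout, keep an internal buffer in whatever layout the MKLDNN primitive prefers, and submit a reorder primitive only when the two primitive descriptors differ. The standalone sketch below shows just that pattern against the pre-1.0 MKLDNN C++ API this code builds on; the nChw8c internal format and the buffer names are illustrative assumptions, not taken from this patch.

    // Sketch: external(nchw) <-> internal(blocked) buffers with an optional reorder.
    #include <vector>
    #include "mkldnn.hpp"

    int main() {
      using namespace mkldnn;
      const int n = 2, c = 8, h = 4, w = 4;
      engine eng(engine::cpu, 0);

      // External buffer: Paddle's default nchw layout (plays the role of extInVal_).
      std::vector<float> extData(n * c * h * w, 1.f);
      memory::desc extMd({n, c, h, w}, memory::data_type::f32, memory::format::nchw);
      memory ext(memory::primitive_desc(extMd, eng), extData.data());

      // Internal buffer: a blocked layout a primitive might prefer (plays inVal_).
      // nChw8c is only an example; the real layout comes from the primitive desc.
      memory::desc intMd({n, c, h, w}, memory::data_type::f32, memory::format::nChw8c);
      memory internal(memory::primitive_desc(intMd, eng));

      std::vector<primitive> pipeline;
      if (internal.get_primitive_desc() != ext.get_primitive_desc()) {
        // The cvtInVal_ analogue: reorder only when the layouts differ.
        pipeline.push_back(reorder(ext, internal));
      }
      stream(stream::kind::eager).submit(pipeline).wait();
      return 0;
    }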
@@ -332,99 +198,20 @@ protected: */ void resetInValue( MKLDNNMatrixPtr& in, - const std::shared_ptr& intPD = nullptr) { - cvtInVal_ = nullptr; - extInVal_ = nullptr; - in = nullptr; - CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); - auto extPD = MKLDNNMatrix::createPrimitiveDesc( - {bs_, ic_, ih_, iw_}, mkldnn::memory::format::nchw, engine_); - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - in = std::dynamic_pointer_cast(inMat); - CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); - if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) { - in = MKLDNNMatrix::create(extPD, inMat); - } - extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr; - if (in->getFormat() == mkldnn::memory::format::nc) { - CHECK(ih_ == 1 && iw_ == 1); - } - if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) { - return; - } - // need create reorder - in = MKLDNNMatrix::create(*intPD); - extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat); - cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } + const std::shared_ptr& intPD = nullptr); /** * reset output value from internal primitive desc. * reset both internal and external buffer and create reorder if necessary. */ void resetOutValue(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc intPD) { - cvtOutVal_ = nullptr; - out = MKLDNNMatrix::create(intPD, output_.value); - extOutVal_ = out; - if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) { - return; - } - // need create reorder - CHECK_GT(bs_ * oc_ * oh_ * ow_, 0); - extOutVal_ = MKLDNNMatrix::create(mkldnn::memory::dims{bs_, oc_, oh_, ow_}, - mkldnn::memory::format::nchw, - engine_, - output_.value); - out = MKLDNNMatrix::create(intPD); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_); - CHECK(cvtOutVal_) << "should not be empty"; - } + mkldnn::memory::primitive_desc intPD); /** * reset input grad from internal primitive desc. * reset both internal and external buffer and create reorder if necessary. */ - void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD) { - cvtInGrad_ = nullptr; - extInGrad_ = nullptr; - in = nullptr; - LayerPtr& input = inputLayers_[0]; - if (input->getOutputGrad() == nullptr) { - // no need input grad - return; - } - CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1) - << "only support input is MKLDNN layer or only have one output layer"; - // when input is a mkldnn branch node, - // this layer will save input grad to a internal buffer, - // and the mkldnn input layer will merge them to actual prev->output_.grad - const MatrixPtr& inMat = - input->getOutputMapSize() <= 1 ? 
input->getOutputGrad() : nullptr; - in = MKLDNNMatrix::create(intPD, inMat); - Argument& arg = input->getOutput(this->getName()); - arg.grad = std::dynamic_pointer_cast(in); - CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) - << "should have internal input value and primitive desc must equal"; - if (inputIsOnlyMKLDNN()) { - return; - } - - extInGrad_ = in; - if (isPaddleFormat(extInGrad_->getFormat())) { - return; - } - // need create reorder - CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) - << "should have external input value and the format must be nchw(nc)"; - extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); - CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) - << "should have internal input value and primitive desc must equal"; - in = MKLDNNMatrix::create(intPD); - cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_); - CHECK(cvtInGrad_); - } + void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD); /** * reset output grad from internal primitive desc. @@ -434,81 +221,59 @@ protected: * it could not be mixed with cpu device, * since it can not get memory desc from cpu device. */ - void resetOutGrad(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc intPD) { - cvtOutGrad_ = nullptr; - extOutGrad_ = nullptr; - out = nullptr; - MatrixPtr& outMat = output_.grad; - out = MKLDNNMatrix::create(intPD, outMat); - resetMergeGrad(out); - if (outputIsOnlyMKLDNN()) { - return; - } - CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device"; - extOutGrad_ = out; - if (isPaddleFormat(extOutGrad_->getFormat())) { - return; - } - // need create reorder - CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat())) - << "should have external output value and the format must be nchw(nc)"; - extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat); - CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD) - << "should have internal output value and primitive desc must equal"; - out = MKLDNNMatrix::create(intPD); - cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out); - CHECK(cvtOutGrad_); - } + void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD); /** * reset the merge grad primitive if necessary. * note: do not support the grads are mixed with cpu device, * since it can not get memory desc from cpu device. */ - virtual void resetMergeGrad(MKLDNNMatrixPtr& out) { - mergeGrad_ = nullptr; - pipelineMergeGrad_.clear(); - if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) { - // do not merge when output is not all MKLDNN or only one output - return; - } - CHECK(out) << "should have reset internal ouput grad"; - std::vector scales(outputMap_.size(), 1.0); - std::vector srcPDs; - std::vector srcs; - for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { - MKLDNNMatrixPtr src = - std::dynamic_pointer_cast(it->second->grad); - VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first; - CHECK(src) << "should be MKLDNNMatrix"; - auto srcDims = src->getDims(); - auto dstDims = out->getDims(); - CHECK_EQ(srcDims.size(), dstDims.size()); - for (size_t i = 0; i < srcDims.size(); ++i) { - CHECK_EQ(srcDims[i], dstDims[i]); - } - srcPDs.push_back(src->getPrimitiveDesc()); - srcs.push_back(*src); - } + void resetMergeGrad(MKLDNNMatrixPtr& out); + +protected: + /** + * Set deviceId of this layer. 
+ */ + void setDevice(int id) { deviceId_ = id; } - // TODO(TJ): remove me when mkldnn sum support different formats - for (size_t i = 1; i < srcPDs.size(); ++i) { - CHECK(srcPDs[0] == srcPDs[i]); + /** + * check the format is nchw or nc, + * which is supported by Paddle default memory layout + */ + bool isPaddleFormat(mkldnn::memory::format fmt) { + if (fmt == mkldnn::memory::format::nchw || + fmt == mkldnn::memory::format::nc) { + return true; + } else { + return false; } - tmpOutGrad_ = out; - tmpCvt_ = nullptr; - if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); - tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); - CHECK(tmpCvt_); - pipelineMergeGrad_.push_back(*tmpCvt_); + } + + /** + * If input only has MKLDNN device. + * Otherwise, only support the previous layer using CPU device. + */ + bool inputIsOnlyMKLDNN(int index = 0) { + int prevDevice = getPrev(index)->getDeviceId(); + if (prevDevice == MKLDNN_DEVICE) { + return true; + } else { + CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; + return false; } + } - auto sumPD = mkldnn::sum::primitive_desc( - tmpOutGrad_->getMemoryDesc(), scales, srcPDs); - mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_)); - pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); + /** + * If output only has MKLDNN device. + * Otherwise, other devices should only using CPU device. + */ + bool outputIsOnlyMKLDNN() { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) + << "Only support other device is CPU yet"; + } + outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; + return outputOnlyMKLDNN_; } /** @@ -568,54 +333,7 @@ protected: } } -protected: - /** - * If input only has MKLDNN device. - * Otherwise, only support the previous layer using CPU device. - */ - bool inputIsOnlyMKLDNN(int index = 0) { - int prevDevice = getPrev(index)->getDeviceId(); - if (prevDevice == MKLDNN_DEVICE) { - return true; - } else { - // do not support GPU yet - CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; - return false; - } - } - - /** - * If output only has MKLDNN device. - * Otherwise, other devices should only using CPU device. - */ - bool outputIsOnlyMKLDNN() { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) - << "Only support other device is CPU yet"; - } - outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; - return outputOnlyMKLDNN_; - } - - /** - * Set deviceId of this layer. 
- */ - void setDevice(int id) { deviceId_ = id; } - private: - /** - * check the format is nchw or nc, - * which is supported by Paddle default memory layout - */ - bool isPaddleFormat(mkldnn::memory::format fmt) { - if (fmt == mkldnn::memory::format::nchw || - fmt == mkldnn::memory::format::nc) { - return true; - } else { - return false; - } - } - /** * clear all grad */ From d75b00c2210c247bcf626bf3239fd6f7dc115e49 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 17:18:22 +0800 Subject: [PATCH 4/5] refine the gtest log info and vlog order, and change the size of test to make unit test faster refine comment and log of mkldnnlayer --- paddle/gserver/layers/MKLDNNBase.h | 4 +- paddle/gserver/layers/MKLDNNLayer.cpp | 7 ++- paddle/gserver/layers/MKLDNNLayer.h | 8 ++-- paddle/gserver/tests/MKLDNNTester.cpp | 44 ++++++++++--------- paddle/gserver/tests/MKLDNNTester.h | 8 +--- .../sample_trainer_config_branch_net.conf | 26 +++++------ .../sample_trainer_config_simple_net.conf | 2 +- 7 files changed, 52 insertions(+), 47 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h index 4c0234e7b3..af02a37cad 100644 --- a/paddle/gserver/layers/MKLDNNBase.h +++ b/paddle/gserver/layers/MKLDNNBase.h @@ -21,8 +21,8 @@ namespace paddle { typedef enum { MKLDNN_BASE = 1, // basical info of MKLDNN MKLDNN_TESTS = 1, // gtest info of MKLDNN - MKLDNN_SIZES = 2, // size info of MKLDNN - MKLDNN_FMTS = 3, // format info of MKLDNN + MKLDNN_FMTS = 2, // format info of MKLDNN + MKLDNN_SIZES = 3, // size info of MKLDNN MKLDNN_ALL = 4, // show all info of MKLDNN } MKLDNN_LOG_LEVEL; diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index 91f0ff5bd3..f4968c4af3 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -105,6 +105,10 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { // external output grad is not necessary // since output may be mkldnn internal buffer or merge them directly. CHECK(outGrad_) << "internal output grad is necessary"; + if (extOutGrad_) { + CHECK_EQ(extOutGrad_->getData(), output_.grad->getData()) + << "the external buffer should share the same data with output_.grad"; + } if (cvtOutGrad_) { pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); } @@ -293,7 +297,6 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { MKLDNNMatrixPtr src = std::dynamic_pointer_cast(it->second->grad); - VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first; CHECK(src) << "should be MKLDNNMatrix"; auto srcDims = src->getDims(); auto dstDims = out->getDims(); @@ -301,6 +304,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { for (size_t i = 0; i < srcDims.size(); ++i) { CHECK_EQ(srcDims[i], dstDims[i]); } + VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first + << ", format " << src->getFormat(); srcPDs.push_back(src->getPrimitiveDesc()); srcs.push_back(*src); } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index faad434526..656b5ee2d7 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -58,13 +58,13 @@ protected: std::vector pipelineFwd_; std::vector pipelineBwd_; - /// value and grad are seperate as internal and external buffers. + /// value and grad are seperated as internal and external buffers. 
 /// each MKLDNNLayer must init or reset internal buffer at least,
 /// and the external buffer format is always nchw of nc(when h==w==1),
 /// which is the same format as paddle.
-  /// When mixed with cpu device, the output_.value and output_.grad
-  /// always save the external data.
-  /// When all layers are all mkldnn layers, they could be internal data.
+  /// The output_.value and output_.grad always save the external data,
+  /// when mixed with cpu device.
+  /// When all layers are mkldnn layers, they could save internal data.
   /// below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index 3bf6a9e176..0a19fe2333 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
     parameters_[REF][i]->randomize();
     dnnValue->copyFrom(*refValue);
 
-    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
     printVector(dnnValue);
   }
 }
@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
     dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
     dataLayers_[DNN][i]->getOutputValue()->copyFrom(
         *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(lvl_) << "Input " << i << " data:";
+    VLOG(MKLDNN_TESTS) << "Random Forward, InputValue " << i;
     printMatrix(dataLayers_[REF][i]->getOutputValue());
   }
 }
@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
   refLayer_->getOutputGrad()->randomizeUniform();
   dnnLayer_->getOutput(CPU_DEVICE)
       .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(lvl_) << "Random Backward Input, TopDiff: ";
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
   printMatrix(refLayer_->getOutputGrad());
 }
 
 void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_ALL) << "Check Forward";
+  VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
   double delta =
       compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
 }
 
 void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_ALL) << "Check Backward Data";
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
   // TODO(TJ): uncomment me when batch norm ready
   // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
   for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
     const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
     const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
     printMatrix(dnnDiff);
-    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
     double delta = compareMatrix(dnnDiff, refDiff);
@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
 }
 
 void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_ALL) << "Check Backward Weight";
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   vector<VectorPtr> dnnWgts;  // used to temporarily save mkldnn weights
   saveWgt(parameters_[DNN], dnnWgts);
@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
   for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
     const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
     const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value "
+                     << parameters_[DNN][i]->getName();
     printVector(dnn);
-    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
     printVector(ref);
 
     double delta = compareVector(dnn, ref);
@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
   }
 
   for (int n = 0; n < NUM; ++n) {
-    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
     printMatrix(testLayers_[n]->getOutputValue());
   }
 }
@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
 
   std::ostringstream ostr;
   m->print(ostr);
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 void MKLDNNTester::printVector(const VectorPtr& v) {
@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
 
   std::ostringstream ostr;
   v->print(ostr, v->getSize());
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 double MKLDNNTester::getDelta(const real* d1,
@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
   UpdateCallback updateCallback = [](Parameter* para) {
     auto& grad = para->getBuf(PARAMETER_GRADIENT);
     auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-3;
+    real lr = 1e-2;
     value->add(*grad, lr);
     grad->zeroMem();
   };
@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
                        size_t batchSize,
                        size_t inputImgH,
                        size_t inputImgW,
+                       bool printDetails,
                        size_t iter,
-                       float epsilon,
-                       bool log,
-                       int level) {
+                       float epsilon) {
   CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
         dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
       << "should be MKLDNN layer or MKLDNN activation";
@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
 
   ih_ = inputImgH;
   iw_ = inputImgW;
+  log_ = printDetails;
   iter_ = iter;
   eps_ = epsilon;
-  log_ = log;
-  lvl_ = level;
 
   // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
 
 void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
   CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
   CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
   for (size_t i = 0; i < ref.outValues.size(); i++) {
     EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
   }
+  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.paraValues.size();
   for (size_t i = 0; i < ref.paraValues.size(); i++) {
     EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
   }
@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
                                    float eps) {
   DataIn in;
   initArgument(in, configPath, iter);
-
   DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "running cpu network";
   getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "running mkldnn network";
   getOutResult(configPath, in, outDnn, true, iter);
 
   compareResult(outCpu, outDnn, eps);
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 51abfcb67e..c385d1c727 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -58,8 +58,6 @@ protected:
   size_t iter_;
   /// whether to print out the details
   bool log_;
-  /// vlog level to print the matrix details
datas - int lvl_; /// epsilon float eps_; /// input image size, default 1 @@ -70,7 +68,6 @@ public: iter_ = iter; eps_ = epsilon; log_ = false; - lvl_ = MKLDNN_ALL; } ~MKLDNNTester() {} @@ -81,10 +78,9 @@ public: size_t batchSize, size_t inputImgH = 1, size_t inputImgW = 1, + bool printDetails = false, size_t iter = 3, - float epsilon = 1e-4, - bool log = false, - int level = MKLDNN_ALL); + float epsilon = 1e-4); static void runBranchesTest(const std::string& configPath, size_t iter = 3, float eps = 1e-4); diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf index c2594bc13c..a073708a18 100644 --- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf +++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf @@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -settings(batch_size = 256, +settings(batch_size = 128, learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### data = data_layer(name ="input", size=784) @@ -44,10 +44,11 @@ a2 = img_conv_layer(input=tmp, shared_biases=True, act=ReluActivation()) -tmp = concat_layer(input=[a1, a2]) +tmp = addto_layer(input=[a1, a2], + act=ReluActivation(), + bias_attr=False) tmp = img_pool_layer(input=tmp, - num_channels=64, pool_size=3, stride=2, padding=1, @@ -55,35 +56,34 @@ tmp = img_pool_layer(input=tmp, b1 = img_conv_layer(input=tmp, filter_size=3, - num_filters=64, + num_filters=32, padding=1, shared_biases=True, act=ReluActivation()) b1 = img_pool_layer(input=b1, pool_size=3, - stride=1, - padding=1, + stride=2, + padding=0, pool_type=MaxPooling()) b2 = img_conv_layer(input=tmp, - filter_size=5, + filter_size=3, num_filters=64, - padding=2, + padding=1, shared_biases=True, act=ReluActivation()) b2 = img_pool_layer(input=b2, pool_size=5, - stride=1, - padding=2, + stride=2, + padding=1, pool_type=MaxPooling()) -tmp = addto_layer(input=[b1, b2], - act=ReluActivation(), - bias_attr=False) +tmp = concat_layer(input=[b1, b2]) tmp = img_pool_layer(input=tmp, + num_channels=96, pool_size=3, stride=2, padding=1, diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf index 77f7816153..2ba71884d0 100644 --- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf +++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf @@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import * ################################### Data Configuration ################################### TrainData(ProtoData(files = "trainer/tests/mnist.list")) ################################### Algorithm Configuration ################################### -settings(batch_size = 1000, +settings(batch_size = 128, learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) ################################### Network Configuration ################################### data = data_layer(name ="input", size=784) From 5c892db64ca22200e9d245da3ff72d1dfca3738d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 20 Oct 2017 21:43:54 +0800 Subject: [PATCH 5/5] remove unused code refine comments and bias fix typo and todo --- 
paddle/gserver/layers/MKLDNNConvLayer.cpp | 8 ++--- paddle/gserver/layers/MKLDNNFcLayer.cpp | 21 ++++++------ paddle/gserver/layers/MKLDNNLayer.cpp | 7 ++-- paddle/gserver/layers/MKLDNNLayer.h | 39 ++++++++++++----------- 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 3fbfb1ab1f..83f4e4e615 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -210,11 +210,11 @@ void MKLDNNConvLayer::resetFwdBuffers( resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - bias = nullptr; - if (biases_ == nullptr || biases_->getW() == nullptr) { - return; + if (biases_ && biases_->getW()) { + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); + } else { + bias = nullptr; } - resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); } void MKLDNNConvLayer::resetFwdPipeline( diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index 9f82a3b747..d82063a713 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -134,10 +134,6 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, CHECK(in); in->downSpatial(); - // if (extInVal_) { - // extInVal_->downSpatial(); - // } - auto outPD = MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); resetOutValue(out, outPD); @@ -153,11 +149,12 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, resetWithMatrix(wgt, weight_->getW(), wgtPD); wgt->downSpatial(); - if (biases_ == nullptr || biases_->getW() == nullptr) { - return; + if (biases_ && biases_->getW()) { + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); + } else { + bias = nullptr; } - auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); - resetWithMatrix(bias, biases_->getW(), biasPD); } void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, @@ -207,11 +204,11 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, CHECK(wgtVal_); resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); - bias = nullptr; - if (biasVal_ == nullptr) { - return; + if (biasVal_) { + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); + } else { + bias = nullptr; } - resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); } void MKLDNNFcLayer::resetBwdWgtPD( diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp index f4968c4af3..6bb19976b5 100644 --- a/paddle/gserver/layers/MKLDNNLayer.cpp +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -60,7 +60,7 @@ void MKLDNNLayer::forward(PassType passType) { resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); // MKLDNNLayer output value should be MKLDNNMatrix // so external output value is necessary. - // then external input value is not necessary, + // Then external input value is not necessary, // since input may be mkldnn internal buffer. 
 CHECK(extOutVal_) << "external output value is necessary";
   output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
@@ -235,8 +235,8 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
   in = MKLDNNMatrix::create(intPD, inMat);
   Argument& arg = input->getOutput(this->getName());
   arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
-      << "should have internal input value and primitive desc must equal";
+  CHECK(inVal_);
+  CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must be equal";
   if (inputIsOnlyMKLDNN()) {
     return;
   }
@@ -246,6 +246,7 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
     return;
   }
   // need create reorder
+  // TODO(TJ): add macro definition to simplify it
   CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
   extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 656b5ee2d7..9b54c95b55 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -58,14 +58,15 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  /// value and grad are seperated as internal and external buffers.
-  /// each MKLDNNLayer must init or reset internal buffer at least,
-  /// and the external buffer format is always nchw of nc(when h==w==1),
-  /// which is the same format as paddle.
-  /// The output_.value and output_.grad always save the external data,
-  /// when mixed with cpu device.
-  /// When all layers are mkldnn layers, they could save internal data.
-  /// below MKLDNNMatrix buffers are all internal buffers
+  /* Value and grad are separated as internal and external buffers.
+   * Each MKLDNNLayer must init or reset internal buffer at least,
+   * and the external buffer format is always nchw or nc(when h==w==1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data,
+   * when mixed with cpu device.
+   * When all layers are mkldnn layers, they could save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
@@ -120,8 +121,8 @@ public:
   ~MKLDNNLayer() {}
 
   virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
 
   /**
    * reshape the input image sizes
@@ -217,7 +218,7 @@ protected:
    * reset output grad from internal primitive desc.
    * merge grad if necessary.
    * reset both internal and external buffer and create reorder if necessary.
-   * note: about merge grad, when this layer has serval outputs,
+   * note: about merge grad, when this layer has several outputs,
    * it could not be mixed with cpu device,
    * since it can not get memory desc from cpu device.
    */
@@ -225,7 +226,7 @@
   /**
    * reset the merge grad primitive if necessary.
-   * note: do not support the grads are mixed with cpu device,
+   * note: do not support the grads mixed with cpu device,
    * since it can not get memory desc from cpu device.
*/ void resetMergeGrad(MKLDNNMatrixPtr& out); @@ -313,17 +314,17 @@ protected: * print the mkldnn memory format of grad */ virtual void printGradFormat() { - if (extInGrad_) { - VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; - } - if (inGrad_) { - VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); } if (outGrad_) { VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; } - if (extOutGrad_) { - VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + if (inGrad_) { + VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + } + if (extInGrad_) { + VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; } if (wgtGrad_) { VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
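Note: resetMergeGrad() above accumulates the output grads of several branch layers with MKLDNN's sum primitive, first reordering into a temporary buffer when the destination desc differs from the sources. The following standalone sketch shows just the sum step against the pre-1.0 MKLDNN C++ API; the two-source setup is an illustration, and the scale vector's element type is an assumption (it changed across pre-1.0 releases).

    // Sketch: merge two branch gradients with the sum primitive (resetMergeGrad analogue).
    #include <vector>
    #include "mkldnn.hpp"

    int main() {
      using namespace mkldnn;
      const int n = 2, c = 8, h = 4, w = 4;
      engine eng(engine::cpu, 0);
      memory::desc md({n, c, h, w}, memory::data_type::f32, memory::format::nchw);
      memory::primitive_desc pd(md, eng);

      // Two branch gradients and the merged destination (the outGrad_ analogue).
      std::vector<float> g0(n * c * h * w, 1.f), g1(n * c * h * w, 2.f);
      std::vector<float> dst(n * c * h * w, 0.f);
      memory src0(pd, g0.data()), src1(pd, g1.data()), out(pd, dst.data());

      std::vector<float> scales(2, 1.f);  // plain addition; element type assumed
      std::vector<memory::primitive_desc> srcPDs(2, pd);
      auto sumPD = sum::primitive_desc(md, scales, srcPDs);

      std::vector<primitive::at> inputs;
      inputs.push_back(src0);
      inputs.push_back(src1);
      sum mergeGrad(sumPD, inputs, out);

      std::vector<primitive> pipeline(1, mergeGrad);
      stream(stream::kind::eager).submit(pipeline).wait();
      // dst now holds the elementwise total g0 + g1.
      return 0;
    }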