|
|
@ -77,6 +77,24 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
|
|
|
|
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
|
|
|
|
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void MKLDNNFcLayer::convertOutputToOtherDevice() {
|
|
|
|
|
|
|
|
copyOutputInfoToOtherDevice();
|
|
|
|
|
|
|
|
// find other cpu device and reorder output to cpu device
|
|
|
|
|
|
|
|
int cnt = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
|
|
|
|
|
|
|
|
if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
|
|
|
|
|
|
|
|
// fc cpu output value do not need convert
|
|
|
|
|
|
|
|
// just share point
|
|
|
|
|
|
|
|
outputOtherDevice_[i].value = output_.value;
|
|
|
|
|
|
|
|
++cnt;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (cnt > 1) {
|
|
|
|
|
|
|
|
LOG(WARNING) << "should not have more than one CPU devie";
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void MKLDNNFcLayer::reshape() {
|
|
|
|
void MKLDNNFcLayer::reshape() {
|
|
|
|
const Argument& input = getInput(0, getPrev(0)->getDeviceId());
|
|
|
|
const Argument& input = getInput(0, getPrev(0)->getDeviceId());
|
|
|
|
int batchSize = input.getBatchSize();
|
|
|
|
int batchSize = input.getBatchSize();
|
|
|
@ -116,7 +134,7 @@ void MKLDNNFcLayer::resetFwd() {
|
|
|
|
const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
|
|
|
|
const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
|
|
|
|
const MatrixPtr& out = output_.value;
|
|
|
|
const MatrixPtr& out = output_.value;
|
|
|
|
|
|
|
|
|
|
|
|
if (prevIsMKLDNN()) {
|
|
|
|
if (prevIsOnlyMKLDNN()) {
|
|
|
|
const MatrixPtr& in = getInputValue(0);
|
|
|
|
const MatrixPtr& in = getInputValue(0);
|
|
|
|
inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
|
|
|
|
inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
|
|
|
|
CHECK(inVal_) << "Input should be MKLDNNMatrix";
|
|
|
|
CHECK(inVal_) << "Input should be MKLDNNMatrix";
|
|
|
@ -136,30 +154,21 @@ void MKLDNNFcLayer::resetFwd() {
|
|
|
|
|
|
|
|
|
|
|
|
// change original output value to mkldnn output value
|
|
|
|
// change original output value to mkldnn output value
|
|
|
|
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
|
|
|
|
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
|
|
|
|
if (!nextIsMKLDNN()) {
|
|
|
|
if (!nextIsOnlyMKLDNN()) {
|
|
|
|
Argument cpuOutput;
|
|
|
|
convertOutputToOtherDevice();
|
|
|
|
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
|
|
|
|
|
|
|
|
if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
|
|
|
|
|
|
|
|
cpuOutput = outputOtherDevice_[i];
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
cpuOutput.setFrameHeight(output_.getFrameHeight());
|
|
|
|
|
|
|
|
cpuOutput.setFrameWidth(output_.getFrameWidth());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// fc cpu output value do not need convert
|
|
|
|
|
|
|
|
cpuOutput.value = output_.value;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// create forward handle
|
|
|
|
// create forward handle
|
|
|
|
prop_kind pk = prop_kind::forward;
|
|
|
|
prop_kind pk = prop_kind::forward;
|
|
|
|
fc_fwd::desc fwdDesc =
|
|
|
|
fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
|
|
|
|
hasBias ? fc_fwd::desc(pk,
|
|
|
|
inVal_->getMemoryDesc(),
|
|
|
|
inVal_->getMD(),
|
|
|
|
wgtVal_->getMemoryDesc(),
|
|
|
|
wgtVal_->getMD(),
|
|
|
|
biasVal_->getMemoryDesc(),
|
|
|
|
biasVal_->getMD(),
|
|
|
|
outVal_->getMemoryDesc())
|
|
|
|
outVal_->getMD())
|
|
|
|
: fc_fwd::desc(pk,
|
|
|
|
: fc_fwd::desc(
|
|
|
|
inVal_->getMemoryDesc(),
|
|
|
|
pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
|
|
|
|
wgtVal_->getMemoryDesc(),
|
|
|
|
|
|
|
|
outVal_->getMemoryDesc());
|
|
|
|
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
|
|
|
|
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
|
|
|
|
if (hasBias) {
|
|
|
|
if (hasBias) {
|
|
|
|
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
|
|
|
|
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
|
|
|
@ -184,36 +193,38 @@ void MKLDNNFcLayer::resetBwd() {
|
|
|
|
const MatrixPtr& wgt = weight_->getWGrad();
|
|
|
|
const MatrixPtr& wgt = weight_->getWGrad();
|
|
|
|
const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
|
|
|
|
const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
|
|
|
|
|
|
|
|
|
|
|
|
// TODO(TJ): merge topdiffs
|
|
|
|
// TODO(TJ): merge outgrad
|
|
|
|
if (nextIsMKLDNN()) {
|
|
|
|
if (nextIsOnlyMKLDNN()) {
|
|
|
|
// can not directly cast outputgrad to mkldnnmatrix,
|
|
|
|
// can not directly cast outputgrad to mkldnnmatrix,
|
|
|
|
// since each layer can not write the inputgrad to mkldnn inputgrad.
|
|
|
|
// since each layer can not write the inputgrad to mkldnn inputgrad.
|
|
|
|
// So just create from matrix with outputvalue format.
|
|
|
|
// So just create from matrix with outputvalue format.
|
|
|
|
const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
|
|
|
|
const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
|
|
|
|
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
|
|
|
|
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
|
|
|
|
const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
|
|
|
|
// fc do not need to convert from cpu device since output always nc
|
|
|
|
// fc do not need to convert from cpu device since output always nc
|
|
|
|
// only need create from cpu device
|
|
|
|
// only need create from cpu device
|
|
|
|
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
|
|
|
|
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
|
|
|
|
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
|
|
|
|
biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
|
|
|
|
biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
|
|
|
|
|
|
|
|
: nullptr;
|
|
|
|
|
|
|
|
|
|
|
|
// create memory primitive desc
|
|
|
|
// create memory primitive desc
|
|
|
|
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
|
|
|
|
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
|
|
|
|
inVal_->getMD(),
|
|
|
|
inVal_->getMemoryDesc(),
|
|
|
|
wgtGrad_->getMD(),
|
|
|
|
wgtGrad_->getMemoryDesc(),
|
|
|
|
outGrad_->getMD());
|
|
|
|
outGrad_->getMemoryDesc());
|
|
|
|
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
|
|
|
|
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
|
|
|
|
fc_bwdWgt::desc bwdWgtDesc =
|
|
|
|
fc_bwdWgt::desc bwdWgtDesc = hasBias
|
|
|
|
hasBias ? fc_bwdWgt::desc(inVal_->getMD(),
|
|
|
|
? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
|
|
|
|
wgtGrad_->getMD(),
|
|
|
|
wgtGrad_->getMemoryDesc(),
|
|
|
|
biasGrad_->getMD(),
|
|
|
|
biasGrad_->getMemoryDesc(),
|
|
|
|
outGrad_->getMD())
|
|
|
|
outGrad_->getMemoryDesc())
|
|
|
|
: fc_bwdWgt::desc(
|
|
|
|
: fc_bwdWgt::desc(inVal_->getMemoryDesc(),
|
|
|
|
inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
|
|
|
|
wgtGrad_->getMemoryDesc(),
|
|
|
|
|
|
|
|
outGrad_->getMemoryDesc());
|
|
|
|
fc_bwdWgt::primitive_desc bwdWgtPD =
|
|
|
|
fc_bwdWgt::primitive_desc bwdWgtPD =
|
|
|
|
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
|
|
|
|
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
|
|
|
|
|
|
|
|
|
|
|
@ -227,30 +238,20 @@ void MKLDNNFcLayer::resetBwd() {
|
|
|
|
pipelineBwd_.push_back(*bwdWgt_);
|
|
|
|
pipelineBwd_.push_back(*bwdWgt_);
|
|
|
|
|
|
|
|
|
|
|
|
/// backward data
|
|
|
|
/// backward data
|
|
|
|
if (prevIsMKLDNN()) {
|
|
|
|
int device = prevIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
|
|
|
|
const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
|
|
|
|
const MatrixPtr& in = getInputGrad(0, device);
|
|
|
|
if (in == nullptr) {
|
|
|
|
if (in == nullptr) {
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
|
|
|
|
if (getInput(0, device).getAllCount() > 1) {
|
|
|
|
// TODO(TJ): use outputMaps_ ways when merge topdiff done
|
|
|
|
// TODO(TJ): use outputMaps_ ways when merge outgrad done
|
|
|
|
} else {
|
|
|
|
|
|
|
|
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
|
|
|
|
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
|
|
|
|
if (in == nullptr) {
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
|
|
|
|
|
|
|
|
// TODO(TJ): use outputMaps_ ways when merge topdiff done
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fc_bwdData::desc bwdDataDesc =
|
|
|
|
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
|
|
|
|
fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
|
|
|
|
wgtGrad_->getMemoryDesc(),
|
|
|
|
|
|
|
|
outGrad_->getMemoryDesc());
|
|
|
|
fc_bwdData::primitive_desc bwdDataPD =
|
|
|
|
fc_bwdData::primitive_desc bwdDataPD =
|
|
|
|
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
|
|
|
|
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
|
|
|
|
|
|
|
|
|
|
|
|