/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CrossEntropyOverBeam.h"

namespace paddle {

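/*
 * Walk through the beam expansions step by step and count how many of
 * them are valid, i.e., how many steps the gold sequence stays in the
 * beam. goldRowIds_[i] / goldColIds_[i] record the row and column of
 * the gold prefix inside the i-th candidate matrix; once the gold token
 * is not found in the current beam (it "falls off" the beam), the scan
 * stops early and goldAsExtraPath_ stays true.
 */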
void CostForOneSequence::calValidExpandStep() {
  validExpansionCount_ = 0;
  goldAsExtraPath_ = true;

  for (size_t i = 0; i < beams_->expansionCount; ++i) {
    real gold = static_cast<real>(beams_->gold[i]);
    if (i) {
      real* start = beams_->candidateIds[i - 1]->getData();
      goldRowIds_[i] = std::count_if(
          start,
          start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
          [](const real& val) { return val != -1.; });
    } else {
      goldRowIds_[i] = 0;
    }

    real* start =
        beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
    real* findEnd = std::find(start, start + beamSize_, gold);
    validExpansionCount_++;

    if (start + beamSize_ == findEnd) return;
    goldColIds_[i] = findEnd - start;
  }
  if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
}

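/*
 * Initialize bookkeeping for the last valid expansion: count all
 * candidate paths that survive in the final beam and, if the gold
 * sequence fell off the beam, reserve one extra slot for it at the
 * end. Returns the total number of paths in the final expansion.
 */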
size_t CostForOneSequence::initLastExpansion() {
  int beamId = validExpansionCount_ - 1;
  const MatrixPtr candidates = beams_->candidateIds[beamId];
  size_t height = candidates->getHeight();

  /* initialize the last expansion. */
  size_t pathCount = std::count_if(candidates->getData(),
                                   candidates->getData() + height * beamSize_,
                                   [](const real& val) { return val != -1; });
  /*
   * if the gold sequence falls off the beam during search, add the gold
   * sequence as the last path into all the expanded candidates.
   */
  if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++;

  pathRowIdsInEachBeam_.clear();
  pathRowIdsInEachBeam_.resize(validExpansionCount_,
                               std::vector<int>(pathCount, 0));
  parentIdsInBeam_.clear();
  parentIdsInBeam_.resize(pathCount, 0);

  if (goldAsExtraPath_) {
    /* add gold sequence into the total expansion. */
    pathRowIdsInEachBeam_[beamId].back() =
        beams_->gold[beamId] +
        getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]);
    parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1];
  } else {
    size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId];
    goldIdsInFinalExpansion_ =
        std::count_if(candidates->getData(),
                      candidates->getData() + goldOffset,
                      [](const real& val) { return val != -1.; });
  }

  /*
   * TODO(caoying): fix this, store the indices of selected candidate
   * paths into Argument.ids
   */
  real* ids = candidates->getData();
  size_t curIdx = 0;
  for (size_t i = 0; i < height; ++i) {
    int basePos = getSeqStartPos(beamId, i);
    for (size_t j = 0; j < beamSize_; ++j) {
      int id = ids[i * beamSize_ + j];
      if (id == -1) continue;
      pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos;
      parentIdsInBeam_[curIdx++] = i;
    }
  }
  return pathCount;
}

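/*
 * Trace every surviving path backwards through the earlier expansions,
 * turning the per-beam candidate ids into global row indices
 * (pathRowIdsInEachBeam_) so that each path's per-step scores can later
 * be gathered from the flat score matrices.
 */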
void CostForOneSequence::constructTotalExpansion() {
  /*
   * construct the entire expanded beam, beginning with the last search
   * step in which the gold sequence falls off the beam.
   */
  size_t totalPathCount = initLastExpansion();

  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
    const MatrixPtr candidates = beams_->candidateIds[beamId];
    real* ids = candidates->getData();

    int lastParentIdInBeam = -1;
    int basePos = -1;
    for (size_t i = 0;
         i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount);
         ++i) {
      int id = ids[parentIdsInBeam_[i]];
      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
      if (parentIdsInBeam_[i] != lastParentIdInBeam)
        basePos = getSeqStartPos(beamId, parentRowId);

      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
      lastParentIdInBeam = parentIdsInBeam_[i];
      parentIdsInBeam_[i] = parentRowId;

      if (goldAsExtraPath_)
        pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
            beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
    }
  }
}

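/*
 * Sum each path's per-step scores, softmax-normalize over all expanded
 * paths, and return the negative log-likelihood of the gold path:
 *
 *   cost = -log( exp(s_gold) / sum_over_paths exp(s_path) )
 */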
real CostForOneSequence::globallyNormalizedScore() {
  expandedPathScores_.resize(validExpansionCount_);

  Matrix::resizeOrCreate(
      softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false);
  softmaxOut_->zeroMem();
  MatrixPtr tmp = Matrix::create(
      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);

  for (size_t i = 0; i < validExpansionCount_; ++i) {
    Matrix::resizeOrCreate(expandedPathScores_[i],
                           pathRowIdsInEachBeam_[i].size(),
                           1,
                           false,
                           false);
    expandedPathScores_[i]->zeroMem();

    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
                                        pathRowIdsInEachBeam_[i].size(),
                                        false);
    expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds);
    tmp->add(*expandedPathScores_[i]);
  }

  softmaxOut_->softmax(*softmaxOut_);
  return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]);
}

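/* Forward pass for one sequence: locate the gold prefix in each beam,
 * reconstruct all expanded paths, then compute the normalized cost. */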
real CostForOneSequence::forward() {
  calValidExpandStep();
  constructTotalExpansion();
  return globallyNormalizedScore();
}

void CostForOneSequence::backward() {
  /*
   * When the softmax layer is the output layer and is combined with
   * cross-entropy as the cost, the derivative with respect to the
   * softmax's input is simply:
   *
   *   grad_i = softmax_out_i - target_i,
   *
   * and a hard label is used here.
   */
  softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;

  MatrixPtr tmp = Matrix::create(
      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);

  for (size_t i = 0; i < validExpansionCount_; ++i) {
    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
                                        pathRowIdsInEachBeam_[i].size(),
                                        false);
    /*
     * beams_->scoreGrad[i] is initialized outside this class; this
     * class only keeps a pointer to the original input gradients,
     * so there is no need to allocate or initialize the memory here.
     */
    tmp->addToRows(*beams_->scoreGrad[i], *rowIds);
  }
}

REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);

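/*
 * The layer takes its inputs in groups of three: for the i-th beam
 * expansion, input 3i holds the candidate scores, input 3i + 1 holds
 * the ids selected into the beam, and input 3i + 2 holds the gold ids.
 */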
bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
  /* Initialize the basic parent class. */
  Layer::init(layerMap, parameterMap);
  CHECK_EQ(0U, inputLayers_.size() % 3)
      << "The number of this layer's inputs must be a multiple of 3.";

  beamExpanCount_ = inputLayers_.size() / 3;

  candidateScores_.resize(beamExpanCount_);
  candidateScoreGrad_.resize(beamExpanCount_);

  candidateInBeam_.resize(beamExpanCount_);
  goldSequence_.resize(beamExpanCount_);
  gradToInputs_.resize(beamExpanCount_);

  setNeedSequenceInfo(false);
  return true;
}

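/*
 * Validate the shape and the sequence structure of every input triple:
 * the first expansion must be a plain sequence, later expansions must
 * be nested sequences, and all inputs must agree on the batch size and
 * the beam size.
 */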
void CrossEntropyOverBeam::checkInputs() {
  batchSize_ = 0;
  for (size_t i = 0; i < beamExpanCount_; ++i) {
    const Argument& scores = getInput(i * 3);
    const Argument& selCandidates = getInput(i * 3 + 1);
    const Argument& goldSeq = getInput(i * 3 + 2);

    if (i) {
      CHECK(scores.hasSubseq()) << "input " << i << " "
                                << inputLayers_[i * 3]->getName()
                                << " should be a nested sequence";
      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
    } else {
      CHECK(scores.hasSeq()) << "input " << i << " "
                             << inputLayers_[i]->getName()
                             << " should be a sequence";
      batchSize_ = scores.getNumSequences();
      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
    }
    CHECK_EQ(1U, scores.value->getWidth());
    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
  }
}

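/*
 * All the beam bookkeeping below is done on the CPU. Inputs that live
 * on the GPU are copied into CPU buffers; CPU inputs are shared as-is.
 * beamSplitPos_[j][i] records, for the j-th sequence, the cumulative
 * number of sub-sequences up to and including it in the i-th expansion;
 * it is used later to split the batch into per-sequence beams.
 */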
void CrossEntropyOverBeam::copyInputsToCpu() {
  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
    if (dynamic_cast<GpuMatrix*>(src.get())) {
      Matrix::resizeOrCreate(
          trg, src->getHeight(), src->getWidth(), false, false);
      trg->copyFrom(*src);
    } else {
      /* src is a const reference, so std::move would copy anyway;
       * share the CPU matrix directly. */
      trg = src;
    }
  };

  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
    if (dynamic_cast<GpuIVector*>(src.get())) {
      IVector::resizeOrCreate(trg, src->getSize(), false);
      trg->copyFrom(*src);
    } else {
      trg = src;
    }
  };

  beamSplitPos_.clear();
  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
  for (size_t i = 0; i < beamExpanCount_; ++i) {
    copyValue(getInputValue(i * 3), candidateScores_[i]);
    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);

    if (i) {
      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
      const int* seqStarts = seqInfo->getMutableData(false);
      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
      const int* subSeqStarts = subSeqInfo->getMutableData(false);

      size_t seqId = 1;
      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
           ++subSeqId) {
        CHECK_LT(seqId, seqInfo->getSize());
        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
          seqId++;
        }
        beamSplitPos_[seqId - 1][i]++;
      }
    } else {
      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
    }
  }
}

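/*
 * Slice the flat batch-level inputs into per-sequence views: for every
 * sequence j and expansion i, build sub-matrix views of the scores, the
 * score gradients, and the candidate ids, plus the sequence start
 * offsets, so that each sequence's cost can be computed independently.
 */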
void CrossEntropyOverBeam::splitBatchBeams() {
  beamCosts_.resize(batchSize_);
  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));

  for (size_t i = 0; i < beamExpanCount_; ++i) {
    int* seqStarts =
        getInput(i * 3).sequenceStartPositions->getMutableData(false);

    int* subSeqStarts = nullptr;
    int maxLen = 0;
    if (i) {
      subSeqStarts =
          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
    } else {
      maxLen = getInput(i * 3).sequenceStartPositions->getSize() - 1;
    }

    for (size_t j = 0; j < batchSize_; ++j) {
      beamPerSeq_[j].scores[i] =
          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
                         seqStarts[j + 1] - seqStarts[j],
                         1,
                         false,
                         false);
      beamPerSeq_[j].scoreGrad[i] =
          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
                         seqStarts[j + 1] - seqStarts[j],
                         1,
                         false,
                         false);

      int offset = j ? beamSplitPos_[j - 1][i] : 0;
      int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0);
      CHECK_GE(maxLen, offset + height);
      beamPerSeq_[j].seqInfo[i] = IVector::create(
          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);

      beamPerSeq_[j].candidateIds[i] =
          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
                         height,
                         beamSize_,
                         false,
                         false);
      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];

      CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]);
    }
  }
}

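/*
 * The output is one cost value per sequence. Gradient buffers for the
 * candidate scores are allocated on the CPU when an input gradient
 * lives on the GPU; otherwise the input gradient is written in place.
 */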
void CrossEntropyOverBeam::resizeOutput() {
  Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
  output_.value->zeroMem();

  for (size_t i = 0; i < beamExpanCount_; ++i) {
    MatrixPtr inGrad = getInputGrad(i * 3);
    if (dynamic_cast<GpuMatrix*>(inGrad.get())) {
      Matrix::resizeOrCreate(candidateScoreGrad_[i],
                             inGrad->getHeight(),
                             inGrad->getWidth(),
                             false,
                             false);
    } else {
      candidateScoreGrad_[i] = std::move(inGrad);
    }
    candidateScoreGrad_[i]->zeroMem();
  }
}

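/* Copy the accumulated CPU gradients back into any GPU input-gradient
 * matrices. Only the first copyCount expansions took part in expanding
 * the current sequence, so the copy stops there. */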
void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
  for (size_t i = 0; i < beamExpanCount_; ++i) {
    if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get()))
      getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]);

    if (i == copyCount - 1) break;
  }
}

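/*
 * Forward pass: validate and gather the inputs on the CPU, split the
 * batch into per-sequence beams, then compute the cross-entropy over
 * beam for each sequence independently.
 */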
void CrossEntropyOverBeam::forward(PassType passType) {
  Layer::forward(passType);

  checkInputs();
  copyInputsToCpu();

  resizeOutput();
  splitBatchBeams();

  MatrixPtr outputValue = getOutputValue();
  for (size_t i = 0; i < batchSize_; ++i) {
    BeamExpansionPtr ptr = std::make_shared<BeamExpansion>(beamPerSeq_[i]);
    beamCosts_[i].setData(std::move(ptr), beamSize_);
    outputValue->getData()[i] = beamCosts_[i].forward();
  }
}

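/* Backward pass: accumulate each sequence's gradient into the shared
 * CPU buffers, then flush them back to the GPU inputs if needed. */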
void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
  for (size_t i = 0; i < batchSize_; ++i) {
    beamCosts_[i].backward();
    copyGradToGpu(beamCosts_[i].getValidExpansionCount());
  }
}

} // namespace paddle