/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <random>

#include "Layer.h"
#include "MultinomialSampler.h"
#include "paddle/math/MathFunctions.h"

namespace paddle {

/**
 * Noise-contrastive estimation (NCE).
 * Implements the method from the paper "A fast and simple algorithm for
 * training neural probabilistic language models" (Mnih and Teh, ICML 2012).
 *
 * Expected inputs: one or more data layers, followed by the label layer,
 * and optionally a per-instance weight layer at the end (see init()).
 *
 * The config file api is nce_layer.
 */
class NCELayer : public Layer {
  int numClasses_;
  /// number of input layers besides labelLayer and weightLayer
  int numInputs_;
  LayerPtr labelLayer_;
  /// weight layer, can be None
  LayerPtr weightLayer_;
  WeightList weights_;
  std::unique_ptr<Weight> biases_;
  std::unique_ptr<MultinomialSampler> sampler_;

  /// fallback uniform sampler over [0, numClasses_ - 1], used when no
  /// noise distribution is configured
  std::uniform_int_distribution<int> rand_;

  struct Sample {
    int sampleId;  ///< index of the instance within the batch
    int labelId;   ///< class id of this sample
    bool target;   ///< true for a real label, false for a noise sample
    real weight;   ///< per-instance weight
  };
  std::vector<Sample> samples_;
  /// whether samples_ is prepared
  bool prepared_;
  Argument sampleOut_;

  IVectorPtr labelIds_;

public:
  explicit NCELayer(const LayerConfig& config)
      : Layer(config),
        numClasses_(config.num_classes()),
        rand_(0, config.num_classes() - 1),
        prepared_(false) {}

  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
    /* Initialize the basic parent class */
    Layer::init(layerMap, parameterMap);

    /* initialize the weightList */
    size_t i;
    for (i = 0; i < inputLayers_.size(); i++) {
      if (!parameters_[i]) break;
      size_t width = inputLayers_[i]->getSize();
      // create a new weight
      CHECK_EQ(parameters_[i]->getSize(), width * numClasses_);
      Weight* w = new Weight(numClasses_, width, parameters_[i]);

      // append the new weight to the list
      weights_.emplace_back(w);
    }

    CHECK_EQ(1U, getSize());

    numInputs_ = i;
    CHECK_GE(numInputs_, 1)
        << "Must have at least one input besides label and weight";
    CHECK_LT(i, inputLayers_.size()) << "Missing label layer";
    labelLayer_ = inputLayers_[i];
    if (++i < inputLayers_.size()) {
      weightLayer_ = inputLayers_[i];
      ++i;
    }
    CHECK_EQ(i, inputLayers_.size());

    /* initialize biases_ */
    if (biasParameter_.get() != NULL) {
      CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_);
      biases_.reset(new Weight(1, numClasses_, biasParameter_));
    }

    if (config_.neg_sampling_dist_size()) {
      CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
      sampler_.reset(new MultinomialSampler(config_.neg_sampling_dist().data(),
                                            numClasses_));
    }

    return true;
  }
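
  /*
   * Builds samples_ for the current batch: each instance contributes its
   * true label(s) as positive samples, plus config_.num_neg_samples() noise
   * samples drawn from the configured noise distribution if one is set,
   * and uniformly over all classes otherwise.
   */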
  void prepareSamples() {
    CHECK(!useGpu_) << "GPU is not supported";

    int batchSize = getInput(*labelLayer_).getBatchSize();
    IVectorPtr label = getInput(*labelLayer_).ids;

    CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>(
        getInput(*labelLayer_).value);

    CHECK(label || multiLabel)
        << "The label layer must have ids or NonValueSparseMatrix value";

    auto& randEngine = ThreadLocalRandomEngine::get();

    samples_.clear();
    samples_.reserve(batchSize * (1 + config_.num_neg_samples()));

    real* weight =
        weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr;

    for (int i = 0; i < batchSize; ++i) {
      real w = weight ? weight[i] : 1;
      if (label) {
        int* ids = label->getData();
        samples_.push_back({i, ids[i], true, w});
      } else {
        const int* cols = multiLabel->getRowCols(i);
        int n = multiLabel->getColNum(i);
        for (int j = 0; j < n; ++j) {
          samples_.push_back({i, cols[j], true, w});
        }
      }
      for (int j = 0; j < config_.num_neg_samples(); ++j) {
        int id = sampler_ ? sampler_->gen(randEngine) : rand_(randEngine);
        samples_.push_back({i, id, false, w});
      }
    }
    prepared_ = true;
  }
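
  /*
   * For sparse weight parameters, marks the weight-matrix rows this batch
   * will touch, so that only those rows need to be fetched and updated.
   */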
  void prefetch() {
    prepareSamples();
    IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
    int* ids = labelIds_->getData();
    for (size_t i = 0; i < samples_.size(); ++i) {
      ids[i] = samples_[i].labelId;
    }

    for (int i = 0; i < numInputs_; ++i) {
      auto sparseParam =
          dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
      if (sparseParam) {
        sparseParam->addRows(labelIds_);
      }
    }
  }
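
  /*
   * Forward computes scores only for the sampled (instance, class) pairs;
   * the full numClasses_-way output is never materialized. The layer's
   * output is the weighted NCE cost per instance, hence getSize() == 1.
   */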
  void forward(PassType passType) {
    Layer::forward(passType);

    CHECK(!useGpu_) << "GPU is not supported";

    if (!prepared_) {
      if (passType == PASS_GC) {
        ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed());
      }
      prepareSamples();
    }
    prepared_ = false;

    /* malloc memory for the output_ if necessary */
    int batchSize = getInputValue(0)->getHeight();
    int size = getSize();
    resetOutput(batchSize, size);

    Matrix::resizeOrCreate(sampleOut_.value, 1, samples_.size(),
                           /* trans= */ false, useGpu_);

    forwardBias();

    for (int l = 0; l < numInputs_; ++l) {
      forwardOneInput(l);
    }

    activation_->forward(sampleOut_);

    forwardCost();
  }
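
  /*
   * Backward runs the chain in reverse: cost gradient w.r.t. the activated
   * scores, then through the activation, then into the bias, weight, and
   * input gradients.
   */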
  void backward(const UpdateCallback& callback) {
    Matrix::resizeOrCreate(sampleOut_.grad, 1, samples_.size(),
                           /* trans= */ false, useGpu_);

    backwardCost();

    activation_->backward(sampleOut_);

    // guard against a null biases_ before dereferencing it
    if (biases_ && biases_->getWGrad()) {
      backwardBias(callback);
    }

    for (int l = 0; l < numInputs_; ++l) {
      backwardOneInput(l, callback);
    }
  }
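
  /*
   * Initializes each sample's raw score with its class bias (zero when the
   * layer has no bias parameter); forwardOneInput() then accumulates the
   * input-weight dot products on top of it.
   */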
  void forwardBias() {
    if (!biases_) {
      sampleOut_.value->zeroMem();
    } else {
      real* bias = biases_->getW()->getData();
      real* sampleOut = sampleOut_.value->getData();
      for (size_t i = 0; i < samples_.size(); ++i) {
        sampleOut[i] = bias[samples_[i].labelId];
      }
    }
  }

  void backwardBias(const UpdateCallback& callback) {
    if (!biases_) return;
    real* bias = biases_->getWGrad()->getData();
    real* sampleOut = sampleOut_.grad->getData();
    for (size_t i = 0; i < samples_.size(); ++i) {
      bias[samples_[i].labelId] += sampleOut[i];
    }
    biases_->incUpdate(callback);
  }
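
  /*
   * Accumulates, for every sampled pair, the dot product between the
   * instance's input row and the class's weight row. This is equivalent to
   * evaluating a single logit of a fully-connected output layer, but only
   * for the sampled classes.
   */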
  void forwardOneInput(int layerId) {
    const MatrixPtr& inputMat = getInputValue(layerId);
    const MatrixPtr& weightMat = weights_[layerId]->getW();

    int dim = inputMat->getWidth();
    real* sampleOut = sampleOut_.value->getData();

    for (size_t i = 0; i < samples_.size(); ++i) {
      sampleOut[i] += dotProduct(dim, inputMat->getRowBuf(samples_[i].sampleId),
                                 weightMat->getRowBuf(samples_[i].labelId));
    }
  }

  void backwardOneInput(int layerId, const UpdateCallback& callback) {
    const MatrixPtr& inputMat = getInputValue(layerId);
    const MatrixPtr& inputGradMat = getInputGrad(layerId);
    const MatrixPtr& weightMat = weights_[layerId]->getW();
    const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad();

    int dim = inputMat->getWidth();
    real* sampleGrad = sampleOut_.grad->getData();

    if (weightGradMat) {
      for (size_t i = 0; i < samples_.size(); ++i) {
        axpy(dim, sampleGrad[i], inputMat->getRowBuf(samples_[i].sampleId),
             weightGradMat->getRowBuf(samples_[i].labelId));
      }
      weights_[layerId]->incUpdate(callback);
    }

    if (inputGradMat) {
      for (size_t i = 0; i < samples_.size(); ++i) {
        axpy(dim, sampleGrad[i], weightMat->getRowBuf(samples_[i].labelId),
             inputGradMat->getRowBuf(samples_[i].sampleId));
      }
    }
  }
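
  /*
   * NCE cost per sample, following Mnih and Teh (2012): with
   * k = num_neg_samples and noise probability q(y), let b = k * q(y)
   * (which reduces to k / numClasses_ for the uniform fallback) and let o
   * be the activated model score. The probability that a sample came from
   * the data rather than the noise is modeled as o / (o + b), so the cost is
   *   -log(o / (o + b)) for a true sample, and
   *   -log(b / (o + b)) for a noise sample,
   * each scaled by the per-instance weight.
   */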
  void forwardCost() {
    real* out = output_.value->getData();
    real* sampleOut = sampleOut_.value->getData();
    real b = 1. / numClasses_ * config_.num_neg_samples();
    for (size_t i = 0; i < samples_.size(); ++i) {
      real o = sampleOut[i];
      if (sampler_) {
        b = config_.num_neg_samples() *
            config_.neg_sampling_dist(samples_[i].labelId);
      }
      real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b));
      out[samples_[i].sampleId] += samples_[i].weight * cost;
    }
  }
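
  /*
   * Gradient of the cost w.r.t. the activated score o:
   *   d/do [-log(o / (o + b))] = -b / (o * (o + b)) for a true sample,
   *   d/do [-log(b / (o + b))] =  1 / (o + b)       for a noise sample,
   * scaled by the per-instance weight w.
   */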
  void backwardCost() {
    real* sampleOut = sampleOut_.value->getData();
    real* sampleGrad = sampleOut_.grad->getData();

    real b = 1. / numClasses_ * config_.num_neg_samples();
    for (size_t i = 0; i < samples_.size(); ++i) {
      real o = sampleOut[i];
      if (sampler_) {
        b = config_.num_neg_samples() *
            config_.neg_sampling_dist(samples_[i].labelId);
      }
      real w = samples_[i].weight;
      sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b);
    }
  }
};

REGISTER_LAYER(nce, NCELayer);
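
/*
 * Typical use is from a trainer config via nce_layer; a minimal sketch
 * (parameter names are illustrative, check the nce_layer docs for the
 * exact signature):
 *
 *   cost = nce_layer(input=embedding,
 *                    label=next_word,
 *                    num_classes=vocab_size,
 *                    num_neg_samples=10)
 */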

}  // namespace paddle