|
|
|
@ -161,6 +161,7 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
|
|
|
|
|
const ParameterConfig& config,
|
|
|
|
|
size_t sparseId) const {
|
|
|
|
|
CHECK(sparseId == -1LU) << "Sparse update is not supported";
|
|
|
|
|
|
|
|
|
|
BaseMatrix& value = *vecs[PARAMETER_VALUE];
|
|
|
|
|
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
|
|
|
|
|
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
|
|
|
|
@ -265,6 +266,7 @@ void AdamParameterOptimizer::update(const VectorPtr vecs[],
|
|
|
|
|
const ParameterConfig& config,
|
|
|
|
|
size_t sparseId) const {
|
|
|
|
|
CHECK(sparseId == -1UL) << "Sparse update is not supported";
|
|
|
|
|
|
|
|
|
|
real beta1_power = std::pow(beta1_, step_);
|
|
|
|
|
real beta2_power = std::pow(beta2_, step_);
|
|
|
|
|
real learningRate = config.learning_rate() * learningRate_;
|
|
|
|
@ -303,18 +305,34 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
|
|
|
|
|
void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
|
|
|
|
|
const ParameterConfig& config,
|
|
|
|
|
size_t sparseId) const {
|
|
|
|
|
// globalGradientClipping(vecs, config, FLAGS_log_clipping);
|
|
|
|
|
real global_thres_ = optConfig_.gradient_clipping_threshold();
|
|
|
|
|
real local_thres_ = config.gradient_clipping_threshold();
|
|
|
|
|
|
|
|
|
|
real threshold;
|
|
|
|
|
std::string field;
|
|
|
|
|
if (global_thres_ > 0.0f && local_thres_ > 0.0f) {
|
|
|
|
|
threshold = global_thres_ < local_thres_ ? global_thres_ : local_thres_;
|
|
|
|
|
field = global_thres_ < local_thres_ ? "global" : "local";
|
|
|
|
|
} else if (global_thres_ > 0.0f) {
|
|
|
|
|
threshold = global_thres_;
|
|
|
|
|
field = "global";
|
|
|
|
|
} else {
|
|
|
|
|
threshold = local_thres_;
|
|
|
|
|
field = "local";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
|
|
|
|
|
if (maxAbsGrad > config.gradient_clipping_threshold()) {
|
|
|
|
|
if (maxAbsGrad > threshold) {
|
|
|
|
|
if (FLAGS_log_clipping) {
|
|
|
|
|
real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
|
|
|
|
|
vecs[PARAMETER_GRADIENT]->getSize();
|
|
|
|
|
LOG(INFO) << "parameter=" << config.name() << " need clipping,"
|
|
|
|
|
<< " max grad=" << maxAbsGrad << " avg grad=" << avgAbsGrad;
|
|
|
|
|
LOG(INFO) << "parameter=" << config.name() << " need clipping by "
|
|
|
|
|
<< field << " threshold=" << threshold
|
|
|
|
|
<< ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
|
|
|
|
|
}
|
|
|
|
|
vecs[PARAMETER_GRADIENT]->clip(-config.gradient_clipping_threshold(),
|
|
|
|
|
config.gradient_clipping_threshold());
|
|
|
|
|
vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
optimizer_->update(vecs, config, sparseId);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|