/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/utils/Thread.h"
#include "paddle/utils/Util.h"

#include "paddle/parameter/AverageOptimizer.h"
#include "paddle/parameter/FirstOrderOptimizer.h"
#include "paddle/parameter/OptimizerFunctions.h"
#include "paddle/parameter/OptimizerWithRegularizer.h"
#include "paddle/parameter/Parameter.h"
#include "paddle/parameter/ParameterUpdaterBase.h"

#include "paddle/gserver/layers/Layer.h"
#include "TrainerConfig.pb.h"

#include <memory>
#include <vector>

namespace paddle {

/**
 * @brief Parameter updater for SGD, for local (non-cluster) runs.
 */
class SgdLocalUpdater : public ParameterUpdater {
public:
  /**
   * @brief Ctor. Initialize the optimizer locally from optConfig.
   * @param optConfig optimization config.
   * @param withAverager whether to wrap the optimizer with an average
   *        optimizer, default is true.
   */
  explicit SgdLocalUpdater(const OptimizationConfig& optConfig,
                           bool withAverager = true)
      : numSamplesProcessed_(0) {
    auto baseOptimizer = ParameterOptimizer::create(optConfig);
    optimizer_.reset(withAverager
                         ? AverageOptimizer::create(optConfig, baseOptimizer)
                         : baseOptimizer);
    CHECK(optimizer_) << "fail to create optimizer: "
                      << optConfig.learning_method();
    auto types = optimizer_->getParameterTypes();
    for (auto type : types) {
      addParameterType(type);
    }
  }

  /**
   * @brief Initialize parameters and optimizer_.
   *        For example, if the optimizer needs a Hessian vector, the
   *        parameters' Hessian buffers will be initialized.
   * @param parameters The parameters to be initialized.
   */
  virtual void init(std::vector<ParameterPtr>& parameters) {
    ParameterUpdater::init(parameters);
    optimizer_->init(parameters_.size(), nullptr);
    // Check that no parameter config uses L1 decay.
    CHECK(std::find_if(parameters.begin(), parameters.end(),
                       [](const ParameterPtr& para) {
                         return para->getConfig().decay_rate_l1() > 0.0f;
                       }) == parameters.end())
        << "SgdLocalUpdater does not support L1 decay in parameters";
  }

  /**
   * @brief Start a batch with the current mini-batch size.
   * @param batchSize current mini-batch size.
   * @return Always PASS_TRAIN.
   */
  virtual PassType startBatch(int64_t batchSize) {
    numSamplesProcessed_ += batchSize;
    optimizer_->startBatch(numSamplesProcessed_);
    return PASS_TRAIN;
  }

  /**
   * @brief Finish a mini-batch.
   */
  virtual void finishBatch(real cost) { optimizer_->finishBatch(); }

  /**
   * @brief Start a pass.
   */
  virtual void startPass() { optimizer_->startPass(); }

  /**
   * @brief Finish a pass.
   * @param cost sum of the cost over one pass.
   * @return true if the pass is accepted (used for OWL-QN).
   */
  virtual bool finishPass(real cost) {
    optimizer_->finishPass();
    return ParameterUpdater::finishPass(cost);
  }

  /**
   * @brief Apply the model average.
   */
  virtual void apply() {
    if (auto callback = optimizer_->apply()) {
      for (auto para : parameters_) {
        SetDevice device(para->getDeviceId());
        callback(para->getBufs(), para->getConfig(), -1UL);
      }
    }
  }

  /**
   * @brief Restore the parameter values from before the model average.
   */
  virtual void restore() {
    if (auto callback = optimizer_->restore()) {
      for (auto para : parameters_) {
        SetDevice device(para->getDeviceId());
        callback(para->getBufs(), para->getConfig(), -1UL);
      }
    }
  }

protected:
  /**
   * @brief Update method. Update the parameter value from its gradient.
   * @param para parameter that will be updated.
   */
  virtual void updateImpl(Parameter* para) {
    optimizer_->update(para->getBufs(), para->getConfig());
    if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) {
      callback(para->getBufs(), para->getConfig(), -1UL);
    }

    para->setValueUpdated();
    // Clear the gradient buffer so the next mini-batch accumulates from zero.
    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
  }

  std::unique_ptr<ParameterOptimizer> optimizer_;

  /**
   * @brief Total number of samples processed.
   */
  int64_t numSamplesProcessed_;
};

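// Illustrative usage sketch (comment only, not part of this header's API):
// how a trainer loop might drive SgdLocalUpdater. `optConfig`, `parameters`,
// `batches`, `trainOneBatch`, and `passCost` are hypothetical placeholders;
// the call order follows the methods documented above.
//
//   SgdLocalUpdater updater(optConfig);
//   updater.init(parameters);               // std::vector<ParameterPtr>
//   updater.startPass();
//   for (auto& batch : batches) {
//     updater.startBatch(batch.getSize());
//     real cost = trainOneBatch(batch);     // fills PARAMETER_GRADIENT
//     for (auto& para : parameters) {
//       updater.update(para.get());         // applies and zeroes the gradient
//     }
//     updater.finishBatch(cost);
//   }
//   updater.finishPass(passCost);
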
/**
 * @brief SgdCpuUpdater is used only in recursive neural networks.
 * @deprecated
 */
class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
public:
  explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
      : SgdLocalUpdater(optConfig),
        Deprecated("SgdCpuUpdater is used only in recursive neural networks, "
                   "which are deprecated in Paddle. Use it at your own risk.") {}

  /**
   * @brief Update all parameters when the batch finishes.
   * @param cost the mini-batch cost.
   */
  virtual void finishBatch(real cost) {
    for (auto para : parameters_) {
      SgdLocalUpdater::update(para.get());
    }
    optimizer_->finishBatch();
  }

protected:
  /**
   * @brief Do nothing.
   * @param para
   */
  virtual void updateImpl(Parameter* para) {}

  virtual void update(Parameter* para) {}
};

/**
 * @brief SGD local updater with model averaging on the CPU.
 *
 * It performs model averaging on the CPU to reduce GPU memory consumption.
 */
class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
public:
  /**
   * @brief Ctor.
   *
   * SgdUpdaterWithCpuAverager does everything a SgdLocalUpdater does, then
   * copies the parameters from GPU to CPU and performs model averaging on
   * the CPU.
   */
  explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig);
  ~SgdUpdaterWithCpuAverager();

  /**
   * @brief Initialize the CPU parameters and the model-average optimizer.
   * @param parameters
   */
  virtual void init(std::vector<ParameterPtr>& parameters);

  virtual PassType startBatch(int64_t batchSize) {
    averager_->startBatch(-1UL);
    return SgdLocalUpdater::startBatch(batchSize);
  }
  virtual void finishBatch(real cost);

  virtual void startPass() {
    averager_->startPass();
    SgdLocalUpdater::startPass();
  }
  virtual bool finishPass(real cost) {
    averager_->finishPass();
    return SgdLocalUpdater::finishPass(cost);
  }

  /// apply the averaged parameter to PARAMETER_VALUE,
  /// using PARAMETER_GRADIENT to back up PARAMETER_VALUE
  virtual void apply();

  /**
   * @brief Restore the parameter values saved before apply().
   */
  virtual void restore();

protected:
  virtual void updateImpl(Parameter* para);

  void updateFunc(Parameter* para);

protected:
  std::unique_ptr<ParameterOptimizer> averager_;

  /**
   * @brief The worker thread that performs model averaging.
   *
   * For each parameter, the GPU -> CPU copy is asynchronous, and the model
   * average is computed in another thread. The training process does not need
   * the averaged model while training; it is only used in the evaluation and
   * saving stages, so model averaging can be done entirely asynchronously.
   */
  ThreadWorker updateWorker_;

  /**
   * @brief The parameter mirrors in CPU memory.
   */
  std::vector<ParameterPtr> cpuParameters_;

  /**
   * @brief GPU -> CPU copy events. Model averaging waits until the copy is
   *        done.
   */
  std::vector<hl_event_t> copyEvents_;
};

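// Illustrative sketch (comment only): the evaluation/saving stage that the
// apply()/restore() pair above supports. `evaluateOrSaveModel` is a
// hypothetical placeholder; the buffer roles follow the comments on apply().
//
//   updater.apply();          // copy the CPU-averaged values into
//                             // PARAMETER_VALUE, backing the current values
//                             // up in PARAMETER_GRADIENT
//   evaluateOrSaveModel();    // sees the averaged parameters
//   updater.restore();        // bring the original values back for training
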
} // namespace paddle