parent 5d7fabd32c
commit 2ac8e6a518
@@ -1,209 +0,0 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <fstream>
#include "paddle/utils/Logging.h"

#include "ParallelParameter.h"

namespace paddle {

UpdateFunction paramUpdateFunctions[UPDATE_TYPE_NUM] = {
    nullptr,  // &ParallelParameter::singleUpdate,  /* single thread */
    nullptr,  // &ParallelParameter::controlUpdate, /* controller thread */
    &ParallelParameter::majorUpdate,                /* major thread */
    &ParallelParameter::minorUpdate,                /* minor thread */

    nullptr,                         /* master */
    &ParallelParameter::slaveUpdate, /* slave */
};

ParallelParameterPtr ParallelParameter::create(TrainerRole role,
                                               ParameterPtr localParam,
                                               int asyncCount) {
  ParallelParameterPtr ptr = nullptr;
  switch (role) {
    case TRAINER_ROLE_CONTROL:
    case TRAINER_ROLE_MAJOR:
    case TRAINER_ROLE_MINOR:
      ptr = std::make_shared<SyncParameter>(role, localParam);
      break;
    case TRAINER_ROLE_MASTER:
    case TRAINER_ROLE_SLAVE:
      ptr = std::make_shared<AsyncParameter>(role, asyncCount, localParam);
      break;
    default:
      LOG(FATAL) << "unknown role " << role << "\n";
  }
  return ptr;
}

void ParallelParameter::syncUpdate(TrainerRole role, real learnRate) {
  if (paramUpdateFunctions[role]) {
    (this->*paramUpdateFunctions[role])(learnRate);
  }
}

void SyncParameter::attachControlParam(ParallelParameterPtr controler) {
  controlParam_ = controler;
}

void SyncParameter::attachMajorParam(ParallelParameterPtr partner) {
  majorPartners_.push_back(partner);
  if (role_ == TRAINER_ROLE_CONTROL) {
    localParam_->setSharedCount(majorPartners_.size());
  }
  // partnerParam_ = partner;
}

void SyncParameter::attachMinorParam(ParallelParameterPtr partner,
                                     int deviceId) {
  minorPartners_.push_back(partner);
  minorDeviceIds_.push_back(deviceId);
  // partnerParam_ = partner;
}

void SyncParameter::waitAllMajorGradReady() {
  for (size_t i = 0; i < majorPartners_.size(); i++) {
    majorPartners_[i]->waitGradReady();
    partnerParam_ = majorPartners_[i]->getLocalParameter();
    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
    VectorPtr patnrGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
    if (FLAGS_use_gpu) hl_set_device(minorDeviceIds_[i]);
    localGrad->add(*patnrGrad);
  }
}

void SyncParameter::synchronizeParamter() {
  valueSem_->wait();
  if (role_ == TRAINER_ROLE_MINOR) {
    /* copy the value from the controller */
    VectorPtr cntrlVec =
        (controlParam_->getLocalParameter())->getBuf(PARAMETER_VALUE);
    VectorPtr localVec = localParam_->getBuf(PARAMETER_VALUE);
    localVec->copyFrom(*cntrlVec);

    /* dispatch the value to the majors */
    for (size_t i = 0; i < majorPartners_.size(); i++) {
      VectorPtr majorVec =
          (majorPartners_[i]->getLocalParameter())->getBuf(PARAMETER_VALUE);
      majorVec->copyFrom(*localVec);
      majorPartners_[i]->postValueReady();
    }
  }
}

void SyncParameter::singleUpdate(real learnRate) {
  CHECK(role_ == TRAINER_ROLE_SINGLE);
  localParam_->updateWithGradient(learnRate);
}

void SyncParameter::controlUpdate(const UpdateCallback &callBack) {
  CHECK(role_ == TRAINER_ROLE_CONTROL);
  CHECK(gradSem_ != NULL && valueSem_ != NULL);
  CHECK(majorPartners_.size());

  /* update */
  if (callBack) {
    callBack(localParam_.get());
    localParam_->clearGradient();
  }

  for (size_t i = 0; i < minorPartners_.size(); i++) {
    minorPartners_[i]->postValueReady();
  }
}

void SyncParameter::majorUpdate(real learnRate) {
  (void)learnRate;
  CHECK(role_ == TRAINER_ROLE_MAJOR);
  CHECK(gradSem_ != NULL && valueSem_ != NULL);
  CHECK(minorPartners_.size() && controlParam_);

  /* wait until the minor gradients are ready */
  for (size_t i = 0; i < minorPartners_.size(); i++) {
    minorPartners_[i]->waitGradReady();
    partnerParam_ = minorPartners_[i]->getLocalParameter();
    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
    VectorPtr minorGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
    localGrad->add(*minorGrad);
  }

  /* notify the controller that the gradient is ready */
  gradSem_->post();
}

void SyncParameter::minorUpdate(real learnRate) {
  (void)learnRate;
  CHECK(role_ == TRAINER_ROLE_MINOR);
  CHECK(gradSem_ != NULL && valueSem_ != NULL);

  // notify the major that the gradient is ready
  gradSem_->post();
}

AsyncParameter::AsyncParameter(TrainerRole role,
                               int asyncCount,
                               ParameterPtr localParam)
    : ParallelParameter(role, localParam) {
  asyncCount_ = asyncCount;
  accumCounter_ = 0;
  gradientAccum_ = Vector::create(localParam->getSize(), localParam->useGpu());
  gradientAccum_->zeroMem();
}

void AsyncParameter::slaveUpdate(real learnRate) {
  /* increase the accumCounter_ */
  accumCounter_++;

  /* accumulate the gradient into the buffer */
  VectorPtr grad = localParam_->getBuf(PARAMETER_GRADIENT);
  gradientAccum_->add(*grad);

  /* check whether a synchronization with the master is needed */
  if (accumCounter_ == asyncCount_) {
    gradSem_->post();
    // accumCounter_ = 0; NOTICE: the caller needs to reset the counter
  } else {  // self update
    localParam_->updateWithGradient(learnRate);
  }
  localParam_->clearGradient();
}

bool AsyncParameter::masterUpdate(ParallelParameterPtr slaveParam,
                                  const UpdateCallback &callback) {
  CHECK(slaveParam && callback);

  /* wait until the slave is ready */
  if (!slaveParam->timeWaitGradReady(5)) {
    return false;
  }

  AsyncParameter *asyncParam = dynamic_cast<AsyncParameter *>(slaveParam.get());

  /* use the accumulated gradient to update the local parameter */
  VectorPtr slaveVec = asyncParam->getAccum();
  localParam_->getBuf(PARAMETER_GRADIENT)->copyFrom(*slaveVec);
  callback(localParam_.get());
  // slaveVec->zeroMem();

  /* copy the newest parameter value to the slave */
  slaveVec = (slaveParam->getLocalParameter())->getBuf(PARAMETER_VALUE);
  slaveVec->copyFrom(*(localParam_->getBuf(PARAMETER_VALUE)));

  /* release the semaphore */
  slaveParam->postValueReady();

  return true;
}

}  // namespace paddle
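The synchronous path above only works once the control, major and minor objects have been cross-attached, and the deleted file never shows that wiring. Below is a minimal sketch of how one control/major/minor group could be connected and driven; it is not part of the deleted sources. The helper name wireSyncGroup, the three pre-built ParameterPtr arguments, learnRate and callback are assumptions supplied for illustration only.

#include <memory>

#include "ParallelParameter.h"

namespace paddle {

// Hypothetical helper: wire one control thread with a single major/minor pair.
void wireSyncGroup(ParameterPtr ctrlParam,
                   ParameterPtr majorParam,
                   ParameterPtr minorParam,
                   int minorDeviceId) {
  ParallelParameterPtr control =
      ParallelParameter::create(TRAINER_ROLE_CONTROL, ctrlParam);
  ParallelParameterPtr major =
      ParallelParameter::create(TRAINER_ROLE_MAJOR, majorParam);
  ParallelParameterPtr minor =
      ParallelParameter::create(TRAINER_ROLE_MINOR, minorParam);

  auto controlSync = std::dynamic_pointer_cast<SyncParameter>(control);
  auto majorSync = std::dynamic_pointer_cast<SyncParameter>(major);
  auto minorSync = std::dynamic_pointer_cast<SyncParameter>(minor);

  // The control merges every major's gradient and broadcasts values to minors.
  controlSync->attachMajorParam(major);
  controlSync->attachMinorParam(minor, minorDeviceId);

  // Each major merges its minor's gradient and reports to the controller.
  majorSync->attachControlParam(control);
  majorSync->attachMinorParam(minor, minorDeviceId);

  // Each minor copies the fresh value from the controller back to its major.
  minorSync->attachControlParam(control);
  minorSync->attachMajorParam(major);

  // Per mini-batch, the trainer threads would then call, each on its own thread:
  //   minor->syncUpdate(TRAINER_ROLE_MINOR, learnRate);  // posts the minor's gradSem_
  //   major->syncUpdate(TRAINER_ROLE_MAJOR, learnRate);  // merges minor grad, posts gradSem_
  //   controlSync->waitAllMajorGradReady();              // control thread merges majors
  //   controlSync->controlUpdate(callback);              // apply update, post values to minors
  //   minor->synchronizeParamter();                      // copy value, wake the majors
}

}  // namespace paddle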
@@ -1,244 +0,0 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <stdint.h>

#include <sys/time.h>
#include <unistd.h>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "hl_gpu.h"
#include "paddle/math/Vector.h"
#include "paddle/parameter/Parameter.h"
#include "paddle/parameter/ParameterUpdateFunctions.h"
#include "paddle/utils/Common.h"
#include "paddle/utils/Flags.h"
#include "paddle/utils/Locks.h"

#include "ParameterConfig.pb.h"

namespace paddle {

class ParallelParameter;
class SyncParameter;
class AsyncParameter;

typedef std::shared_ptr<ParallelParameter> ParallelParameterPtr;

const int UPDATE_TYPE_NUM = 32;

/**
 * TrainerRole denotes the role of the current trainer; different roles have
 * different jobs.
 *
 * control, major and minor are the three roles that support multi-GPU
 * parallel SGD training. The GPU cards are organized into groups of two;
 * each group consists of one major and one minor.
 *
 * @param single  single GPU card, single thread training.
 *
 * @param control the parameter is updated via the control role, which does
 *                not participate in the actual training. The control role is
 *                responsible for merging all majors' gradients and updating
 *                the parameter value.
 *
 * @param major   the major role participates in the actual training. When its
 *                local gradient is ready, it merges its corresponding minor's
 *                gradient and notifies the controller that this group's
 *                gradient is ready.
 *
 * @param minor   the minor role participates in the actual training. When its
 *                local gradient is ready, it only notifies its corresponding
 *                major. To balance the workload, after the controller has
 *                updated the parameter value, each group's minor is
 *                responsible for dispatching the latest value to itself and
 *                to its major.
 */
enum TrainerRole {
  TRAINER_ROLE_SINGLE,
  TRAINER_ROLE_CONTROL,
  TRAINER_ROLE_MAJOR,
  TRAINER_ROLE_MINOR,
  TRAINER_ROLE_MASTER,
  TRAINER_ROLE_SLAVE
};

typedef void (ParallelParameter::*UpdateFunction)(real learnRate);

class ParallelParameter {
public:
  static ParallelParameterPtr create(TrainerRole role,
                                     ParameterPtr localParam,
                                     int asyncCount = 1);

  ParallelParameter(TrainerRole role, ParameterPtr localParam) {
    role_ = role;
    gradSem_.reset(new Semaphore(0));
    valueSem_.reset(new Semaphore(0));
    localParam_ = localParam;
  }

  virtual ~ParallelParameter() {}

  ParameterPtr getLocalParameter() { return localParam_; }

  bool timeWaitGradReady(int sec) {
    struct timespec ts;
    ts.tv_nsec = 0;
    ts.tv_sec = time(NULL) + sec;
    return gradSem_->timeWait(&ts);
  }

  void waitGradReady() { gradSem_->wait(); }

  void postValueReady() { valueSem_->post(); }

  void syncUpdate(TrainerRole role, real learnRate);

  virtual void synchronizeParamter() = 0;

  /**
   * for synchronous training
   */
  virtual void singleUpdate(real learnRate) { (void)learnRate; }

  virtual void controlUpdate(const UpdateCallback& callback) { (void)callback; }

  virtual void majorUpdate(real learnRate) { (void)learnRate; }

  virtual void minorUpdate(real learnRate) { (void)learnRate; }

  /**
   * for asynchronous training
   */
  virtual void slaveUpdate(real learnRate) { (void)learnRate; }

protected:
  TrainerRole role_;
  ParameterPtr localParam_;
  std::unique_ptr<Semaphore>
      gradSem_; /// whether the local parameter gradient is ready
  std::unique_ptr<Semaphore>
      valueSem_; /// whether the local parameter value is updated
};

/**
 * This class is designed for multi-threaded training.
 *
 * "Synchronous" means that multiple GPUs each compute a slice of the
 * mini-batch but produce only one merged gradient.
 */
class SyncParameter : public ParallelParameter {
public:
  SyncParameter(TrainerRole role, ParameterPtr localParam)
      : ParallelParameter(role, localParam) {
    controlParam_ = nullptr;
    majorPartners_.clear();
    minorPartners_.clear();
  }

  ~SyncParameter() {
    majorPartners_.clear();
    minorPartners_.clear();
  }

  void attachControlParam(ParallelParameterPtr controler);

  void attachMajorParam(ParallelParameterPtr partner);

  void attachMinorParam(ParallelParameterPtr partner, int deviceId);

  void waitAllMajorGradReady();

  void synchronizeParamter();

  void singleUpdate(real learnRate);

  void controlUpdate(const UpdateCallback& callback);

  void majorUpdate(real learnRate);

  void minorUpdate(real learnRate);

  std::vector<ParallelParameterPtr>& getMajorPartners() {
    return majorPartners_;
  }

  std::vector<ParallelParameterPtr>& getMinorPartners() {
    return minorPartners_;
  }

private:
  // The following members are used in the multi-threaded training situation:
  // partnerParam_ is the local parameter's partner,
  // controlParam_ is the controller thread's parameter.
  ParameterPtr partnerParam_;
  std::vector<ParallelParameterPtr> majorPartners_;
  std::vector<ParallelParameterPtr> minorPartners_;
  std::vector<int> minorDeviceIds_;
  ParallelParameterPtr controlParam_;
};

class AsyncParameter : public ParallelParameter {
public:
  AsyncParameter(TrainerRole role, int asyncCount, ParameterPtr localParam);

  void clearCounter() { accumCounter_ = 0; }

  VectorPtr getAccum() { return gradientAccum_; }

  void synchronizeParamter() {
    if (accumCounter_ == asyncCount_) {
      valueSem_->wait();
      clearCounter();
      gradientAccum_->zeroMem();
    }
  }

  /**
   * In asynchronous training the update strategy has a slave side and a
   * master side.
   *
   * slave: within asyncCount batches it uses the self-update method;
   *        beyond asyncCount it waits for the master to update.
   */
  void slaveUpdate(real learnRate);

  /**
   * In asynchronous training the update strategy has a slave side and a
   * master side.
   *
   * master: it only polls the slaves and does not train on data.
   *         If a slave's gradient is ready, it fetches it, updates the
   *         master's parameter, and then copies the value back into the
   *         corresponding slave.
   */
  bool masterUpdate(ParallelParameterPtr slaveParam,
                    const UpdateCallback& callback);

private:
  /**
   * In asynchronous training, every async trainer needs to accumulate the
   * gradients of a number of batches.
   *
   * gradientAccum_ stores the sum of those gradients.
   */
  VectorPtr gradientAccum_;

  /// Asynchronous count.
  int asyncCount_;

  /// Accumulated counter of the current gradients.
  int accumCounter_;
};

typedef std::map<std::string, ParallelParameterPtr> ParallelParameterMap;

}  // namespace paddle
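The slave/master comments above describe the protocol but leave the calling convention to the surrounding trainer. The sketch below shows how the two thread loops could look, assuming the trainer supplies the forward/backward pass, a learning rate, and an UpdateCallback that applies the optimizer; the names slaveLoop and masterLoop are hypothetical and not part of the deleted header.

#include "ParallelParameter.h"

namespace paddle {

// Slave thread (hypothetical sketch): train locally, self-update for
// asyncCount - 1 batches, then hand the accumulated gradient to the master
// and block until it pushes fresh parameter values back.
void slaveLoop(ParallelParameterPtr slave, real learnRate) {
  for (;;) {
    // ... forward/backward pass fills PARAMETER_GRADIENT of the local param ...
    slave->slaveUpdate(learnRate);  // accumulate; posts gradSem_ every asyncCount
    slave->synchronizeParamter();   // blocks only when a sync is pending, then
                                    // resets the counter and the accumulator
  }
}

// Master thread (hypothetical sketch): poll the slave; when its accumulated
// gradient arrives within five seconds, apply the callback and push the new
// parameter values back to the slave.
void masterLoop(ParallelParameterPtr master,
                ParallelParameterPtr slave,
                const UpdateCallback& callback) {
  AsyncParameter* masterAsync = dynamic_cast<AsyncParameter*>(master.get());
  for (;;) {
    if (!masterAsync->masterUpdate(slave, callback)) {
      continue;  // timed out: the slave is still inside its self-update window
    }
  }
}

}  // namespace paddle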