You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
337 lines
12 KiB
337 lines
12 KiB
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License. */
|
|
|
|
#include <sys/types.h>
|
|
#include <iomanip>
|
|
#include <algorithm>
|
|
#include <string.h>
|
|
#include "paddle/utils/Stat.h"
|
|
#include "paddle/utils/BarrierStat.h"
|
|
#include "paddle/utils/Flags.h"
|
|
|
|
P_DEFINE_bool(log_barrier_abstract, true,
|
|
"if true, show abstract of barrier performance");
|
|
P_DEFINE_int32(log_barrier_lowest_nodes, 5,
|
|
"how many lowest node will be logged");
|
|
P_DEFINE_bool(log_barrier_show_log, false, // for performance tuning insight
|
|
"if true, always show barrier abstract even with little gap");
|
|
|
|
namespace paddle {
|
|
|
|
std::ostream &operator<<(std::ostream &output, BarrierStatBase &stat) {
|
|
if (FLAGS_log_barrier_abstract) {
|
|
std::lock_guard<std::mutex> guard(
|
|
const_cast<BarrierStatBase &>(stat).lock_);
|
|
stat.showAbstract(output);
|
|
}
|
|
return output;
|
|
}
|
|
|
|
BarrierStatBase::BarrierStatBase(uint16_t numConnThreads,
|
|
const std::string &name)
|
|
: totSamples_(0), numConnThreads_(numConnThreads), name_(name) {
|
|
abstract_.resize(numConnThreads_);
|
|
if (FLAGS_log_barrier_show_log) {
|
|
rateThreshold_ = 0.0;
|
|
} else {
|
|
/* probablity of abnormal node
|
|
* p = 1/n + (n/8)/(n+1), n = nodes, n > 1
|
|
* if the freq of lowest trainerId larger than p,
|
|
* output FLAGS_log_barrier_lowest_nodes lastTrainerId.
|
|
* numConnThreads_ indicates nodes
|
|
*/
|
|
float n = (float)numConnThreads;
|
|
rateThreshold_ = 1.0 / n + (n / 8.0) / (n + 1.0);
|
|
}
|
|
}
|
|
|
|
BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name)
|
|
: BarrierStatBase(numConnThreads, name) {
|
|
timeVector_.reset(new TimeVectorEnd(numConnThreads_));
|
|
reset(true);
|
|
LOG(INFO) << " create barrierEndStat: " << name
|
|
<< " endBarrier warning rate: " << rateThreshold_;
|
|
}
|
|
|
|
/*
|
|
* Note:
|
|
* the design different pserver entity owns different statSet to obey
|
|
* the background that different pserver runs separately.
|
|
*/
|
|
void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) {
|
|
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
|
|
|
|
std::lock_guard<std::mutex> guard(lock_);
|
|
timeVector_->addTimeval(cur, trainerId);
|
|
|
|
if (timeVector_->full()) {
|
|
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
|
|
auto id = timeVector_->getLastTrainerId();
|
|
auto delta = timeToMicroSecond(timeVector_->getDelta());
|
|
auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta());
|
|
auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta());
|
|
auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta());
|
|
// discard first sample, since first sample probably is abnormal.
|
|
if (totSamples_) {
|
|
abstract_[id].freq++;
|
|
|
|
if (delta < abstract_[id].minDelta) {
|
|
abstract_[id].minDelta = delta;
|
|
}
|
|
if (delta > abstract_[id].maxDelta) {
|
|
abstract_[id].maxDelta = delta;
|
|
}
|
|
abstract_[id].totDelta += delta;
|
|
abstract_[id].totSecondDelta += secondDelta;
|
|
abstract_[id].totLastTwoDelta += lastTwoDelta;
|
|
abstract_[id].totMidDelta += midDelta;
|
|
|
|
// update totAbstract_
|
|
totAbstract_.freq++;
|
|
if (delta < totAbstract_.minDelta) {
|
|
totAbstract_.minDelta = delta;
|
|
}
|
|
if (delta > totAbstract_.maxDelta) {
|
|
totAbstract_.maxDelta = delta;
|
|
}
|
|
totAbstract_.totDelta += delta;
|
|
totAbstract_.totSecondDelta += secondDelta;
|
|
totAbstract_.totLastTwoDelta += lastTwoDelta;
|
|
totAbstract_.totMidDelta += midDelta;
|
|
}
|
|
|
|
totSamples_++;
|
|
timeVector_->reset();
|
|
}
|
|
}
|
|
|
|
void BarrierEndStat::reset(bool clearRawData) {
|
|
int32_t i = 0;
|
|
|
|
totSamples_ = 0;
|
|
|
|
std::lock_guard<std::mutex> guard(abstractLock_);
|
|
|
|
if (clearRawData) {
|
|
timeVector_->reset();
|
|
}
|
|
|
|
for (auto &abstract : abstract_) {
|
|
memset((void *)&abstract, 0, sizeof(abstract));
|
|
abstract.minDelta = UINT64_MAX;
|
|
abstract.trainerId = i++;
|
|
}
|
|
memset((void *)&totAbstract_, 0, sizeof(Abstract));
|
|
totAbstract_.minDelta = UINT64_MAX;
|
|
}
|
|
|
|
void BarrierEndStat::showAbstract(std::ostream &output) {
|
|
// do not support the case "<=2 pserver"
|
|
if (numConnThreads_ <= 2 || !totSamples_) {
|
|
return;
|
|
}
|
|
|
|
// duplicate freq info
|
|
std::vector<struct Abstract> outputAbstract = abstract_;
|
|
std::sort(outputAbstract.begin(), outputAbstract.end(),
|
|
[](const struct Abstract &a, const struct Abstract &b) {
|
|
return a.freq > b.freq;
|
|
});
|
|
|
|
auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
|
|
if (rate < rateThreshold_) {
|
|
return;
|
|
}
|
|
|
|
output << std::setw(20) << name_ << std::endl;
|
|
|
|
/*
|
|
* Note:
|
|
* avgGap: the average delta between 1 -- n arriving trainers
|
|
* avgSecondGap: the average delta between 2 -- n arriving trainers
|
|
* avgLastTwoGap: the average delta between n-1 -- n arriving trainers
|
|
* avgMidGap: the average delta between n/2 -- n arriving trainers
|
|
* rato: samples / totSamples
|
|
*
|
|
* the stat is based on per trainer if trainer_id is set, totAbstract is
|
|
* stat based on all trainers scope.
|
|
*/
|
|
output << std::setw(42) << " " << std::setw(15) << "trainerId"
|
|
<< std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap"
|
|
<< std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap"
|
|
<< std::setw(10) << "rate" << std::setw(10) << "samples"
|
|
<< std::setw(10) << "totSamples" << std::endl;
|
|
// show totAbstract, it's valuable when lastTrainerId is even-distributed'
|
|
if (!totAbstract_.freq) return;
|
|
output << std::setw(42) << " " << std::setw(15) << "totAbstract"
|
|
<< std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
|
|
<< std::setw(15)
|
|
<< (totAbstract_.totSecondDelta / totAbstract_.freq) * 0.001
|
|
<< std::setw(15)
|
|
<< (totAbstract_.totLastTwoDelta / totAbstract_.freq) * 0.001
|
|
<< std::setw(15)
|
|
<< (totAbstract_.totMidDelta / totAbstract_.freq) * 0.001
|
|
<< std::setw(10) << (float)totAbstract_.freq / (float)totSamples_
|
|
<< std::setw(10) << (float)totAbstract_.freq << std::setw(10)
|
|
<< (float)totSamples_ << std::endl;
|
|
|
|
// show lastTrainerId abstract
|
|
int count = 0;
|
|
for (auto &abstract : outputAbstract) {
|
|
if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
|
|
break;
|
|
}
|
|
// output format control
|
|
output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
|
|
<< std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
|
|
<< std::setw(15) << (abstract.totSecondDelta / abstract.freq) * 0.001
|
|
<< std::setw(15)
|
|
<< (abstract.totLastTwoDelta / abstract.freq) * 0.001
|
|
<< std::setw(15) << (abstract.totMidDelta / abstract.freq) * 0.001
|
|
<< std::setw(10) << (float)abstract.freq / (float)totSamples_
|
|
<< std::setw(10) << (float)abstract.freq << std::setw(10)
|
|
<< (float)totSamples_ << std::endl;
|
|
}
|
|
}
|
|
|
|
BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads,
|
|
const std::string &name)
|
|
: BarrierStatBase(numConnThreads, name) {
|
|
timeVector_.reset(new TimeVectorDelta(numConnThreads_));
|
|
reset(true);
|
|
LOG(INFO) << " create barrierDeltaStat: " << name
|
|
<< " barrierDelta warning rate: " << rateThreshold_;
|
|
}
|
|
|
|
void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) {
|
|
CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
|
|
|
|
std::lock_guard<std::mutex> guard(lock_);
|
|
timeVector_->addTimeval(delta, trainerId);
|
|
|
|
if (timeVector_->full()) {
|
|
std::lock_guard<std::mutex> abstractGuard(abstractLock_);
|
|
auto id = timeVector_->getMaxTrainerId();
|
|
auto delta = timeVector_->getDelta();
|
|
// discard first sample, since first sample probably is abnormal.
|
|
if (totSamples_) {
|
|
abstract_[id].freq++;
|
|
|
|
if (delta < abstract_[id].minDelta) {
|
|
abstract_[id].minDelta = delta;
|
|
}
|
|
if (delta > abstract_[id].maxDelta) {
|
|
abstract_[id].maxDelta = delta;
|
|
}
|
|
abstract_[id].totDelta += delta;
|
|
|
|
// update totAbstract_
|
|
totAbstract_.freq++;
|
|
if (delta < totAbstract_.minDelta) {
|
|
totAbstract_.minDelta = delta;
|
|
}
|
|
if (delta > totAbstract_.maxDelta) {
|
|
totAbstract_.maxDelta = delta;
|
|
}
|
|
totAbstract_.totDelta += delta;
|
|
}
|
|
|
|
totSamples_++;
|
|
timeVector_->reset();
|
|
}
|
|
}
|
|
|
|
void BarrierDeltaStat::reset(bool clearRawData) {
|
|
int32_t i = 0;
|
|
|
|
totSamples_ = 0;
|
|
|
|
std::lock_guard<std::mutex> guard(abstractLock_);
|
|
|
|
if (clearRawData) {
|
|
timeVector_->reset();
|
|
}
|
|
|
|
for (auto &abstract : abstract_) {
|
|
memset((void *)&abstract, 0, sizeof(abstract));
|
|
abstract.minDelta = UINT64_MAX;
|
|
abstract.trainerId = i++;
|
|
}
|
|
memset((void *)&totAbstract_, 0, sizeof(Abstract));
|
|
totAbstract_.minDelta = UINT64_MAX;
|
|
}
|
|
|
|
void BarrierDeltaStat::showAbstract(std::ostream &output) {
|
|
// do not support the case "<=2 pserver"
|
|
if (numConnThreads_ <= 2 || !totSamples_) {
|
|
return;
|
|
}
|
|
|
|
// duplicate freq info
|
|
std::vector<struct Abstract> outputAbstract = abstract_;
|
|
std::sort(outputAbstract.begin(), outputAbstract.end(),
|
|
[](const struct Abstract &a, const struct Abstract &b) {
|
|
return a.freq > b.freq;
|
|
});
|
|
|
|
auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
|
|
if (rate < rateThreshold_) {
|
|
return;
|
|
}
|
|
|
|
output << std::setw(20) << name_ << std::endl;
|
|
|
|
/* Note:
|
|
* Gap means the delta from all trainers' forwardbackward
|
|
* avgGap: average Gap in log_period batches
|
|
* minGap: min Gap in log_period batches
|
|
* maxGap: max Gap in log_period batches
|
|
* trainerId: the slowest trainer_id
|
|
*
|
|
* the stat is based on per trainer if trainer_id is set, totAbstract is
|
|
* stat based on all trainers scope.
|
|
*/
|
|
output << std::setw(42) << " " << std::setw(15) << "trainerId"
|
|
<< std::setw(15) << "avgGap" << std::setw(10) << "minGap"
|
|
<< std::setw(10) << "maxGap" << std::setw(10) << "rate"
|
|
<< std::setw(10) << "samples" << std::setw(10) << "totSamples"
|
|
<< std::endl;
|
|
// show totAbstract, it's valuable when lastTrainerId is even-distributed'
|
|
if (!totAbstract_.freq) return;
|
|
output << std::setw(42) << " " << std::setw(15) << "totAbstract"
|
|
<< std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
|
|
<< std::setw(10) << totAbstract_.minDelta * 0.001 << std::setw(10)
|
|
<< totAbstract_.maxDelta * 0.001 << std::setw(10)
|
|
<< (float)totAbstract_.freq / (float)totSamples_ << std::setw(10)
|
|
<< (float)totAbstract_.freq << std::setw(10) << (float)totSamples_
|
|
<< std::endl;
|
|
|
|
// show lastTrainerId abstract
|
|
int count = 0;
|
|
for (auto &abstract : outputAbstract) {
|
|
if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
|
|
break;
|
|
}
|
|
// output format control
|
|
output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
|
|
<< std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
|
|
<< std::setw(10) << abstract.minDelta * 0.001 << std::setw(10)
|
|
<< abstract.maxDelta * 0.001 << std::setw(10)
|
|
<< (float)abstract.freq / (float)totSamples_ << std::setw(10)
|
|
<< (float)abstract.freq << std::setw(10) << (float)totSamples_
|
|
<< std::endl;
|
|
}
|
|
}
|
|
} // namespace paddle
|