parent
d52ebb0296
commit
f3c61cbc4c
@ -0,0 +1,101 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "PServerUtil.h"

#include <memory>
|
||||
|
||||
namespace paddle {
|
||||
|
||||
/**
 * Build a ParameterServerConfig from the command-line flags.
 *
 * Only nics, port, ports_num and rdma_tcp are copied from their FLAGS_*
 * counterparts; every other field is left unset.
 *
 * @return a config allocated with new. The caller receives the raw pointer
 *         and is responsible for deleting it.
 */
ParameterServerConfig* PServerUtil::initConfig() {
  ParameterServerConfig* flagConfig = new ParameterServerConfig();

  // Listening port range.
  flagConfig->set_port(FLAGS_port);
  flagConfig->set_ports_num(FLAGS_ports_num);

  // Network devices and transport.
  flagConfig->set_nics(FLAGS_nics);
  flagConfig->set_rdma_tcp(FLAGS_rdma_tcp);

  return flagConfig;
}
|
||||
|
||||
/**
 * Create a PServerUtil configured from the command-line flags.
 *
 * The configuration is obtained from initConfig() and is only needed while
 * the PServerUtil is being constructed, so it is released before returning.
 *
 * @return a PServerUtil allocated with new; the caller owns it.
 */
PServerUtil* PServerUtil::create() {
  // initConfig() returns an owning raw pointer. Hold it in a unique_ptr so
  // the temporary config is freed once construction is done instead of
  // leaking on every call.
  std::unique_ptr<ParameterServerConfig> flagConfig(
      paddle::PServerUtil::initConfig());
  return PServerUtil::create(*flagConfig);
}
|
||||
|
||||
/**
 * Create a PServerUtil from an explicit configuration.
 *
 * @param config configuration forwarded to the PServerUtil constructor.
 * @return a PServerUtil allocated with new; the caller owns it.
 */
PServerUtil* PServerUtil::create(const ParameterServerConfig& config) {
  PServerUtil* instance = new PServerUtil(config);
  return instance;
}
|
||||
|
||||
/**
 * Construct and initialize the ParameterServer2 instances described by
 * config.
 *
 * The number of ports is ports_num + ports_num_for_sparse, and port number i
 * is config.port() + i.
 *  - If config.nics() is empty, one server is created per port with an empty
 *    address string.
 *  - Otherwise nics is split on ',' and one server is created per
 *    (port, device) pair, using getIpAddr(device) as the address. Servers
 *    are stored port-major: index = portIdx * devices.size() + deviceIdx.
 *
 * When config.rdma_tcp() is "rdma", each server additionally receives a
 * counter that cycles through [0, rdma::numCpus()).
 *
 * Every server's init() is verified with CHECK.
 *
 * @param config server configuration; it is only read during construction.
 */
PServerUtil::PServerUtil(const ParameterServerConfig& config) {
  // round robin to load balance RDMA server ENGINE
  int rdmaCpu = 0;
  const int onlineCpus = rdma::numCpus();
  const int numPorts = config.ports_num() + config.ports_num_for_sparse();
  // Read the settings from the config that was passed in rather than from the
  // global flags, so that create(config) honours the caller's values.
  const bool useRdma = config.rdma_tcp() == "rdma";

  if (config.nics().empty()) {
    pservers_.resize(numPorts);
    for (int i = 0; i < numPorts; ++i) {
      const int port = config.port() + i;
      if (useRdma) {
        pservers_[i].reset(
            new ParameterServer2(std::string(), port, rdmaCpu++));
        rdmaCpu = rdmaCpu % onlineCpus;
      } else {
        pservers_[i].reset(new ParameterServer2(std::string(), port));
      }
      CHECK(pservers_[i]->init())
          << "Fail to initialize parameter server on port " << port;
    }
  } else {
    std::vector<std::string> devices;
    str::split(config.nics(), ',', &devices);
    pservers_.resize(devices.size() * numPorts);
    for (int i = 0; i < numPorts; ++i) {
      const int port = config.port() + i;
      for (size_t j = 0; j < devices.size(); ++j) {
        const size_t slot = i * devices.size() + j;
        if (useRdma) {
          pservers_[slot].reset(
              new ParameterServer2(getIpAddr(devices[j]), port, rdmaCpu++));
          rdmaCpu = rdmaCpu % onlineCpus;
        } else {
          pservers_[slot].reset(
              new ParameterServer2(getIpAddr(devices[j]), port));
        }
        CHECK(pservers_[slot]->init())
            << "Fail to initialize parameter server on " << devices[j]
            << ", port " << port;
      }
    }
  }
}
|
||||
|
||||
/** Joins every parameter server (see join()) before the object is destroyed. */
PServerUtil::~PServerUtil() {
  join();
}
|
||||
|
||||
/**
 * Call start() on every parameter server, in the order they are stored.
 *
 * Logs the total number of servers first, then the index of each server
 * just before it is started.
 */
void PServerUtil::start() {
  const size_t total = pservers_.size();
  LOG(INFO) << "pserver sizes : " << total;
  for (size_t idx = 0; idx < total; ++idx) {
    LOG(INFO) << "pserver started : " << idx;
    pservers_[idx]->start();
  }
}
|
||||
|
||||
/**
 * Call join() on every parameter server, in the order they are stored.
 *
 * Logs the total number of servers first, then the index of each server
 * just before it is joined.
 */
void PServerUtil::join() {
  const size_t total = pservers_.size();
  LOG(INFO) << "pserver sizes : " << total;
  for (size_t idx = 0; idx < total; ++idx) {
    LOG(INFO) << "pserver join : " << idx;
    pservers_[idx]->join();
  }
}
|
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,39 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ParameterServer2.h"
|
||||
#include "ParameterServerConfig.pb.h"
|
||||
#include "RDMANetwork.h"
|
||||
#include "paddle/utils/StringUtil.h"
|
||||
|
||||
namespace paddle {
|
||||
|
||||
class PServerUtil {
|
||||
public:
|
||||
DISABLE_COPY(PServerUtil);
|
||||
static PServerUtil* create();
|
||||
static PServerUtil* create(const ParameterServerConfig& config);
|
||||
explicit PServerUtil(const ParameterServerConfig& config);
|
||||
~PServerUtil();
|
||||
static ParameterServerConfig* initConfig();
|
||||
void start();
|
||||
void join();
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<ParameterServer2>> pservers_;
|
||||
};
|
||||
|
||||
} // namespace paddle
|
@ -0,0 +1,43 @@
|
||||
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
syntax = "proto2";
|
||||
|
||||
package paddle;
|
||||
|
||||
// Configuration for a parameter client.
message ParameterClientConfig {
  // NOTE(review): presumably the id of the trainer that owns this client;
  // confirm against the code that fills it in.
  required int32 trainer_id = 1;
}
|
||||
|
||||
// Configuration for the parameter servers of one process.
message ParameterServerConfig {
  // Number of ports used for sending parameters. Ports are numbered
  // consecutively starting from `port`.
  required int32 ports_num = 1 [default = 1];

  // Number of additional ports (by the field name, for sparse parameters).
  // They are numbered consecutively starting from (port + ports_num).
  required int32 ports_num_for_sparse = 2 [default = 0];

  // Network device names for the pservers, separated by ','.
  required string nics = 3 [default = "xgbe0,xgbe1"];

  // Transport selector. The server code compares this value against "rdma".
  required string rdma_tcp = 4 [default = "tcp"];

  // Listening port for the pserver.
  required int32 port = 5 [default = 20134];

  // Number of gradient servers.
  required int32 num_gradient_servers = 6 [default = 1];

  // Number of threads for sync op exec.
  required int32 pserver_num_threads = 7 [default = 1];

  // Controls the minimum value of config_.async_lagged_grad_discard_ratio().
  required double async_lagged_ratio_min = 8 [default = 1.0];

  // Used as the default value when async_lagged_grad_discard_ratio is not
  // set in trainer_config.conf.
  required double async_lagged_ratio_default = 9 [default = 1.5];
}
|
Loading…
Reference in new issue