add GeneralRoleMaker (#22295)

* add GeneralRoleMaker which is for general usage
* test=develop
revert-22710-feature/integrated_ps_api
xujiaqi01 5 years ago committed by GitHub
parent 269db0d1d1
commit 371f377bea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -219,6 +219,13 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/
cd binutils-2.27 && \
./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
RUN wget --no-check-certificate https://pslib.bj.bcebos.com/openmpi-1.4.5.tar.gz && tar -xzf openmpi-1.4.5.tar.gz && \
cd openmpi-1.4.5 && ./configure --prefix=/usr/local && make all -j8 && make install -j8 && \
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH && export PATH=/usr/local/bin:$PATH && cd .. && \
rm -rf openmpi-1.4.5.tar.gz && pip --no-cache-dir install mpi4py && ln -fs /bin/bash /bin/sh && \
apt-get install libprotobuf-dev -y
RUN pip --no-cache-dir install -U netifaces==0.10.9
# Older versions of patchelf limited the size of the files being processed and were fixed in this pr.
# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa
# So install a newer version here.

@ -214,6 +214,7 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
graph build_strategy
fast_threaded_ssa_graph_executor variable_helper)
cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS executor)
cc_library(prune SRCS prune.cc DEPS framework_proto boost)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry

@ -287,6 +287,7 @@ void DatasetImpl<T>::LocalShuffle() {
template <typename T>
void DatasetImpl<T>::GlobalShuffle(int thread_num) {
#ifdef PADDLE_WITH_PSLIB
VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin";
platform::Timer timeline;
timeline.Start();
@ -379,6 +380,7 @@ void DatasetImpl<T>::GlobalShuffle(int thread_num) {
timeline.Pause();
VLOG(3) << "DatasetImpl<T>::GlobalShuffle() end, cost time="
<< timeline.ElapsedSec() << " seconds";
#endif
}
template <typename T>

@ -41,8 +41,8 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
need_dump_field_ = false;
}
}
mpi_rank_ = trainer_desc.mpi_rank() / 2;
mpi_size_ = trainer_desc.mpi_size() / 2;
mpi_rank_ = trainer_desc.mpi_rank();
mpi_size_ = trainer_desc.mpi_size();
dump_file_num_ = trainer_desc.dump_file_num();
const std::vector<paddle::framework::DataFeed *> readers =
dataset->GetReaders();

@ -0,0 +1,56 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
#include <sstream>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/trainer.h"
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
namespace paddle {
namespace framework {
TEST(DisMultiTrainerTest, test1) {
#ifdef _LINUX
std::shared_ptr<DistMultiTrainer> tmp1 = std::make_shared<DistMultiTrainer>();
TrainerDesc t;
t.set_class_name("DistMultiTrainer");
t.set_device_worker_name("DownpourWorker");
t.set_thread_num(1);
auto* m = t.mutable_downpour_param()->add_program_config();
m->set_program_id("123");
std::string str;
str += "name: \"MultiSlotDataFeed\"\nbatch_size: 2\nmulti_slot_desc {\n";
str += "slots {\nname: \"words\"\ntype: \"uint64\"\nis_dense: false\n";
str += "is_used: true\n}\nslots {\nname: \"label\"\ntype: \"uint64\"\n";
str += "is_dense: false\nis_used: true\n}\n}\n";
std::shared_ptr<MultiSlotDataset> dataset =
std::make_shared<MultiSlotDataset>();
dataset->SetFileList(std::vector<std::string>());
dataset->SetThreadNum(1);
dataset->SetTrainerNum(1);
dataset->SetDataFeedDesc(str);
dataset->CreateReaders();
tmp1->Initialize(t, dataset.get());
#endif
}
} // namespace framework
} // namespace paddle

@ -21,6 +21,7 @@ HdfsStore::HdfsStore(const std::string& path) {
path_ = path;
wait_sleep_ms_ = 3000;
wait_timeout_ = std::chrono::seconds(999999999);
retry_times_ = 100;
}
void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
@ -33,10 +34,27 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(path);
}
int err_no = 0;
std::shared_ptr<FILE> fp = paddle::framework::fs_open_write(tmp, &err_no, "");
size_t write_count = fwrite_unlocked(data.data(), 1, data.size(), fp.get());
VLOG(3) << "HdfsStore::set write_count=" << write_count << " key " << key;
fp.reset();
for (int i = 1; i <= retry_times_; ++i) {
std::shared_ptr<FILE> fp =
paddle::framework::fs_open_write(tmp, &err_no, "");
if (err_no != 0) {
VLOG(0) << "fs_open_write failed, retry times " << i << " err no "
<< err_no;
fp.reset();
sleep(wait_sleep_ms_ / 1000);
continue;
}
size_t write_count = fwrite_unlocked(data.data(), 1, data.size(), fp.get());
if (write_count != data.size()) {
VLOG(0) << "fwrite_unlocked failed, retry times " << i << " write_count "
<< write_count << " data.size() " << data.size();
fp.reset();
sleep(2);
continue;
}
fp.reset();
break;
}
paddle::framework::fs_mv(tmp, path);
#endif
}
@ -131,7 +149,7 @@ void GlooWrapper::Init(int rank, int size, const std::string& path,
}
rank_ = rank;
size_ = size;
std::string cmd = std::string("hadoop fs");
std::string cmd = std::string("${HADOOP_HOME}/bin/hadoop fs");
cmd += " -D fs.default.name=" + fs_name;
cmd += " -D hadoop.job.ugi=" + fs_ugi;
paddle::framework::hdfs_set_command(cmd);
@ -149,16 +167,19 @@ void GlooWrapper::Init(int rank, int size, const std::string& path,
is_initialized_ = true;
}
template void GlooWrapper::AllReduce<int64_t>(
template std::vector<int64_t> GlooWrapper::AllReduce<int64_t>(
std::vector<int64_t>& sendbuf, // NOLINT
std::vector<int64_t>& recvbuf, // NOLINT
const std::string& mode);
template void GlooWrapper::AllReduce<double>(
template std::vector<double> GlooWrapper::AllReduce<double>(
std::vector<double>& sendbuf, // NOLINT
std::vector<double>& recvbuf, // NOLINT
const std::string& mode);
template std::vector<uint64_t> GlooWrapper::AllReduce<uint64_t>(
std::vector<uint64_t>& sendbuf, // NOLINT
const std::string& mode);
template std::vector<int64_t> GlooWrapper::AllGather<int64_t>(
int64_t& input); // NOLINT
template std::vector<uint64_t> GlooWrapper::AllGather<uint64_t>(
uint64_t& input); // NOLINT
template std::vector<double> GlooWrapper::AllGather<double>(
double& input); // NOLINT

@ -70,6 +70,7 @@ class HdfsStore {
std::string path_;
int wait_sleep_ms_;
std::chrono::seconds wait_timeout_;
int retry_times_;
};
} // namespace rendezvous
@ -107,9 +108,10 @@ class GlooWrapper {
}
template <typename T>
void AllReduce(std::vector<T>& sendbuf, std::vector<T>& recvbuf, // NOLINT
const std::string& mode = "sum") {
std::vector<T> AllReduce(std::vector<T>& sendbuf, // NOLINT
const std::string& mode = "sum") { // NOLINT
CHECK_EQ(is_initialized_, true);
std::vector<T> recvbuf(sendbuf.size(), T());
CHECK_EQ(sendbuf.size() == recvbuf.size(), true);
#ifdef PADDLE_WITH_GLOO
gloo::AllreduceOptions opts(context_);
@ -133,6 +135,7 @@ class GlooWrapper {
}
gloo::allreduce(opts);
#endif
return recvbuf;
}
template <typename T>

@ -49,8 +49,7 @@ TEST(TEST_GLOO, store_1) {
gw.Size();
gw.Barrier();
std::vector<double> input;
std::vector<double> output;
gw.AllReduce(input, output);
gw.AllReduce(input);
int64_t t;
gw.AllGather(t);
#endif

@ -37,12 +37,12 @@ void BindGlooWrapper(py::module* m) {
.def("rank", &framework::GlooWrapper::Rank)
.def("size", &framework::GlooWrapper::Size)
.def("barrier", &framework::GlooWrapper::Barrier)
.def("all_reduce", &framework::GlooWrapper::AllReduce<uint64_t>)
.def("all_reduce", &framework::GlooWrapper::AllReduce<int64_t>)
.def("all_reduce", &framework::GlooWrapper::AllReduce<double>)
.def("all_gather", &framework::GlooWrapper::AllGather<uint64_t>)
.def("all_gather", &framework::GlooWrapper::AllGather<int64_t>)
.def("all_gather", &framework::GlooWrapper::AllGather<double>)
.def("Allreduce", &framework::GlooWrapper::AllReduce<int64_t>)
.def("Allreduce", &framework::GlooWrapper::AllReduce<double>);
.def("all_gather", &framework::GlooWrapper::AllGather<double>);
} // end BindGlooWrapper
} // end namespace pybind
} // end namespace paddle

@ -526,7 +526,7 @@ class InMemoryDataset(DatasetBase):
"""
trainer_num = 1
if fleet is not None:
fleet._role_maker._barrier_worker()
fleet._role_maker.barrier_worker()
trainer_num = fleet.worker_num()
if self.fleet_send_batch_size is None:
self.fleet_send_batch_size = 1024
@ -537,14 +537,14 @@ class InMemoryDataset(DatasetBase):
self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
if fleet is not None:
fleet._role_maker._barrier_worker()
fleet._role_maker.barrier_worker()
self.dataset.global_shuffle(thread_num)
if fleet is not None:
fleet._role_maker._barrier_worker()
fleet._role_maker.barrier_worker()
if self.merge_by_lineid:
self.dataset.merge_by_lineid()
if fleet is not None:
fleet._role_maker._barrier_worker()
fleet._role_maker.barrier_worker()
def release_memory(self):
"""
@ -599,8 +599,8 @@ class InMemoryDataset(DatasetBase):
local_data_size = np.array([local_data_size])
if fleet is not None:
global_data_size = local_data_size * 0
fleet._role_maker._node_type_comm.Allreduce(local_data_size,
global_data_size)
fleet._role_maker.all_reduce_worker(local_data_size,
global_data_size)
return global_data_size[0]
return local_data_size[0]
@ -637,8 +637,8 @@ class InMemoryDataset(DatasetBase):
local_data_size = np.array([local_data_size])
if fleet is not None:
global_data_size = local_data_size * 0
fleet._role_maker._node_type_comm.Allreduce(local_data_size,
global_data_size)
fleet._role_maker.all_reduce_worker(local_data_size,
global_data_size)
return global_data_size[0]
return local_data_size[0]

@ -202,6 +202,22 @@ class Fleet(object):
self._role_maker.generate_role()
self._is_initialized = True
def all_reduce_worker(self, input, output):
"""
all reduce between workers, only support array of one dim.
Args:
input(list|numpy.array): array of one dim
output(list|numpy.array): array of one dim
"""
self._role_maker.all_reduce_worker(input, output)
def barrier_worker(self):
"""
barrier between workers
"""
self._role_maker.barrier_worker()
@abc.abstractmethod
def init_worker(self):
pass

File diff suppressed because it is too large Load Diff

@ -40,7 +40,9 @@ class PSLib(Fleet):
self._client2client_max_retry = 3
def init(self, role_maker=None):
super(PSLib, self).init(MPISymetricRoleMaker())
if role_maker is None:
role_maker = MPISymetricRoleMaker()
super(PSLib, self).init(role_maker)
self._fleet_ptr = fluid.core.Fleet()
def _set_client_communication_config(self, request_timeout_ms,
@ -75,9 +77,10 @@ class PSLib(Fleet):
# barrier_all for init_server, wait for server starts
self._role_maker._barrier_all()
self.all_ips_ = self._role_maker._all_gather(self._local_ip)
# worker_index * 2 is for compatible with older versions of pslib
self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
self._role_maker._get_size(),
self._role_maker._get_rank())
self._role_maker.worker_index() * 2)
# barrier_all for init_worker
self._role_maker._barrier_all()
# prepare for client to client communication
@ -160,9 +163,16 @@ class PSLib(Fleet):
else:
raise Exception(
"You should run DistributedOptimizer.minimize() first")
# server_index * 2 is for compatible with older versions of pslib
self._fleet_ptr.init_server(self._dist_desc_str,
self._role_maker._get_rank())
self._local_ip = self._fleet_ptr.run_server()
self._role_maker.server_index() * 2)
if isinstance(self._role_maker, MPISymetricRoleMaker):
self._local_ip = self._fleet_ptr.run_server()
else:
local_endpoint = self._role_maker.get_local_endpoint()
local_endpoint = local_endpoint.split(":")
self._local_ip = self._fleet_ptr.run_server(
str(local_endpoint[0]), int(local_endpoint[1]))
# barrier_all for init_server
self._role_maker._barrier_all()
@ -632,8 +642,8 @@ class DownpourOptimizer(DistributedOptimizer):
parameter_list,
no_grad_set,
self._strategy)
opt_info["mpi_rank"] = fleet._role_maker._get_rank()
opt_info["mpi_size"] = fleet._role_maker._get_size()
opt_info["mpi_rank"] = fleet.worker_index()
opt_info["mpi_size"] = fleet.worker_num()
fleet._set_opt_info(opt_info)
programs = [loss.block.program for loss in losses]

@ -206,7 +206,7 @@ class FleetUtil(object):
pos = pos.reshape(-1)
global_pos = np.copy(pos) * 0
# mpi allreduce
fleet._role_maker._node_type_comm.Allreduce(pos, global_pos)
fleet._role_maker._all_reduce(pos, global_pos)
# reshape to its original shape
global_pos = global_pos.reshape(old_pos_shape)
@ -215,7 +215,7 @@ class FleetUtil(object):
old_neg_shape = np.array(neg.shape)
neg = neg.reshape(-1)
global_neg = np.copy(neg) * 0
fleet._role_maker._node_type_comm.Allreduce(neg, global_neg)
fleet._role_maker._all_reduce(neg, global_neg)
global_neg = global_neg.reshape(old_neg_shape)
# calculate auc
@ -1350,7 +1350,7 @@ class FleetUtil(object):
pos = pos.reshape(-1)
global_pos = np.copy(pos) * 0
# mpi allreduce
fleet._role_maker._node_type_comm.Allreduce(pos, global_pos)
fleet._role_maker._all_reduce(pos, global_pos)
# reshape to its original shape
global_pos = global_pos.reshape(old_pos_shape)
# auc neg bucket
@ -1358,7 +1358,7 @@ class FleetUtil(object):
old_neg_shape = np.array(neg.shape)
neg = neg.reshape(-1)
global_neg = np.copy(neg) * 0
fleet._role_maker._node_type_comm.Allreduce(neg, global_neg)
fleet._role_maker._all_reduce(neg, global_neg)
global_neg = global_neg.reshape(old_neg_shape)
num_bucket = len(global_pos[0])
@ -1368,7 +1368,7 @@ class FleetUtil(object):
old_metric_shape = np.array(metric.shape)
metric = metric.reshape(-1)
global_metric = np.copy(metric) * 0
fleet._role_maker._node_type_comm.Allreduce(metric, global_metric)
fleet._role_maker._all_reduce(metric, global_metric)
global_metric = global_metric.reshape(old_metric_shape)
return global_metric[0]

@ -733,7 +733,7 @@ class TestDataset2(unittest.TestCase):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
try:
fleet.init(exe)
fleet.init()
except ImportError as e:
print("warning: no mpi4py")
adam = fluid.optimizer.Adam(learning_rate=0.000005)
@ -795,7 +795,7 @@ class TestDataset2(unittest.TestCase):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
try:
fleet.init(exe)
fleet.init()
except ImportError as e:
print("warning: no mpi4py")
adam = fluid.optimizer.Adam(learning_rate=0.000005)
@ -824,6 +824,10 @@ class TestDataset2(unittest.TestCase):
dataset.set_pipe_command("cat")
dataset.set_use_var(slots_vars)
dataset.load_into_memory()
try:
dataset.global_shuffle(fleet)
except:
print("warning: catch expected error")
fleet._opt_info = None
fleet._fleet_ptr = None

@ -11,36 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test cloud role maker."""
from __future__ import print_function
import os
import unittest
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
class TestCloudRoleMaker(unittest.TestCase):
"""
Test cases for PaddleCloudRoleMaker.
"""
def setUp(self):
"""Set up, set envs."""
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
def test_tr_rolemaker(self):
"""Test tr rolenamer."""
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
ro.generate_role()
self.assertTrue(ro.is_worker())
self.assertFalse(ro.is_server())
self.assertEqual(ro.worker_num(), 2)
def test_ps_rolemaker(self):
"""Test ps rolemaker."""
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
ro.generate_role()
self.assertFalse(ro.is_worker())
@ -48,10 +53,75 @@ class TestCloudRoleMaker(unittest.TestCase):
self.assertEqual(ro.worker_num(), 2)
def test_traing_role(self):
"""Test training role."""
os.environ["TRAINING_ROLE"] = "TEST"
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertRaises(ValueError, ro.generate_role)
def test_pslib_1(self):
"""Test cases for pslib."""
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
try:
import netifaces
except:
print("warning: no netifaces, skip test_pslib_1")
return
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36002"
os.environ["PADDLE_TRAINER_ID"] = "0"
role_maker = GeneralRoleMaker()
role_maker.generate_role()
place = fluid.CPUPlace()
exe = fluid.Executor(place)
fleet.init(role_maker)
train_program = fluid.Program()
startup_program = fluid.Program()
scope = fluid.Scope()
with fluid.program_guard(train_program, startup_program):
show = fluid.layers.data(name="show", shape=[-1, 1], \
dtype="float32", lod_level=1, append_batch_size=False)
fc = fluid.layers.fc(input=show, size=1, act=None)
label = fluid.layers.data(name="click", shape=[-1, 1], \
dtype="int64", lod_level=1, append_batch_size=False)
label_cast = fluid.layers.cast(label, dtype='float32')
cost = fluid.layers.log_loss(fc, label_cast)
try:
adam = fluid.optimizer.Adam(learning_rate=0.000005)
adam = fleet.distributed_optimizer(adam)
adam.minimize([cost], [scope])
fleet.run_server()
except:
print("do not support pslib test, skip")
return
from paddle.fluid.incubate.fleet.base.role_maker import \
MPISymetricRoleMaker
try:
role = MPISymetricRoleMaker()
role._all_reduce([1], [2])
except:
print("catch expected error of not inited")
try:
role = MPISymetricRoleMaker()
role._all_reduce([1], [2], "min")
except:
print("catch expected error of not inited")
try:
role = MPISymetricRoleMaker()
role._all_reduce([1], [2], "max")
except:
print("catch expected error of not inited")
try:
role = MPISymetricRoleMaker()
role._all_reduce([1], [2], "unknown")
except:
print("catch expected error of unknown type")
if __name__ == "__main__":
unittest.main()

Loading…
Cancel
Save