integrated HALF_ASYNC to communicator (#21869)

* add half_async in the communicator
* fix DistributedStrategy
revert-22710-feature/integrated_ps_api
tangwei12 6 years ago committed by GitHub
parent 1e932eccfa
commit 82bc814a57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -192,7 +192,7 @@ if(WITH_DISTRIBUTE)
data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer
lod_rank_table feed_fetch_method sendrecvop_rpc collective_helper ${GLOB_DISTRIBUTE_DEPS}
lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

@ -48,7 +48,7 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
using RpcCtxMap = operators::distributed::RpcCtxMap;
VLOG(3) << "ProcessGraph";
RpcCtxMap send_varname_to_ctx;
RpcCtxMap recv_varname_to_ctx;
for (auto &node : graphs[0]->Nodes()) {
VLOG(3) << "node name " << node->Name();
if (node && node->IsOp()) {
@ -74,30 +74,19 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
merge_add, use_send_handler);
VLOG(3) << "find and init an send op: "
<< send_varname_to_ctx[send_var_name];
} else if (node->Name() == "recv") {
auto recv_var_name = node->Op()->Output("Out")[0];
auto recv_varnames = boost::get<std::vector<std::string>>(
node->Op()->GetNullableAttr("recv_varnames"));
auto epmap = boost::get<std::vector<std::string>>(
node->Op()->GetNullableAttr("epmap"));
auto trainer_id =
boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(
recv_var_name, recv_varnames, epmap, {}, trainer_id);
VLOG(3) << "find and remove an recv op: "
<< recv_varname_to_ctx[recv_var_name];
}
}
}
// init communicator here
if (send_varname_to_ctx.size() > 0) {
VLOG(3) << "this is distribute mode, will use communicator";
auto *instance = operators::distributed::Communicator::InitInstance<
operators::distributed::AsyncCommunicator>(send_varname_to_ctx,
recv_varname_to_ctx, scope);
if (!instance->IsRunning()) instance->Start();
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
}
#endif
}

@ -179,6 +179,7 @@ class HogwildWorker : public CPUWorkerBase {
void CreateThreadScope(const ProgramDesc& program);
std::vector<std::string> op_names_;
std::vector<OperatorBase*> ops_;
bool thread_barrier_;
// Scope* thread_scope_;
HogwildWorkerParameter param_;
std::vector<std::string> skip_ops_;

@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/lodtensor_printer.h"
@ -29,6 +30,7 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) {
skip_ops_[i] = param_.skip_ops(i);
}
use_cvm_ = desc.use_cvm();
thread_barrier_ = desc.thread_barrier();
}
void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
@ -158,6 +160,12 @@ void HogwildWorker::TrainFilesWithProfiler() {
thread_scope_->DropKids();
timeline.Start();
}
#ifdef PADDLE_WITH_DISTRIBUTE
if (thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
}
#endif
}
void HogwildWorker::TrainFiles() {
@ -183,6 +191,12 @@ void HogwildWorker::TrainFiles() {
PrintFetchVars();
thread_scope_->DropKids();
}
#ifdef PADDLE_WITH_DISTRIBUTE
if (thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
}
#endif
}
void HogwildWorker::PrintFetchVars() {

@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/operators/distributed/distributed.h"
namespace paddle {
namespace framework {
@ -38,6 +39,14 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
thread_num_ = readers.size();
VLOG(3) << "worker thread num: " << thread_num_;
workers_.resize(thread_num_);
#ifdef PADDLE_WITH_DISTRIBUTE
if (trainer_desc.thread_barrier()) {
operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
thread_num_);
}
#endif
for (int i = 0; i < thread_num_; ++i) {
workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
trainer_desc.device_worker_name());

@ -47,6 +47,7 @@ message TrainerDesc {
// adjust ins weight
optional AdjustInsWeightConfig adjust_ins_weight_config = 20;
optional bool no_cvm = 21 [ default = false ];
optional bool thread_barrier = 22;
// device worker parameters
optional HogwildWorkerParameter hogwild_param = 101;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -17,6 +17,7 @@
#ifdef PADDLE_WITH_DISTRIBUTE
#ifdef PADDLE_WITH_GRPC
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"

@ -36,6 +36,13 @@ class SendBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto is_half_async = Attr<bool>("half_async");
if (is_half_async) {
distributed::Communicator::GetInstance()->Barrier();
return;
}
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
distributed::RPCClient* rpc_client =
@ -76,6 +83,12 @@ the Parameter Server would knew all variables have been sent.
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
AddAttr<bool>(
"half_async",
"(bool, default false)"
"half_async=True is for half_async mode, this will send signal "
"to HalfAsyncCommunicator Instance")
.SetDefault(false);
}
};

@ -48,12 +48,7 @@ class SendOp : public framework::OperatorBase {
auto use_send_handler = Attr<bool>("use_send_handler");
if (send_varnames.size() > 0) {
if (ins.size() > 1) {
distributed::Communicator::GetInstance()->Send(ins, send_varnames,
scope);
} else {
distributed::Communicator::GetInstance()->Send(ins[0], scope);
}
distributed::Communicator::GetInstance()->Send(ins, send_varnames, scope);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();

@ -27,10 +27,11 @@ limitations under the License. */
namespace py = pybind11;
using paddle::framework::ProgramDesc;
using paddle::operators::distributed::Communicator;
using paddle::framework::Scope;
using paddle::operators::distributed::AsyncCommunicator;
using paddle::operators::distributed::Communicator;
using paddle::operators::distributed::GeoSgdCommunicator;
using paddle::framework::Scope;
using paddle::operators::distributed::HalfAsyncCommunicator;
namespace paddle {
namespace pybind {
@ -39,29 +40,27 @@ void BindCommunicator(py::module* m) {
// Communicator is already used by nccl, change to DistCommunicator
py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
"DistCommunicator")
.def(py::init([](const ProgramDesc& program, Scope* param_scope,
std::map<std::string, int>& env_flags) {
VLOG(0) << "using communicator";
Communicator::InitInstance<AsyncCommunicator>(program, param_scope,
env_flags);
return Communicator::GetInstantcePtr();
}))
.def(py::init([](
const ProgramDesc& program, Scope* training_scope,
std::map<std::string,
std::map<std::string, std::vector<std::string>>>& vars_info,
int& trainers, int& geo_need_push_nums,
std::map<std::string, int>& env_flags) {
VLOG(0) << "using geo sgd communicator";
Communicator::InitInstance<GeoSgdCommunicator>(
program, training_scope, vars_info, trainers, geo_need_push_nums,
env_flags);
.def(py::init([](const std::string& mode, const ProgramDesc& program,
Scope* param_scope,
std::map<std::string, std::string>& envs) {
if (mode == "HALF_ASYNC") {
Communicator::InitInstance<HalfAsyncCommunicator>(program,
param_scope, envs);
} else if (mode == "ASYNC") {
Communicator::InitInstance<AsyncCommunicator>(program, param_scope,
envs);
} else if (mode == "GEO") {
Communicator::InitInstance<GeoSgdCommunicator>(program, param_scope,
envs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"unsuported communicator MODE"));
}
return Communicator::GetInstantcePtr();
}))
.def("stop", &Communicator::Stop)
.def("start", &Communicator::Start)
.def("is_running", &Communicator::IsRunning);
}
} // namespace pybind
} // namespace paddle

@ -199,17 +199,6 @@ def __bootstrap__():
read_env_flags.append('worker_update_interval_secs')
# env for communicator
read_env_flags.append('communicator_independent_recv_thread')
read_env_flags.append('communicator_send_queue_size')
read_env_flags.append('communicator_min_send_grad_num_before_recv')
read_env_flags.append('communicator_thread_pool_size')
read_env_flags.append('communicator_max_merge_var_num')
read_env_flags.append('communicator_merge_sparse_bucket')
read_env_flags.append('communicator_fake_rpc')
read_env_flags.append('communicator_send_wait_times')
read_env_flags.append('communicator_merge_sparse_grad')
read_env_flags.append('communicator_is_sgd_optimizer')
if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size')
#set brpc max body size

@ -19,17 +19,13 @@ It's a wrapper of a cpp class Communicator and should be used inside fleet API.
"""
from . import core
from .framework import Program
from .transpiler.distribute_transpiler import DistributedMode
__all__ = ['Communicator']
class Communicator(object):
def __init__(self,
program,
vars_info=None,
trainers=None,
geo_sgd_need_push_nums=None,
env_flags=None):
def __init__(self, program, mode, kwargs=None, envs={}):
"""
Communicator is used for async distribute training in distribute_transpiler mode.
It's a wrapper of a cpp class Communicator and should be used inside fleet API.
@ -56,20 +52,37 @@ class Communicator(object):
for op in program.block(0).ops:
if op.type == "recv":
op._set_attr('do_not_run', True)
# Todo: Add check
if env_flags is None:
env_flags = {}
if vars_info and trainers and geo_sgd_need_push_nums:
# for geo sgd
self.communicator_ = core.DistCommunicator(
program.desc,
global_scope(), vars_info, trainers, geo_sgd_need_push_nums,
env_flags)
else:
self.communicator_ = core.DistCommunicator(program.desc,
global_scope(),
env_flags)
if mode == DistributedMode.GEO:
push_vars = kwargs["push_vars"]
push_var_names = []
for k, vs in push_vars.items():
varnames = "&".join(vs["var_names"])
sections = "&".join([str(v) for v in vs["sections"]])
endpoints = "&".join(vs["epmap"])
is_sparse = "1" if vs["is_sparse"] else "0"
push_var_names.append(k)
envs[k] = "#".join([varnames, sections, endpoints, is_sparse])
envs["geo_trainer_nums"] = str(kwargs["trainers"])
envs["geo_need_push_nums"] = str(kwargs["push_nums"])
envs["geo_send_varnames"] = '#'.join(push_var_names)
mode_str = None
if mode == DistributedMode.SYNC:
mode_str = "SYNC"
elif mode == DistributedMode.ASYNC:
mode_str = "ASYNC"
elif mode == DistributedMode.HALF_ASYNC:
mode_str = "HALF_ASYNC"
elif mode == DistributedMode.GEO:
mode_str = "GEO"
self.communicator_ = core.DistCommunicator(mode_str, program.desc,
global_scope(), envs)
def start(self):
"""

@ -963,6 +963,7 @@ class Executor(object):
program._pipeline_opt)
else:
trainer = TrainerFactory()._create_trainer(program._fleet_opt)
trainer._set_thread_barrier(program._is_distributed)
trainer._set_program(program)
else:
if program._pipeline_opt:

@ -17,7 +17,6 @@ import warnings
"""
Convert the fluid program to distributed data-parallelism programs.
"""
from .distributed_strategy import *
import paddle.fluid.io as io
from paddle.fluid.communicator import Communicator
from paddle.fluid.framework import default_main_program
@ -27,8 +26,11 @@ from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from paddle.fluid.optimizer import Optimizer
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import TrainerRuntimeConfig, DistributedStrategy, SyncStrategy, AsyncStrategy, HalfAsyncStrategy, GeoStrategy, StrategyFactory
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspiler as OriginTranspiler
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig, DistributedMode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
@ -70,25 +72,39 @@ class DistributedTranspiler(Fleet):
program_config = self._transpile_config.get_program_config()
trainer_communicator_config = self._transpile_config.get_trainer_runtime_config(
)
if isinstance(self._transpile_config, SyncStrategy):
return
print(trainer_communicator_config)
need_communicator_flag = False
if isinstance(self._transpile_config, GeoStrategy):
need_communicator_flag = True
kwargs = {}
kwargs["push_vars"] = self.vars_info
kwargs["trainers"] = fleet.worker_num()
kwargs["push_nums"] = self._transpile_config.get_program_config(
).geo_sgd_need_push_nums
self._communicator = Communicator(
self.main_program, self.vars_info,
fleet.worker_num(), program_config.geo_sgd_need_push_nums,
self.main_program, DistributedMode.GEO, kwargs,
trainer_communicator_config.get_communicator_flags())
elif isinstance(self._transpile_config, AsyncStrategy):
need_communicator_flag = True
self._communicator = Communicator(
self.main_program,
env_flags=trainer_communicator_config.get_communicator_flags())
if need_communicator_flag:
if not self._communicator.is_running():
self._communicator.start()
else:
warnings.warn("communicator has been initialized, skip")
self.main_program, DistributedMode.ASYNC, None,
trainer_communicator_config.get_communicator_flags())
elif isinstance(self._transpile_config, HalfAsyncStrategy):
self._communicator = Communicator(
self.main_program, DistributedMode.HALF_ASYNC, None,
trainer_communicator_config.get_communicator_flags())
else:
raise TypeError("Training MODE do not supported")
if not self._communicator.is_running():
self._communicator.start()
else:
warnings.warn("communicator has been initialized, skip")
def init_server(self, model_dir=None):
"""
@ -139,12 +155,12 @@ class DistributedTranspiler(Fleet):
Returns:
None
"""
if isinstance(self._transpile_config, GeoStrategy) or isinstance(
self._transpile_config, AsyncStrategy):
if not isinstance(self._transpile_config, SyncStrategy):
self._communicator.stop()
self._executor.close()
if isinstance(self._role_maker, MPISymetricRoleMaker):
self._role_maker._finalize()
self._executor.close()
def distributed_optimizer(self, optimizer, strategy=None):
"""
@ -250,14 +266,22 @@ class DistributedTranspiler(Fleet):
io.save_persistables(executor, dirname, main_program, None)
def _transpile(self, config):
if isinstance(config, DistributeTranspilerConfig):
self._transpile_config = DistributedStrategy()
self._transpile_config.set_program_config(config)
elif isinstance(config, DistributedStrategy):
if isinstance(config, DistributedStrategy):
self._transpile_config = config
elif isinstance(config, DistributeTranspilerConfig):
if config.sync_mode:
self._transpile_config = SyncStrategy()
elif config.geo_sgd_mode:
self._transpile_config = GeoStrategy(
config.geo_sgd_need_push_nums)
elif config.runtime_split_send_recv and config.half_async:
self._transpile_config = HalfAsyncStrategy()
else:
self._transpile_config = AsyncStrategy()
self._transpile_config.set_program_config(config)
else:
raise TypeError(
"config must be an instance of DistributeTranspilerConfig or DistributedStrategy"
"config must be an instance of DistributeTranspilerConfig, SyncStrategy, HalfAsyncStrategy, AsyncStrategy or GeoStratey."
)
program_config = self._transpile_config.get_program_config()
@ -327,14 +351,12 @@ class TranspilerOptimizer(DistributedOptimizer):
super(TranspilerOptimizer, self).__init__(optimizer, strategy)
if strategy:
if isinstance(strategy, DistributedStrategy):
if isinstance(strategy, DistributeTranspilerConfig) or isinstance(
strategy, DistributedStrategy):
self._strategy = strategy
elif isinstance(strategy, DistributeTranspilerConfig):
self._strategy = DistributedStrategy()
self._strategy.set_program_config(strategy)
else:
raise TypeError(
"In {} mode, strategy must be an instance of DistributeTranspilerConfig or DistributedStrategy".
"In {} mode, strategy must be an instance of DistributeTranspilerConfig, SyncStrategy, HalfAsyncStrategy, AsyncStrategy, or GeoStrategy".
format(fleet._mode))
else:
self._strategy = DistributedStrategy()

@ -24,49 +24,51 @@ from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerCo
class TrainerRuntimeConfig(object):
def __init__(self):
self.max_merge_var_num = int(
os.getenv("FLAGS_communicator_max_merge_var_num", "20"))
self.send_queue_size = int(
os.getenv("FLAGS_communicator_send_queue_size", "20"))
self.independent_recv_thread = int(
os.getenv("FLAGS_communicator_independent_recv_thread", "1"))
self.min_send_grad_num_before_recv = int(
os.getenv("FLAGS_communicator_min_send_grad_num_before_recv", "20"))
self.thread_pool_size = int(
os.getenv("FLAGS_communicator_thread_pool_size", "5"))
self.send_wait_times = int(
os.getenv("FLAGS_communicator_send_wait_times", "5"))
self.fake_rpc = int(os.getenv("FLAGS_communicator_fake_rpc", "0"))
self.merge_sparse_grad = int(
os.getenv("FLAGS_communicator_merge_sparse_grad", "1"))
self.is_sgd_optimizer = int(
os.getenv("FLAGS_communicator_is_sgd_optimizer", "1"))
self.max_merge_var_num = os.getenv(
"FLAGS_communicator_max_merge_var_num", "20")
self.send_queue_size = os.getenv("FLAGS_communicator_send_queue_size",
"20")
self.independent_recv_thread = os.getenv(
"FLAGS_communicator_independent_recv_thread", "1")
self.min_send_grad_num_before_recv = os.getenv(
"FLAGS_communicator_min_send_grad_num_before_recv", "20")
self.thread_pool_size = os.getenv("FLAGS_communicator_thread_pool_size",
"5")
self.send_wait_times = os.getenv("FLAGS_communicator_send_wait_times",
"5")
self.fake_rpc = os.getenv("FLAGS_communicator_fake_rpc", "0")
self.merge_sparse_grad = os.getenv(
"FLAGS_communicator_merge_sparse_grad", "1")
self.is_sgd_optimizer = os.getenv("FLAGS_communicator_is_sgd_optimizer",
"1")
# not used
self._rpc_deadline = int(os.getenv("FLAGS_rpc_deadline", "180000"))
self._rpc_retry_times = int(os.getenv("FLAGS_rpc_retry_times", "3"))
self._rpc_deadline = os.getenv("FLAGS_rpc_deadline", "180000")
self._rpc_retry_times = os.getenv("FLAGS_rpc_retry_times", "3")
def get_communicator_flags(self):
_communicator_flags = dict()
_communicator_flags["max_merge_var_num"] = self.max_merge_var_num
_communicator_flags["send_queue_size"] = self.send_queue_size
_communicator_flags[
"independent_recv_thread"] = self.independent_recv_thread
"communicator_max_merge_var_num"] = self.max_merge_var_num
_communicator_flags[
"min_send_grad_num_before_recv"] = self.min_send_grad_num_before_recv
_communicator_flags["thread_pool_size"] = self.thread_pool_size
_communicator_flags["send_wait_times"] = self.send_wait_times
_communicator_flags["fake_rpc"] = self.fake_rpc
_communicator_flags["merge_sparse_grad"] = self.merge_sparse_grad
_communicator_flags["is_sgd_optimizer"] = self.is_sgd_optimizer
"communicator_send_queue_size"] = self.send_queue_size
_communicator_flags[
"communicator_independent_recv_thread"] = self.independent_recv_thread
_communicator_flags[
"communicator_min_send_grad_num_before_recv"] = self.min_send_grad_num_before_recv
_communicator_flags[
"communicator_thread_pool_size"] = self.thread_pool_size
_communicator_flags[
"communicator_send_wait_times"] = self.send_wait_times
_communicator_flags[
"communicator_is_sgd_optimizer"] = self.is_sgd_optimizer
return _communicator_flags
def __repr__(self):
_str = "please check that TrainerRuntimeConfig is as expected:\n"
_communicator_flags = self.get_communicator_flags()
for key in _communicator_flags:
_str += "communicator_{}: {}\n".format(key,
_communicator_flags[key])
_str += "{}: {}\n".format(key, _communicator_flags[key])
return _str
@ -193,8 +195,9 @@ class HalfAsyncStrategy(DistributedStrategy):
def __init__(self):
super(HalfAsyncStrategy, self).__init__()
self._program_config.sync_mode = False
self._program_config.runtime_split_send_recv = False
self._build_strategy.async_mode = False
self._program_config.runtime_split_send_recv = True
self._build_strategy.async_mode = True
self._program_config.half_async = True
class GeoStrategy(DistributedStrategy):
@ -202,9 +205,9 @@ class GeoStrategy(DistributedStrategy):
super(GeoStrategy, self).__init__()
self._program_config.sync_mode = False
self._program_config.runtime_split_send_recv = True
self._build_strategy.async_mode = True
self._program_config.geo_sgd_mode = True
self._program_config.geo_sgd_need_push_nums = update_frequency
self._build_strategy.async_mode = True
class StrategyFactory(object):

@ -1,10 +1,6 @@
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_communicator)
endif(NOT WITH_DISTRIBUTE)
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()

@ -20,6 +20,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
list(APPEND MIXED_DIST_TEST_OPS test_lookup_remote_table_op)
list(APPEND MIXED_DIST_TEST_OPS test_launch)
list(APPEND MIXED_DIST_TEST_OPS test_launch_ps)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_async)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@ -269,6 +273,9 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
py_test_modules(test_communicator_async MODULES test_communicator_async ENVS ${dist_ENVS})
py_test_modules(test_communicator_geo MODULES test_communicator_geo ENVS ${dist_ENVS})
py_test_modules(test_communicator_half_async MODULES test_communicator_half_async ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1)
if(WITH_DGC)
# if with dgc, test all dgc tests.
# NOTE. dist dgc tests is already in DIST_TEST_OPS

@ -110,8 +110,10 @@ class TestDistCTR2x2(FleetDistRunnerBase):
predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
acc = fluid.layers.accuracy(input=predict, label=label)
auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
label=label)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
@ -242,11 +244,13 @@ class TestDistCTR2x2(FleetDistRunnerBase):
debug=False)
pass_time = time.time() - pass_start
model_dir = tempfile.mkdtemp()
fleet.save_inference_model(
exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
if os.getenv("SAVE_MODEL") == "1":
model_dir = tempfile.mkdtemp()
fleet.save_inference_model(exe, model_dir,
[feed.name for feed in self.feeds],
self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()

@ -16,7 +16,10 @@ from __future__ import print_function
import unittest
import time
import threading
import numpy
import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
@ -35,7 +38,7 @@ class TestCommunicator(unittest.TestCase):
avg_cost = fluid.layers.mean(cost)
return avg_cost
def test_communicator_init_and_start(self):
def test_communicator_async(self):
role = role_maker.UserDefinedRoleMaker(
current_id=0,
role=role_maker.Role.WORKER,
@ -48,23 +51,15 @@ class TestCommunicator(unittest.TestCase):
optimizer = fluid.optimizer.SGD(0.01)
strategy = DistributeTranspilerConfig()
strategy.sync_mode = True
strategy.sync_mode = False
strategy.runtime_split_send_recv = True
strategy.wait_port = False
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
comm = Communicator(fleet.main_program)
comm.start()
fleet.init_worker()
time.sleep(10)
comm.stop()
class TestCommunicator2(unittest.TestCase):
def test_communicator_init_and_start(self):
prog = fluid.Program()
comm = Communicator(prog)
comm.start()
comm.stop()
fleet.stop_worker()
if __name__ == '__main__':

@ -0,0 +1,83 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import time
import threading
import numpy
import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
from paddle.fluid.transpiler.distribute_transpiler import DistributedMode
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
class TestCommunicator(unittest.TestCase):
def net(self):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
return avg_cost
def test_communicator_geo(self):
role = role_maker.UserDefinedRoleMaker(
current_id=0,
role=role_maker.Role.WORKER,
worker_num=2,
server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
fleet.init(role)
avg_cost = self.net()
optimizer = fluid.optimizer.SGD(0.01)
strategy = DistributeTranspilerConfig()
strategy.sync_mode = False
strategy.runtime_split_send_recv = True
strategy.geo_sgd_mode = True
strategy.wait_port = False
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
fleet.init_worker()
time.sleep(10)
fleet.stop_worker()
# class TestCommunicatorGEO(unittest.TestCase):
# def test_communicator_init_and_start(self):
# prog = fluid.Program()
# envs = {}
# envs["communicator_thread_pool_size"] = "5"
# envs["communicator_send_wait_times"] = "5"
# kwargs = {}
# kwargs["push_vars"] = {}
# kwargs["trainers"] = 10
# kwargs["push_nums"] = 10
# comm = Communicator(prog, DistributedMode.GEO, kwargs, envs)
if __name__ == '__main__':
unittest.main()

@ -0,0 +1,177 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import time
import threading
import subprocess
import unittest
import numpy
import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.transpiler.distribute_transpiler import DistributedMode
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
def net(self):
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
return avg_cost, x, y
def fake_reader(self):
def reader():
for i in range(10000):
x = numpy.random.random((1, 13)).astype('float32')
y = numpy.random.randint(0, 2, (1, 1)).astype('int64')
yield x, y
return reader
def run_pserver(self, role, strategy):
fleet.init(role)
avg_cost, x, y = self.net()
optimizer = fluid.optimizer.SGD(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
fleet.init_server()
fleet.run_server()
def run_trainer(self, role, strategy):
place = fluid.core.CPUPlace()
exe = fluid.Executor(place)
fleet.init(role)
avg_cost, x, y = self.net()
optimizer = fluid.optimizer.SGD(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
exe.run(fleet.startup_program)
fleet.init_worker()
train_reader = paddle.batch(self.fake_reader(), batch_size=24)
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
for batch_id, data in enumerate(train_reader()):
exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[])
fleet.stop_worker()
def run_ut(self):
strategy = DistributeTranspilerConfig()
strategy.sync_mode = False
strategy.runtime_split_send_recv = True
strategy.half_async = True
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
role = role_maker.UserDefinedRoleMaker(
current_id=0,
role=role_maker.Role.WORKER
if training_role == "TRAINER" else role_maker.Role.SERVER,
worker_num=2,
server_endpoints=["127.0.0.1:6002"])
if training_role == "TRAINER":
self.run_trainer(role, strategy)
else:
self.run_pserver(role, strategy)
def test_communicator(self):
run_server_cmd = """
from __future__ import print_function
import sys
import os
import time
import threading
import subprocess
import unittest
import numpy
import paddle
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator
from paddle.fluid.communicator import DistributedMode
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
class RunServer(TestCommunicatorHalfAsyncEnd2End):
def runTest(self):
pass
os.environ["TRAINING_ROLE"] = "PSERVER"
half_run_server = RunServer()
half_run_server.run_ut()
"""
server_file = "run_server_for_communicator_haflaysnc.py"
with open(server_file, "w") as wb:
wb.write(run_server_cmd)
os.environ["TRAINING_ROLE"] = "PSERVER"
_python = sys.executable
ps_cmd = "{} {}".format(_python, server_file)
ps_proc = subprocess.Popen(
ps_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["FLAGS_communicator_send_queue_size"] = "1"
os.environ["FLAGS_communicator_max_merge_var_num"] = "1"
self.run_ut()
ps_proc.kill()
if os.path.exists(server_file):
os.remove(server_file)
# class TestCommunicatorHalfAsync2(unittest.TestCase):
# def test_communicator_init_and_start(self):
# prog = fluid.Program()
# envs = {}
# envs["communicator_send_queue_size"] = "12"
# envs["communicator_max_merge_var_num"] = "12"
# envs["communicator_thread_pool_size"] = "5"
# envs["communicator_send_wait_times"] = "5"
# comm = Communicator(prog, DistributedMode.HALF_ASYNC, None, envs)
# comm.start()
# time.sleep(10)
# comm.stop()
if __name__ == '__main__':
unittest.main()

@ -32,7 +32,6 @@ class TestDistCTR2x2(TestDistBase):
"dist_ctr.py", delta=1e-2, check_error_log=True, log_name=flag_name)
@unittest.skip(reason="Skip unstable ci")
class TestDistCTRWithL2Decay2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
@ -48,6 +47,7 @@ class TestDistCTRWithL2Decay2x2(TestDistBase):
log_name=flag_name)
@unittest.skip(reason="Skip unstable ci")
class TestDistCTR2x2_ASYNC(TestDistBase):
def _setup_config(self):
self._sync_mode = False
@ -69,6 +69,7 @@ class TestDistCTR2x2_ASYNC(TestDistBase):
log_name=flag_name)
@unittest.skip(reason="Skip unstable ci")
class TestDistCTR2x2_ASYNCWithLRDecay2x2(TestDistBase):
def _setup_config(self):
self._sync_mode = False
@ -91,6 +92,7 @@ class TestDistCTR2x2_ASYNCWithLRDecay2x2(TestDistBase):
log_name=flag_name)
@unittest.skip(reason="Skip unstable ci")
class TestDistCTR2x2_ASYNC2(TestDistBase):
def _setup_config(self):
self._sync_mode = False

@ -53,7 +53,22 @@ class FleetDistRunnerBase(object):
do training : exe run program
"""
def generate_strategy(self, args):
def build_role(self, args):
if args.role.upper() == "PSERVER":
role = role_maker.UserDefinedRoleMaker(
current_id=args.current_id,
role=role_maker.Role.SERVER,
worker_num=args.trainers,
server_endpoints=args.endpoints.split(","))
else:
role = role_maker.UserDefinedRoleMaker(
current_id=args.current_id,
role=role_maker.Role.WORKER,
worker_num=args.trainers,
server_endpoints=args.endpoints.split(","))
return role
def build_strategy(self, args):
self.strategy = None
if args.mode == "async":
self.strategy = StrategyFactory.create_async_strategy()
@ -66,22 +81,7 @@ class FleetDistRunnerBase(object):
args.geo_sgd_need_push_nums)
return self.strategy
def run_pserver(self, args):
if args.role.upper() != "PSERVER":
raise ValueError("args role must be PSERVER")
role = role_maker.UserDefinedRoleMaker(
current_id=args.current_id,
role=role_maker.Role.SERVER,
worker_num=args.trainers,
server_endpoints=args.endpoints.split(","))
fleet.init(role)
strategy = self.generate_strategy(args)
avg_cost = self.net()
def build_optimizer(self, avg_cost, strategy):
use_grad_clip = int(os.getenv('GRAD_CLIP', 0))
if use_grad_clip:
# 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
@ -99,70 +99,33 @@ class FleetDistRunnerBase(object):
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
def run_pserver(self, args):
fleet.init(self.build_role(args))
strategy = self.build_strategy(args)
avg_cost = self.net()
self.build_optimizer(avg_cost, strategy)
fleet.init_server()
fleet.run_server()
def run_dataset_trainer(self, args):
if args.role.upper() != "TRAINER":
raise ValueError("args role must be TRAINER")
role = role_maker.UserDefinedRoleMaker(
current_id=args.current_id,
role=role_maker.Role.WORKER,
worker_num=args.trainers,
server_endpoints=args.endpoints.split(","))
fleet.init(role)
strategy = self.generate_strategy(args)
fleet.init(self.build_role(args))
strategy = self.build_strategy(args)
avg_cost = self.net()
use_grad_clip = int(os.getenv('GRAD_CLIP', 0))
if use_grad_clip:
# 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
if use_grad_clip == 1:
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByValue(2.0))
elif use_grad_clip == 2:
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByNorm(2.0))
elif use_grad_clip == 3:
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(2.0))
optimizer = fluid.optimizer.SGD(LEARNING_RATE)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
self.build_optimizer(avg_cost, strategy)
out = self.do_dataset_training(fleet)
def run_pyreader_trainer(self, args):
if args.role.upper() != "TRAINER":
raise ValueError("args role must be TRAINER")
role = role_maker.UserDefinedRoleMaker(
current_id=args.current_id,
role=role_maker.Role.WORKER,
worker_num=args.trainers,
server_endpoints=args.endpoints.split(","))
fleet.init(role)
strategy = self.generate_strategy(args)
fleet.init(self.build_role(args))
strategy = self.build_strategy(args)
avg_cost = self.net()
self.reader = fluid.io.PyReader(
feed_list=self.feeds,
capacity=64,
iterable=False,
use_double_buffer=False)
optimizer = fluid.optimizer.SGD(LEARNING_RATE)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
self.build_optimizer(avg_cost, strategy)
out = self.do_pyreader_training(fleet)
def net(self, batch_size=4, lr=0.01):
@ -263,7 +226,7 @@ class TestFleetBase(unittest.TestCase):
return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe
def _run_cluster(self, model, envs):
env = {'CPU_NUM': '1', 'GRAD_CLIP': str(self._grad_clip_mode)}
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
env.update(envs)
python_path = self._python_interp
@ -307,29 +270,6 @@ class TestFleetBase(unittest.TestCase):
ps0.terminate()
ps1.terminate()
'''
with open("/tmp/tr0_out.log", "wb+") as wn:
wn.write(tr0_out)
with open("/tmp/tr1_out.log", "wb+") as wn:
wn.write(tr1_out)
# print server log
'''
# print server log
'''
with open("/tmp/ps0_err.log", "r") as fn:
sys.stderr.write("ps0 stderr: %s\n" % fn.read())
with open("/tmp/ps1_err.log", "r") as fn:
sys.stderr.write("ps1 stderr: %s\n" % fn.read())
'''
# print log
'''
with open("/tmp/tr0_err.log", "r") as fn:
sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
with open("/tmp/tr1_err.log", "r") as fn:
sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
'''
return 0, 0

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save