【paddle.fleet】Update fleetrun & ps-heter (#27472)

* refine fleetrun.ps_launch

* update fleet run for multi device support

* ps_graph support ps-gpu

* fix heter save

* add heter save unittest

* fix unittest & simple code

* update fleetrun

* fix fleetrun

* fix launch barrier

* fix role maker

* add paddlecloud rolemaker unittest

* rename heter_worker_device_guard
Branch: my_2.0rc
Author: Chengmo (committed via GitHub)
Parent: bbc837ee72
Commit: c5f2802d56

@@ -98,6 +98,7 @@ message AsyncConfig {
   optional int32 send_wait_times = 7 [ default = 1 ];
   optional bool runtime_split_send_recv = 8 [ default = false ];
   optional bool launch_barrier = 9 [ default = true ];
+  optional string heter_worker_device_guard = 10 [ default = 'cpu' ];
 }

 message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
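The new heter_worker_device_guard option is surfaced to users through DistributedStrategy.a_sync_configs, as the unit test added further down exercises. A minimal sketch of setting it, assuming an otherwise standard fleet setup (model and training code omitted):

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    # heter_worker_device_guard picks the device the heter worker runs on;
    # it defaults to 'cpu' per the proto default above.
    strategy.a_sync_configs = {
        "launch_barrier": True,
        "heter_worker_device_guard": "gpu",  # or "cpu" / "xpu"
    }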

@@ -530,13 +530,6 @@ class RoleMakerBase(object):
         return self._heter_trainer_endpoints[(self._current_id) %
                                              self._heter_worker_num()]

-    def _get_heter_worker_device(self):
-        """
-        Returns:
-            string: heter_trainer's device of current node, e.g: CPU/GPU/XPU
-        """
-        return self._heter_trainer_device.upper()
-
 class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):
def __init__(self, is_collective=False, **kwargs):
@@ -677,88 +670,99 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         return self._role == Role.HETER_WORKER

     def _ps_env(self):
-        try:
-            # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
-            # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
-            self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
-
-            if self._server_endpoints is None:
-                # back to non_distributed execution.
-                self._server_endpoints = ""
-                self._trainers_num = 1
-                self._role = Role.WORKER
-                self._current_id = 0
-                self._nodes_num = 1
-                self._heter_trainers_num = 0
-                self._heter_trainer_endpoints = None
-                self._non_distributed = True
-                return
-
-            self._server_endpoints = self._server_endpoints.split(",")
-
-            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-            if self._worker_endpoints:
-                self._worker_endpoints = self._worker_endpoints.split(",")
-            else:
-                self._worker_endpoints = []
-
-            trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
-            training_role = os.environ["TRAINING_ROLE"]
-
-            if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
-                raise ValueError(
-                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
-                    format(training_role))
-
-            # For heter parameter server env setting
-            heter_trainer_eplist = os.getenv(
-                "PADDLE_HETER_TRAINER_IP_PORT_LIST", None)
-            heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE",
-                                             None)
-            if heter_trainer_eplist and heter_trainer_device:
-                try:
-                    heter_trainer_eplist = os.environ[
-                        "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
-                except:
-                    raise ValueError(
-                        "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
-                    )
-
-                self._is_heter_parameter_server_mode = True
-                heter_trainers_num = len(heter_trainer_eplist)
-                current_node_device = heter_trainer_device.upper()
-                if current_node_device not in ["CPU", "GPU", "XPU"]:
-                    raise ValueError(
-                        "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)".
-                        format(heter_trainer_device))
-                self._heter_trainer_device = current_node_device
-            else:
-                self._is_heter_parameter_server_mode = False
-                heter_trainers_num = 0
-
-            if training_role == "TRAINER":
-                role = Role.WORKER
-                current_id = int(os.environ["PADDLE_TRAINER_ID"])
-                if len(self._worker_endpoints) > 0:
-                    self._cur_endpoint = self._worker_endpoints[current_id]
-            elif training_role == "PSERVER":
-                role = Role.SERVER
-                port = os.environ["PADDLE_PORT"]
-                ip = os.environ["POD_IP"]
-                self._cur_endpoint = ip + ":" + port
-                current_id = self._server_endpoints.index(self._cur_endpoint)
-            elif training_role == "HETER_TRAINER":
-                role = Role.HETER_WORKER
-                cur_ip = os.environ["POD_IP"]
-                cur_port = os.environ["PADDLE_PORT"]
-                curr_endpoint = ":".join([cur_ip, cur_port])
-                current_id = heter_trainer_eplist.index(curr_endpoint)
-            else:
-                raise ValueError(
-                    "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER")
-        except ValueError as e:
-            raise ValueError(
-                "Something wrong with PaddleCloud, please check environment")
+        # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
+        # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
+        self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", None)
+
+        if self._server_endpoints is None:
+            # back to non_distributed execution.
+            self._server_endpoints = ""
+            self._trainers_num = 1
+            self._role = Role.WORKER
+            self._current_id = 0
+            self._nodes_num = 1
+            self._heter_trainers_num = 0
+            self._heter_trainer_endpoints = None
+            self._non_distributed = True
+            return
+
+        self._server_endpoints = self._server_endpoints.split(",")
+
+        self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None)
+        if self._worker_endpoints != None:
+            self._worker_endpoints = self._worker_endpoints.split(",")
+        else:
+            self._worker_endpoints = []
+
+        trainers_num = os.getenv("PADDLE_TRAINERS_NUM", None)
+        if trainers_num == None:
+            raise ValueError(
+                "Can not find PADDLE_TRAINERS_NUM, please check your environment."
+            )
+        trainers_num = int(trainers_num)
+
+        training_role = os.getenv("TRAINING_ROLE", None)
+        if training_role == None:
+            raise ValueError(
+                "Can not find TRAINING_ROLE, please check your environment.")
+
+        if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]:
+            raise ValueError(
+                "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.".
+                format(training_role))
+
+        # For heter parameter server env setting
+        heter_trainer_eplist = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST",
+                                         "")
+        if heter_trainer_eplist != "":
+            try:
+                heter_trainer_eplist = os.environ[
+                    "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",")
+            except:
+                raise ValueError(
+                    "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ."
+                )
+
+            self._is_heter_parameter_server_mode = True
+            heter_trainers_num = len(heter_trainer_eplist)
+        else:
+            self._is_heter_parameter_server_mode = False
+            heter_trainers_num = 0
+
+        if training_role == "TRAINER":
+            role = Role.WORKER
+            current_id = os.getenv("PADDLE_TRAINER_ID", None)
+            if current_id == None:
+                raise ValueError(
+                    "Can not find PADDLE_TRAINER_ID, please check your environment."
+                )
+            current_id = int(current_id)
+            if len(self._worker_endpoints) > 0:
+                self._cur_endpoint = self._worker_endpoints[current_id]
+        elif training_role == "PSERVER":
+            role = Role.SERVER
+            port = os.getenv("PADDLE_PORT", None)
+            if port == None:
+                raise ValueError(
+                    "Can not find PADDLE_PORT, please check your environment.")
+            ip = os.getenv("POD_IP", None)
+            if ip == None:
+                raise ValueError(
+                    "Can not find POD_IP, please check your environment.")
+            self._cur_endpoint = ip + ":" + port
+            current_id = self._server_endpoints.index(self._cur_endpoint)
+        elif training_role == "HETER_TRAINER":
+            role = Role.HETER_WORKER
+            cur_port = os.getenv("PADDLE_PORT", None)
+            if cur_port == None:
+                raise ValueError(
+                    "Can not find PADDLE_PORT, please check your environment.")
+            cur_ip = os.getenv("POD_IP", None)
+            if cur_ip == None:
+                raise ValueError(
+                    "Can not find POD_IP, please check your environment.")
+            curr_endpoint = ":".join([cur_ip, cur_port])
+            current_id = heter_trainer_eplist.index(curr_endpoint)

         self._trainers_num = trainers_num
         self._role = role
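For reference, a sketch of the environment a parameter-server node now has to provide before role generation succeeds; each variable below corresponds to one of the checks added above, the values are illustrative only, and `_generate_role` is called directly here only because the unit tests added in this commit do the same:

    import os
    import paddle.distributed.fleet.base.role_maker as role_maker

    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
    os.environ["PADDLE_TRAINERS_NUM"] = "2"
    os.environ["TRAINING_ROLE"] = "PSERVER"
    os.environ["PADDLE_PORT"] = "4001"
    os.environ["POD_IP"] = "127.0.0.1"

    ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
    ro._generate_role()  # raises ValueError if any variable above is missing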

[File diff suppressed because it is too large]

[File diff suppressed because it is too large]

@@ -74,6 +74,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
             _startup = worker.delet_extra_optimizes_pass(_startup,
                                                          compiled_config)

+            compiled_config.set_origin_ps_main_program(_main)
+            compiled_config.set_origin_ps_startup_program(_startup)

             # for heter program
             if self.role_maker._is_heter_parameter_server_mode:
                 from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker
@@ -91,6 +93,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
         else:
             _main = worker.append_send_ops_pass(_main, compiled_config)
             _startup = _startup

+            compiled_config.set_origin_ps_main_program(_main)
+            compiled_config.set_origin_ps_startup_program(_startup)

         return _main, _startup

@@ -210,18 +210,23 @@ class ParameterServerRuntime(RuntimeBase):
             warnings.warn("communicator has been initialized, skip")

     def _get_executor(self):
-        if self.role_maker._is_heter_worker():
-            if self.role_maker._get_heter_worker_device() == "GPU":
-                gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
-                executor = Executor(fluid.CUDAPlace(gpu_id))
-            elif self.role_maker._get_heter_worker_device() == "XPU":
-                xpu_id = int(os.getenv("FLAGS_selected_xpus", "0"))
-                executor = Executor(fluid.XPUPlace(xpu_id))
-            else:
-                raise ValueError("Not Support Device {}".format(
-                    self.role_maker._get_heter_worker_device()))
-        else:
-            executor = fluid.Executor(fluid.CPUPlace())
+        executor = fluid.Executor(fluid.CPUPlace())
+        if self.role_maker._is_heter_parameter_server_mode:
+            heter_worker_device_guard = self.context[
+                "valid_strategy"].a_sync_configs[
+                    "heter_worker_device_guard"].upper()
+            if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]:
+                raise ValueError("Heter Worker Not Support Device {}".format(
+                    heter_worker_device_guard))
+            if self.role_maker._is_heter_worker():
+                if heter_worker_device_guard == "GPU":
+                    executor = Executor(
+                        fluid.CUDAPlace(
+                            int(os.getenv("FLAGS_selected_gpus", "0"))))
+                elif heter_worker_device_guard == "XPU":
+                    executor = Executor(
+                        fluid.XPUPlace(
+                            int(os.getenv("FLAGS_selected_xpus", "0"))))
         return executor

     def _init_server(self, *args, **kwargs):
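Read as a whole, _get_executor now defaults every role to CPU and only moves a heter worker onto the device named by the strategy. A standalone sketch of that decision under the same assumptions (fleetrun exporting FLAGS_selected_gpus / FLAGS_selected_xpus); pick_heter_place is a hypothetical helper for illustration, not part of the API:

    import os
    import paddle.fluid as fluid

    def pick_heter_place(device_guard, is_heter_worker):
        # Mirrors _get_executor: CPU unless this node is a heter worker
        # whose heter_worker_device_guard pins it to GPU / XPU.
        device_guard = device_guard.upper()
        if device_guard not in ["GPU", "XPU", "CPU"]:
            raise ValueError(
                "Heter Worker Not Support Device {}".format(device_guard))
        if is_heter_worker and device_guard == "GPU":
            # fleetrun sets FLAGS_selected_gpus per heter worker process
            return fluid.CUDAPlace(int(os.getenv("FLAGS_selected_gpus", "0")))
        if is_heter_worker and device_guard == "XPU":
            return fluid.XPUPlace(int(os.getenv("FLAGS_selected_xpus", "0")))
        return fluid.CPUPlace()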
@@ -233,12 +238,14 @@ class ParameterServerRuntime(RuntimeBase):
         model_dirname = None

         executor = self._get_executor()
+        if self.role_maker._is_heter_worker() and self.context[
+                "valid_strategy"].a_sync_configs["launch_barrier"]:
+            # for heter trainer wait server ready
+            wait_server_ready(self.role_maker._get_pserver_endpoints())
         executor.run(fluid.default_startup_program())

         if self.role_maker._is_heter_worker():
             self._init_worker()
-
-        if self.role_maker._is_heter_worker():
             return

         if not model_dirname:
@@ -470,13 +477,13 @@ class ParameterServerRuntime(RuntimeBase):
     def _save_distributed_persistables(self, executor, dirname, main_program):
         dense_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=1)
+            recv_type=1, use_origin_program=True)

         sparse_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=2)
+            recv_type=2, use_origin_program=True)

         distributed_ctx = self.compiled_strategy.get_communicator_recv_context(
-            recv_type=3)
+            recv_type=3, use_origin_program=True)

         recv_dense_varnames = self._save_dense_params(executor, dirname,
                                                       dense_ctx, main_program)
@@ -528,7 +535,7 @@ class ParameterServerRuntime(RuntimeBase):
             )

         if main_program is None:
-            main_program = fluid.default_main_program()
+            main_program = self.compiled_strategy.get_origin_ps_main_program()

         if isinstance(main_program, CompiledProgram):
             raise TypeError(
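With the fallback above, a worker can call fleet.save_persistables without passing a program, even though the trainer passes have rewritten fluid.default_main_program() by that point. A condensed sketch of the calling pattern, mirroring the heter CTR test added below (`exe` is the worker's executor, created elsewhere):

    import shutil
    import tempfile
    import paddle.distributed.fleet as fleet

    if fleet.is_first_worker():
        model_path = tempfile.mkdtemp()
        # main_program now defaults to the origin PS main program
        fleet.save_persistables(executor=exe, dirname=model_path)
        shutil.rmtree(model_path)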

@@ -133,6 +133,8 @@ class CompileTimeStrategy(object):
         self.origin_main_program = main_program
         self.origin_startup_program = startup_program
+        self.origin_ps_main_program = main_program
+        self.origin_ps_startup_program = startup_program

         self.strategy = strategy
         self.role_maker = role_maker
@@ -153,6 +155,11 @@ class CompileTimeStrategy(object):
         self._build_var_distributed()

+        # for heter-ps save variables
+        self.origin_merged_variables_pairs = list(self.merged_variables_pairs)
+        self.origin_merged_dense_pairs = list(self.merged_dense_pairs)
+        self.origin_merged_sparse_pairs = list(self.merged_sparse_pairs)
+
     def get_distributed_mode(self):
         trainer = self.strategy.get_trainer_runtime_config()
         return trainer.mode
@@ -214,6 +221,18 @@ class CompileTimeStrategy(object):
     def get_origin_startup_program(self):
         return self.origin_startup_program

+    def set_origin_ps_main_program(self, program):
+        self.origin_ps_main_program = program
+
+    def set_origin_ps_startup_program(self, program):
+        self.origin_ps_startup_program = program
+
+    def get_origin_ps_main_program(self):
+        return self.origin_ps_main_program
+
+    def get_origin_ps_startup_program(self):
+        return self.origin_ps_startup_program
+
     def get_sparse_varname_on_ps(self, is_distributed, endpoint=None):
         if not endpoint:
             endpoint = self.get_ps_endpoint()
@@ -378,7 +397,9 @@ class CompileTimeStrategy(object):
             send_ctx[name] = ctx
         return send_ctx

-    def get_communicator_recv_context(self, recv_type=1):
+    def get_communicator_recv_context(self,
+                                      recv_type=1,
+                                      use_origin_program=False):
         # recv_type
         # 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. ALL
         distibuted_varnames = get_sparse_tablenames(self.origin_main_program,
@@ -392,7 +413,8 @@ class CompileTimeStrategy(object):
         sparse_recv_ctx = {}
         distributed_recv_ctx = {}

-        for merged in self.merged_variables_pairs:
+        variables_pairs = self.merged_variables_pairs if not use_origin_program else self.origin_merged_variables_pairs
+        for merged in variables_pairs:
             params = merged[0]
             if params.merged_var.name in sparse_varnames:
                 continue

@@ -169,6 +169,10 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
             except fluid.core.EOFException:
                 self.reader.reset()

+        if fleet.is_first_worker():
+            model_path = tempfile.mkdtemp()
+            fleet.save_persistables(executor=exe, dirname=model_path)
+            shutil.rmtree(model_path)
         fleet.stop_worker()

     def do_dataset_training(self, fleet):

@@ -20,8 +20,12 @@ from paddle.fluid.incubate.fleet.base import role_maker
 input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
 input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
+input_y = fluid.layers.cast(input_y, dtype="float32")
+
+with fluid.device_guard("gpu"):
+    input_y = fluid.layers.cast(input_y, dtype="int64")
+    cost = mlp(input_x, input_y)

-cost = mlp(input_x, input_y)
 optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)

 role = role_maker.PaddleCloudRoleMaker()
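In the test program above, fluid.device_guard("gpu") is what marks the cast and the mlp cost for the heter worker; the heter pipeline then splits guarded ops out of the CPU trainer program (see the heter_trainer_pass import earlier in this diff). A minimal sketch of the marking mechanism itself, with illustrative layer sizes:

    import paddle.fluid as fluid

    x = fluid.layers.data(name="x", shape=[32], dtype='float32')
    with fluid.device_guard("cpu"):    # stays with the CPU trainer
        fc = fluid.layers.fc(input=x, size=64)
    with fluid.device_guard("gpu"):    # carved out for the heter worker
        loss = fluid.layers.reduce_mean(fc)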

@@ -288,7 +288,7 @@ class TestFleetHeterBase(unittest.TestCase):
         print("tr end communicate")

         tr0_ret = tr0.returncode
-        tr1_ret = tr0.returncode
+        tr1_ret = tr1.returncode

         # close trainer file
         tr0_pipe.close()

@@ -50,6 +50,10 @@ class TestDistFleetHeterProgram(unittest.TestCase):
     def build_strategy(self):
         self.strategy = paddle.distributed.fleet.DistributedStrategy()
         self.strategy.a_sync = True
+        self.strategy.a_sync_configs = {
+            "launch_barrier": False,
+            "heter_worker_device_guard": "gpu"
+        }
         return self.strategy

     def build_input(self):

@@ -28,13 +28,27 @@ function test_launch_ps(){
     fi
 }

+function test_launch_ps_heter(){
+    fleetrun --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog
+    if grep -q "server are killed" ut.elog; then
+        echo "test heter pserver launch succeed"
+    else
+        echo "test pserver launch failed"
+        exit -1
+    fi
+}
+
+if [[ ${WITH_GPU} == "OFF" ]]; then
+    echo "in cpu test mode"
+    test_launch_ps
+    exit 0
+fi
+
+echo "No.1 unittest"
 test_launch_ps
+test_launch_ps_heter

 # use default values
+echo "No.2 unittest"
 fleetrun multi_process.py fleetrun

 # use paddlecloud
@@ -48,6 +62,7 @@ export PADDLE_TRAINER_ID=0
 export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2

+echo "No.3 unittest"
 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
@@ -83,7 +98,7 @@ fi
 unset PADDLE_PORT
 export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171

-echo ""
+echo "No.4 unittest"
 echo "paddle.distributed.launch async poll process test"
 if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
@@ -112,5 +127,6 @@ rm -rf $file_0_0 $file_0_1
 distributed_args="--gpus=0,1 --log_dir=testlog"
 export PADDLE_LAUNCH_LOG="test_launch_filelock_0"
+echo "No.5 unittest"
 CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py
 str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071"

@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import platform
+import shutil
+import tempfile
+import unittest
+
+import paddle
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestPSCloudRoleMakerCase1(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+
+    def test_paddle_trainers_num(self):
+        # PADDLE_TRAINERS_NUM
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase2(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+
+    def test_training_role(self):
+        # TRAINING_ROLE
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase3(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'TRAINER'
+
+    def test_trainer_id(self):
+        # PADDLE_TRAINER_ID
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase4(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'PSERVER'
+
+    def test_ps_port(self):
+        # PADDLE_PORT
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase5(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'PSERVER'
+        os.environ["PADDLE_PORT"] = str(4001)
+
+    def test_ps_ip(self):
+        # POD_IP
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase6(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ[
+            "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:4003,127.0.0.1:4004"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'HETER_TRAINER'
+
+    def test_heter_port(self):
+        # PADDLE_PORT
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+class TestPSCloudRoleMakerCase7(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMake Parameter Server.
+    """
+
+    def setUp(self):
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002"
+        os.environ[
+            "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:4003,127.0.0.1:4004"
+        os.environ["PADDLE_TRAINERS_NUM"] = str(2)
+        os.environ["TRAINING_ROLE"] = 'HETER_TRAINER'
+        os.environ["PADDLE_PORT"] = str(4003)
+
+    def test_heter_ip(self):
+        # POD_IP
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertRaises(ValueError, ro._generate_role)
+
+
+if __name__ == "__main__":
+    unittest.main()