@@ -27,13 +27,13 @@ export PADDLE_TRAINER_ID=0
 export TRAINER_PORTS_NUM=2

-file_0="multi_process_fleetrun.check_0.log"
-file_1="multi_process_fleetrun.check_1.log"
+file_0="multi_process_fullpath_launch.check_0.log"
+file_1="multi_process_fullpath_launch.check_1.log"

 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"

 echo "paddle.distributed.fleet.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fleetrun abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process.py fullpath_launch abort; then
     echo "train abort as planned"
 fi
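Note on the rename: the check-log prefix and the first argument passed to multi_process.py change together, presumably because the helper embeds that argument in its output filename (multi_process_<arg>.check_<trainer_id>.log); that naming scheme is inferred from the file_0/file_1 variables above, not confirmed by this hunk. Below is a minimal sketch of the kind of post-run assertions that would consume file_0 and file_1; the real checks live outside this hunk and may differ.

    # Hypothetical assertions (not part of this diff). multi_process.py is
    # invoked with "abort", so one trainer is expected to exit abnormally
    # and the `if !` branch above treats the launcher's nonzero exit status
    # as the planned outcome.
    if [ -f "$file_0" ]; then
        echo "trainer 0 wrote ${file_0}"
        rm -f "$file_0"
    fi
    if [ -f "$file_1" ]; then
        echo "trainer 1 wrote ${file_1}"
        rm -f "$file_1"
    fi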
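For reference, the launch under test expands to the command below; a sketch assuming a single-node run where cluster_node_ips resolves to 127.0.0.1 (the variable is set outside this hunk, so that value is an assumption).

    # Hypothetical standalone invocation of the new test case; the flags
    # mirror distributed_args above, with an assumed single-node IP list.
    export CUDA_VISIBLE_DEVICES=0,1
    python -m paddle.distributed.fleet.launch \
        --ips=127.0.0.1 --gpus=0,1 --log_dir=testlog \
        multi_process.py fullpath_launch abort
    # Per-rank launcher output lands under ./testlog/ via --log_dir.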