@@ -16,26 +16,26 @@
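# Validate the arguments: network name, dataset name, dataset path, and an optional pretrained checkpoint.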
if [ $# != 3 ] && [ $# != 4 ]
then
echo "Usage: sh run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
echo "Usage: sh run_distribute_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
then
echo "error: the selected net is neither resnet50 nor resnet101"
exit 1
fi
if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
then
echo "error: the selected dataset is neither cifar10 nor imagenet2012"
exit 1
fi
if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
then
echo "error: training resnet101 with cifar10 dataset is unsupported now!"
exit 1
fi
@@ -58,13 +58,13 @@ fi
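# Verify that DATASET_PATH is a directory and, when a 4th argument is given, that PRETRAINED_CKPT_PATH is a file.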
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ $# == 4 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
fi
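# Job-wide environment: the training run spans 8 devices per host.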
export DEVICE_NUM=8
@@ -85,44 +85,43 @@ cp *.sh ./sched
cp -r ../src ./sched
cd ./sched || exit
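# Launch the scheduler as a single mpirun process; with 4 arguments, also forward the pretrained checkpoint.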
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
fi
cd ..
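# Launch $MS_SERVER_NUM parameter-server processes, each in its own server_$i working directory.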
export MS_ROLE=MS_PSERVER
for ((i=0; i<$MS_SERVER_NUM; i++));
do
rm -rf ./server_$i
mkdir ./server_$i
cp ../*.py ./server_$i
cp *.sh ./server_$i
cp -r ../src ./server_$i
cd ./server_$i || exit
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log &
fi
cd ..
done
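# Launch the training workers: $RANK_SIZE MPI ranks under one mpirun invocation.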
export MS_ROLE=MS_WORKER
@@ -133,16 +132,16 @@ cp *.sh ./worker
cp -r ../src ./worker
cd ./worker || exit
if [ $# == 3 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
fi
if [ $# == 4 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
python train.py --net=$1 --dataset=$2 --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
fi
cd ..
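# Illustrative invocations (the dataset and checkpoint paths below are placeholders, not from the original script):
#   sh run_distribute_train_gpu.sh resnet50 cifar10 /data/cifar10
#   sh run_distribute_train_gpu.sh resnet50 imagenet2012 /data/imagenet2012 /ckpt/resnet50_pretrained.ckpt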