From 71a20a87a1dfee742aee26bec3894b39affed22c Mon Sep 17 00:00:00 2001 From: ZPaC Date: Sun, 26 Jul 2020 19:00:03 +0800 Subject: [PATCH] Add ps model zoo for resnet --- model_zoo/official/cv/resnet/README.md | 39 +++-- .../scripts/run_parameter_server_train.sh | 158 ++++++++++++++++++ .../scripts/run_parameter_server_train_gpu.sh | 144 ++++++++++++++++ model_zoo/official/cv/resnet/train.py | 5 +- 4 files changed, 332 insertions(+), 14 deletions(-) create mode 100644 model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh create mode 100755 model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index a22df320e7..2d64530d98 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -41,20 +41,22 @@ ImageNet2012 └──resnet ├── README.md ├── script - ├── run_distribute_train.sh # launch distributed training(8 pcs) - ├── run_eval.sh # launch evaluation - └── run_standalone_train.sh # launch standalone training(1 pcs) - ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) - ├── run_eval_gpu.sh # launch gpu evaluation - └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) + ├── run_distribute_train.sh # launch distributed training(8 pcs) + ├── run_parameter_server_train.sh # launch Ascend parameter server training(8 pcs) + ├── run_eval.sh # launch evaluation + └── run_standalone_train.sh # launch standalone training(1 pcs) + ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) + ├── run_parameter_server_train_gpu.sh # launch gpu parameter server training(8 pcs) + ├── run_eval_gpu.sh # launch gpu evaluation + └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) ├── src - ├── config.py # parameter configuration - ├── dataset.py # data preprocessing - ├── crossentropy.py # loss definition for ImageNet2012 dataset - ├── lr_generator.py # 
generate learning rate for each step - └── resnet.py # resnet backbone, including resnet50 and resnet101 - ├── eval.py # eval net - └── train.py # train net + ├── config.py # parameter configuration + ├── dataset.py # data preprocessing + ├── crossentropy.py # loss definition for ImageNet2012 dataset + ├── lr_generator.py # generate learning rate for each step + └── resnet.py # resnet backbone, including resnet50 and resnet101 + ├── eval.py # eval net + └── train.py # train net ``` @@ -252,3 +254,14 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA # infer example sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] ``` + +### Running parameter server mode training +``` +# parameter server training Ascend example +sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +# parameter server training GPU example +sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +> The way to evaluate is the same as the examples above. +``` diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh new file mode 100644 index 0000000000..a041aef04e --- /dev/null +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh @@ -0,0 +1,158 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================

+if [ $# != 4 ] && [ $# != 5 ]
+then
+    echo "Usage: sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+exit 1
+fi

+if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
+then
+    echo "error: the selected net is neither resnet50 nor resnet101"
+exit 1
+fi

+if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
+then
+    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
+exit 1
+fi

+if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
+then
+    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
+exit 1
+fi


+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}

+PATH1=$(get_real_path $3)
+PATH2=$(get_real_path $4)

+if [ $# == 5 ]
+then
+    PATH3=$(get_real_path $5)
+fi

+if [ ! -f $PATH1 ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+exit 1
+fi

+if [ ! -d $PATH2 ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+exit 1
+fi

+if [ $# == 5 ] && [ !
-f $PATH3 ] +then + echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=8 +export RANK_SIZE=8 +export RANK_TABLE_FILE=$PATH1 + +export MS_COMM_TYPE=zmq +export MS_SCHED_NUM=1 +export MS_WORKER_NUM=$RANK_SIZE +export MS_SERVER_NUM=1 +export MS_SCHED_HOST=127.0.0.1 +export MS_SCHED_PORT=8081 + +export MS_ROLE=MS_SCHED +export DEVICE_ID=0 +export RANK_ID=0 +rm -rf ./sched +mkdir ./sched +cp ../*.py ./sched +cp *.sh ./sched +cp -r ../src ./sched +cd ./sched || exit +echo "start scheduler" +if [ $# == 4 ] +then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log & +fi + +if [ $# == 5 ] +then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log & +fi +cd .. + +export MS_ROLE=MS_PSERVER +for((i=0; i<1; i++)) +do + export DEVICE_ID=$i + export RANK_ID=$i + rm -rf ./server_$i + mkdir ./server_$i + cp ../*.py ./server_$i + cp *.sh ./server_$i + cp -r ../src ./server_$i + cd ./server_$i || exit + echo "start server" + if [ $# == 4 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log & + fi + + if [ $# == 5 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log & + fi + + cd .. 
+done + +export MS_ROLE=MS_WORKER +for((i=0; i<${DEVICE_NUM}; i++)) +do + export DEVICE_ID=$i + export RANK_ID=$i + rm -rf ./worker_$i + mkdir ./worker_$i + cp ../*.py ./worker_$i + cp *.sh ./worker_$i + cp -r ../src ./worker_$i + cd ./worker_$i || exit + echo "start training for worker rank $RANK_ID, device $DEVICE_ID" + env > env.log + if [ $# == 4 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log & + fi + + if [ $# == 5 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log & + fi + + cd .. +done diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh new file mode 100755 index 0000000000..7ccfdddb2f --- /dev/null +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================

+if [ $# != 3 ] && [ $# != 4 ]
+then
+    echo "Usage: sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+exit 1
+fi

+if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
+then
+    echo "error: the selected net is neither resnet50 nor resnet101"
+exit 1
+fi

+if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
+then
+    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
+exit 1
+fi

+if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
+then
+    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
+exit 1
+fi


+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}

+PATH1=$(get_real_path $3)

+if [ $# == 4 ]
+then
+    PATH2=$(get_real_path $4)
+fi


+if [ ! -d $PATH1 ]
+then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+exit 1
+fi

+if [ $# == 4 ] && [ ! -f $PATH2 ]
+then
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+exit 1
+fi

+export DEVICE_NUM=8
+export RANK_SIZE=8

+export MS_COMM_TYPE=zmq
+export MS_SCHED_NUM=1
+export MS_WORKER_NUM=8
+export MS_SERVER_NUM=1
+export MS_SCHED_HOST=127.0.0.1
+export MS_SCHED_PORT=8081

+export MS_ROLE=MS_SCHED
+rm -rf ./sched
+mkdir ./sched
+cp ../*.py ./sched
+cp *.sh ./sched
+cp -r ../src ./sched
+cd ./sched || exit
+if [ $# == 3 ]
+then
+    mpirun --allow-run-as-root -n 1 \
+    python train.py --net=$1 --dataset=$2 --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
+fi

+if [ $# == 4 ]
+then
+    mpirun --allow-run-as-root -n 1 \
+    python train.py --net=$1 --dataset=$2 --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
+fi
+cd ..
+ +export MS_ROLE=MS_PSERVER +rm -rf ./server +mkdir ./server +cp ../*.py ./server +cp *.sh ./server +cp -r ../src ./server +cd ./server || exit +if [ $# == 3 ] +then + mpirun --allow-run-as-root -n 1 \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server.log & +fi + +if [ $# == 4 ] +then + mpirun --allow-run-as-root -n 1 \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server.log & +fi +cd .. + +export MS_ROLE=MS_WORKER +rm -rf ./worker +mkdir ./worker +cp ../*.py ./worker +cp *.sh ./worker +cp -r ../src ./worker +cd ./worker || exit +if [ $# == 3 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log & +fi + +if [ $# == 4 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log & +fi +cd .. 
diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py index 414fa4c7de..3d65d10392 100755 --- a/model_zoo/official/cv/resnet/train.py +++ b/model_zoo/official/cv/resnet/train.py @@ -41,6 +41,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') +parser.add_argument('--parameter_server', type=bool, default=False, help='Run parameter server train') args_opt = parser.parse_args() random.seed(1) @@ -92,6 +93,8 @@ if __name__ == '__main__': # define net net = resnet(class_num=config.class_num) + if args_opt.parameter_server: + net.set_param_ps() # init weight if args_opt.pre_trained: @@ -181,4 +184,4 @@ if __name__ == '__main__': cb += [ckpt_cb] # train model - model.train(config.epoch_size, dataset, callbacks=cb) + model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=(not args_opt.parameter_server))