From 71a20a87a1dfee742aee26bec3894b39affed22c Mon Sep 17 00:00:00 2001 From: ZPaC Date: Sun, 26 Jul 2020 19:00:03 +0800 Subject: [PATCH] Add ps model zoo for resnet --- model_zoo/official/cv/resnet/README.md | 39 +++-- .../scripts/run_parameter_server_train.sh | 158 ++++++++++++++++++ .../scripts/run_parameter_server_train_gpu.sh | 144 ++++++++++++++++ model_zoo/official/cv/resnet/train.py | 5 +- 4 files changed, 332 insertions(+), 14 deletions(-) create mode 100644 model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh create mode 100755 model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index a22df320e7..2d64530d98 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -41,20 +41,22 @@ ImageNet2012 └──resnet ├── README.md ├── script - ├── run_distribute_train.sh # launch distributed training(8 pcs) - ├── run_eval.sh # launch evaluation - └── run_standalone_train.sh # launch standalone training(1 pcs) - ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) - ├── run_eval_gpu.sh # launch gpu evaluation - └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) + ├── run_distribute_train.sh # launch distributed training(8 pcs) + ├── run_parameter_server_train.sh # launch Ascend parameter server training(8 pcs) + ├── run_eval.sh # launch evaluation + └── run_standalone_train.sh # launch standalone training(1 pcs) + ├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs) + ├── run_parameter_server_train_gpu.sh # launch gpu parameter server training(8 pcs) + ├── run_eval_gpu.sh # launch gpu evaluation + └── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs) ├── src - ├── config.py # parameter configuration - ├── dataset.py # data preprocessing - ├── crossentropy.py # loss definition for ImageNet2012 dataset - ├── lr_generator.py # 
generate learning rate for each step - └── resnet.py # resnet backbone, including resnet50 and resnet101 - ├── eval.py # eval net - └── train.py # train net + ├── config.py # parameter configuration + ├── dataset.py # data preprocessing + ├── crossentropy.py # loss definition for ImageNet2012 dataset + ├── lr_generator.py # generate learning rate for each step + └── resnet.py # resnet backbone, including resnet50 and resnet101 + ├── eval.py # eval net + └── train.py # train net ``` @@ -252,3 +254,14 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA # infer example sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] ``` + +### Running parameter server mode training +``` +# parameter server training Ascend example +sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +# parameter server training GPU example +sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +> The way to evaluate is the same as the examples above. +``` diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh new file mode 100644 index 0000000000..a041aef04e --- /dev/null +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train.sh @@ -0,0 +1,158 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================

+if [ $# != 4 ] && [ $# != 5 ]
+then
+    echo "Usage: sh run_parameter_server_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+exit 1
+fi

+if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
+then
+    echo "error: the selected net is neither resnet50 nor resnet101"
+exit 1
+fi

+if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
+then
+    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
+exit 1
+fi

+if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
+then
+    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
+exit 1
+fi


+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}

+PATH1=$(get_real_path $3)
+PATH2=$(get_real_path $4)

+if [ $# == 5 ]
+then
+    PATH3=$(get_real_path $5)
+fi

+if [ ! -f $PATH1 ]
+then
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
+exit 1
+fi

+if [ ! -d $PATH2 ]
+then
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
+exit 1
+fi

+if [ $# == 5 ] && [ !
-f $PATH3 ] +then + echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=8 +export RANK_SIZE=8 +export RANK_TABLE_FILE=$PATH1 + +export MS_COMM_TYPE=zmq +export MS_SCHED_NUM=1 +export MS_WORKER_NUM=$RANK_SIZE +export MS_SERVER_NUM=1 +export MS_SCHED_HOST=127.0.0.1 +export MS_SCHED_PORT=8081 + +export MS_ROLE=MS_SCHED +export DEVICE_ID=0 +export RANK_ID=0 +rm -rf ./sched +mkdir ./sched +cp ../*.py ./sched +cp *.sh ./sched +cp -r ../src ./sched +cd ./sched || exit +echo "start scheduler" +if [ $# == 4 ] +then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> sched.log & +fi + +if [ $# == 5 ] +then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> sched.log & +fi +cd .. + +export MS_ROLE=MS_PSERVER +for((i=0; i<1; i++)) +do + export DEVICE_ID=$i + export RANK_ID=$i + rm -rf ./server_$i + mkdir ./server_$i + cp ../*.py ./server_$i + cp *.sh ./server_$i + cp -r ../src ./server_$i + cd ./server_$i || exit + echo "start server" + if [ $# == 4 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True &> server_$i.log & + fi + + if [ $# == 5 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=1 --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> server_$i.log & + fi + + cd .. 
+done + +export MS_ROLE=MS_WORKER +for((i=0; i<${DEVICE_NUM}; i++)) +do + export DEVICE_ID=$i + export RANK_ID=$i + rm -rf ./worker_$i + mkdir ./worker_$i + cp ../*.py ./worker_$i + cp *.sh ./worker_$i + cp -r ../src ./worker_$i + cd ./worker_$i || exit + echo "start training for worker rank $RANK_ID, device $DEVICE_ID" + env > env.log + if [ $# == 4 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True &> worker_$i.log & + fi + + if [ $# == 5 ] + then + python train.py --net=$1 --dataset=$2 --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --parameter_server=True --pre_trained=$PATH3 &> worker_$i.log & + fi + + cd .. +done diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh new file mode 100755 index 0000000000..7ccfdddb2f --- /dev/null +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================

+if [ $# != 3 ] && [ $# != 4 ]
+then
+    echo "Usage: sh run_parameter_server_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+exit 1
+fi

+if [ $1 != "resnet50" ] && [ $1 != "resnet101" ]
+then
+    echo "error: the selected net is neither resnet50 nor resnet101"
+exit 1
+fi

+if [ $2 != "cifar10" ] && [ $2 != "imagenet2012" ]
+then
+    echo "error: the selected dataset is neither cifar10 nor imagenet2012"
+exit 1
+fi

+if [ $1 == "resnet101" ] && [ $2 == "cifar10" ]
+then
+    echo "error: training resnet101 with cifar10 dataset is unsupported now!"
+exit 1
+fi


+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}

+PATH1=$(get_real_path $3)

+if [ $# == 4 ]
+then
+    PATH2=$(get_real_path $4)
+fi


+if [ ! -d $PATH1 ]
+then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+exit 1
+fi

+if [ $# == 4 ] && [ ! -f $PATH2 ]
+then
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+exit 1
+fi

+export DEVICE_NUM=8
+export RANK_SIZE=8

+export MS_COMM_TYPE=zmq
+export MS_SCHED_NUM=1
+export MS_WORKER_NUM=8
+export MS_SERVER_NUM=1
+export MS_SCHED_HOST=127.0.0.1
+export MS_SCHED_PORT=8081

+export MS_ROLE=MS_SCHED
+rm -rf ./sched
+mkdir ./sched
+cp ../*.py ./sched
+cp *.sh ./sched
+cp -r ../src ./sched
+cd ./sched || exit
+if [ $# == 3 ]
+then
+    mpirun --allow-run-as-root -n 1 \
+    python train.py --net=$1 --dataset=$2 --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
+fi

+if [ $# == 4 ]
+then
+    mpirun --allow-run-as-root -n 1 \
+    python train.py --net=$1 --dataset=$2 --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
+fi
+cd ..
+ +export MS_ROLE=MS_PSERVER +rm -rf ./server +mkdir ./server +cp ../*.py ./server +cp *.sh ./server +cp -r ../src ./server +cd ./server || exit +if [ $# == 3 ] +then + mpirun --allow-run-as-root -n 1 \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server.log & +fi + +if [ $# == 4 ] +then + mpirun --allow-run-as-root -n 1 \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server.log & +fi +cd .. + +export MS_ROLE=MS_WORKER +rm -rf ./worker +mkdir ./worker +cp ../*.py ./worker +cp *.sh ./worker +cp -r ../src ./worker +cd ./worker || exit +if [ $# == 3 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log & +fi + +if [ $# == 4 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE \ + python train.py --net=$1 --dataset=$2 --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log & +fi +cd .. 
diff --git a/model_zoo/official/cv/resnet/train.py b/model_zoo/official/cv/resnet/train.py index 414fa4c7de..3d65d10392 100755 --- a/model_zoo/official/cv/resnet/train.py +++ b/model_zoo/official/cv/resnet/train.py @@ -41,6 +41,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.') parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') parser.add_argument('--device_target', type=str, default='Ascend', help='Device target') parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') +parser.add_argument('--parameter_server', type=bool, default=False, help='Run parameter server train') args_opt = parser.parse_args() random.seed(1) @@ -92,6 +93,8 @@ if __name__ == '__main__': # define net net = resnet(class_num=config.class_num) + if args_opt.parameter_server: + net.set_param_ps() # init weight if args_opt.pre_trained: @@ -181,4 +184,4 @@ if __name__ == '__main__': cb += [ckpt_cb] # train model - model.train(config.epoch_size, dataset, callbacks=cb) + model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=(not args_opt.parameter_server))