refine model zoo scripts for ps cache

pull/9901/head
lizhenyu 4 years ago
parent b8faa8293b
commit b70bc5b9d0

@@ -1,58 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &

@@ -1,68 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
done

@@ -1,56 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 \
--dropout_flag=1 >worker.log 2>&1 &

@@ -1,63 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done

@@ -14,17 +14,17 @@
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
# LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
# SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
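# A hypothetical invocation (values are illustrative only, not from this commit),
# launching the worker role on one machine of an 8-device Ascend cluster:
#   bash run_parameter_server_train_cluster.sh 8 10 Ascend /data/criteo_mindrecord \
#     8 1 1 10.0.0.1 8081 MS_WORKER /path/to/rank_table.json 300000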
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
# LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
# SCHED_HOST SCHED_PORT ROLE
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@ export MS_SERVER_NUM=$7
export MS_SCHED_HOST=$8
export MS_SCHED_PORT=$9
export MS_ROLE=${10}
echo "=====Role is $MS_ROLE======"
export RANK_TABLE_FILE=${11}
export VOCAB_CACHE_SIZE=${12}
if [[ ! -n "${12}" ]]; then
export VOCAB_CACHE_SIZE=0
fi
echo "=====Role is $MS_ROLE======"
if [ "$MS_ROLE" == "MS_SCHED" ];then
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done
if [[ "$MS_ROLE" == "MS_SCHED" ]]; then
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
fi
if [ "$MS_ROLE" == "MS_PSERVER" ];then
for((i=0;i<$LOCAL_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done
if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then
for((i=0;i<$LOCAL_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done
fi
if [ "$MS_ROLE" == "MS_WORKER" ];then
for((i=0;i<$LOCAL_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done
if [[ "$MS_ROLE" == "MS_WORKER" ]]; then
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
for((i=0;i<$LOCAL_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
done
fi
fi

@@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
# SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
# VOCAB_CACHE_SIZE
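# A hypothetical invocation (values are illustrative only, not from this commit):
#   bash run_parameter_server_train_distribute.sh 8 10 Ascend /data/criteo_mindrecord \
#     1 127.0.0.1 8081 /path/to/rank_table.json 300000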
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export RANK_TABLE_FILE=$8
export VOCAB_CACHE_SIZE=$9
if [[ ! -n "$9" ]]; then
export VOCAB_CACHE_SIZE=0
fi
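# Launch the scheduler first, then the parameter servers, then the workers;
# each process runs in its own working directory so logs do not collide.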
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
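# On GPU the workers are spawned through mpirun; on other targets (e.g. Ascend)
# one worker is launched per device with RANK_ID/DEVICE_ID set explicitly.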
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE \
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
done
fi

@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
# SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
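# A hypothetical invocation (values are illustrative only, not from this commit):
#   bash run_parameter_server_train_standalone.sh 10 GPU /data/criteo_mindrecord 1 127.0.0.1 8081 0 300000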
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
DEVICE_ID=$7
export VOCAB_CACHE_SIZE=$8
if [[ ! -n "$8" ]]; then
export VOCAB_CACHE_SIZE=0
fi
# Set device id
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
if [[ ! -n "$DEVICE_ID" ]]; then
export CUDA_VISIBLE_DEVICES=0
else
export CUDA_VISIBLE_DEVICES=$DEVICE_ID
fi
else
if [[ ! -n "$DEVICE_ID" ]]; then
export DEVICE_ID=0
else
export DEVICE_ID=$DEVICE_ID
fi
fi
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &

@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
elif parameter_server:
cache_enable = self.vocab_cache_size > 0
target = 'DEVICE' if cache_enable else 'CPU'
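# When the cache is disabled, the lookup stays on CPU and falls back to sparse gradients.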
if not cache_enable:
sparse = True
if is_auto_parallel and config.full_batch and cache_enable:
self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""train distribute on parameter server."""
import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@ from src.config import WideDeepConfig
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def get_WideDeep_net(config):
def get_wide_deep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config)
loss_net = VirtualDatasetCellTriple(loss_net)
wide_deep_net = WideDeepModel(config)
loss_net = NetWithLossClass(wide_deep_net, config)
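# The virtual dataset wrapper is only needed on the cache path, where training
# runs under auto parallel (see the parallel-mode selection in __main__).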
if cache_enable:
loss_net = VirtualDatasetCellTriple(loss_net)
train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
cache_enable=bool(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(WideDeep_net)
eval_net = VirtualDatasetCellTriple(eval_net)
cache_enable=(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(wide_deep_net)
if cache_enable:
eval_net = VirtualDatasetCellTriple(eval_net)
return train_net, eval_net
@@ -51,7 +53,6 @@ class ModelBuilder():
"""
ModelBuilder
"""
def __init__(self):
pass
@@ -67,13 +68,14 @@ class ModelBuilder():
return hooks
def get_net(self, config):
return get_WideDeep_net(config)
return get_wide_deep_net(config)
def train_and_eval(config):
"""
test_train_eval
"""
set_seed(1000)
data_path = config.data_path
batch_size = config.batch_size
epochs = config.epochs
@@ -83,6 +85,9 @@ def train_and_eval(config):
dataset_type = DataType.MINDRECORD
else:
dataset_type = DataType.H5
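# With the embedding cache enabled, full-batch input is forced so that auto
# parallel sees the complete batch on every worker (assumption based on the
# full_batch handling below).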
parameter_server = bool(config.parameter_server)
if cache_enable:
config.full_batch = True
print("epochs is {}".format(epochs))
if config.full_batch:
context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@ def train_and_eval(config):
train_net.set_train()
auc_metric = AUCMetric()
model = Model(train_net, eval_network=eval_net,
metrics={"auc": auc_metric})
model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
eval_callback = EvalCallBack(
model, ds_eval, auc_metric, config)
eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
callback = LossCallBack(config=config, per_print_times=20)
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
keep_checkpoint_max=5, integrated_save=False)
callback = LossCallBack(config=config)
if cache_enable:
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
keep_checkpoint_max=5, integrated_save=False)
else:
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
callback_list = [TimeMonitor(
ds_train.get_dataset_size()), eval_callback, callback]
callback_list.append(ckpoint_cb)
model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
config=ckptconfig)
if cache_enable:
context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
if get_rank() == 0:
callback_list.append(ckpoint_cb)
model.train(epochs, ds_train,
callbacks=callback_list,
dataset_sink_mode=bool(parameter_server and cache_enable))
if __name__ == "__main__":
wide_deep_config = WideDeepConfig()
wide_deep_config.argparse_init()
context.set_context(mode=context.GRAPH_MODE,
device_target=wide_deep_config.device_target, save_graphs=True)
context.set_context(variable_memory_max_size="24GB")
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
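# The 24GB device memory cap and sparse mode apply only when the embedding
# cache runs on a non-GPU (i.e. Ascend) target.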
cache_enable = wide_deep_config.vocab_cache_size > 0
if cache_enable and wide_deep_config.device_target != "GPU":
context.set_context(variable_memory_max_size="24GB")
context.set_context(enable_sparse=True)
context.set_ps_context(enable_ps=True)
init()
context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))
context.set_auto_parallel_context(
parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
if cache_enable:
context.set_auto_parallel_context(
parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
else:
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=get_group_size())
train_and_eval(wide_deep_config)

@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""train standalone on parameter server."""
import os
import sys
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
context.set_context(enable_sparse=True)
def get_WideDeep_net(config):
def get_wide_deep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config)
wide_deep_net = WideDeepModel(config)
loss_net = NetWithLossClass(wide_deep_net, config)
train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
cache_enable=bool(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(WideDeep_net)
cache_enable=(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(wide_deep_net)
return train_net, eval_net
@@ -64,7 +62,7 @@ class ModelBuilder():
return hooks
def get_net(self, config):
return get_WideDeep_net(config)
return get_wide_deep_net(config)
def train_and_eval(config):
@@ -82,14 +80,12 @@ def train_and_eval(config):
else:
dataset_type = DataType.H5
parameter_server = bool(config.parameter_server)
cache_enable = bool(config.vocab_cache_size > 0)
cache_enable = config.vocab_cache_size > 0
print("epochs is {}".format(epochs))
ds_train = create_dataset(data_path, train_mode=True, epochs=1,
batch_size=batch_size, rank_id=get_rank(),
rank_size=get_group_size(), data_type=dataset_type)
batch_size=batch_size, data_type=dataset_type)
ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
batch_size=batch_size, rank_id=get_rank(),
rank_size=get_group_size(), data_type=dataset_type)
batch_size=batch_size, data_type=dataset_type)
print("ds_train.size: {}".format(ds_train.get_dataset_size()))
print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
@@ -102,15 +98,11 @@ def train_and_eval(config):
model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
callback = LossCallBack(config=config)
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
config=ckptconfig)
callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
if get_rank() == 0:
callback_list.append(ckpoint_cb)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
model.train(epochs, ds_train,
callbacks=callback_list,
dataset_sink_mode=(parameter_server and cache_enable))
@ -120,10 +112,7 @@ if __name__ == "__main__":
wide_deep_config = WideDeepConfig()
wide_deep_config.argparse_init()
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
context.set_ps_context(enable_ps=True)
init()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=get_group_size())
train_and_eval(wide_deep_config)