diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multigpu_train.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multigpu_train.sh
deleted file mode 100644
index 355102764a..0000000000
--- a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multigpu_train.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export RANK_SIZE=$1
-export EPOCH_SIZE=$2
-export DATASET=$3
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=$RANK_SIZE
-export MS_SERVER_NUM=$4
-export MS_SCHED_HOST=$5
-export MS_SCHED_PORT=$6
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-    rm -rf ${execute_path}/sched_$i/
-    mkdir ${execute_path}/sched_$i/
-    cd ${execute_path}/sched_$i/ || exit
-    python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-        --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-        --vocab_cache_size=300000 >sched_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-    rm -rf ${execute_path}/server_$i/
-    mkdir ${execute_path}/server_$i/
-    cd ${execute_path}/server_$i/ || exit
-    python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-        --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-        --vocab_cache_size=300000 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-rm -rf ${execute_path}/worker/
-mkdir ${execute_path}/worker/
-cd ${execute_path}/worker/ || exit
-mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-    --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &
diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multinpu_train.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multinpu_train.sh
deleted file mode 100644
index fc9f9a455b..0000000000
--- a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multinpu_train.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export RANK_SIZE=$1
-export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=$RANK_SIZE
-export MS_SERVER_NUM=$5
-export MS_SCHED_HOST=$6
-export MS_SCHED_PORT=$7
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-    rm -rf ${execute_path}/sched_$i/
-    mkdir ${execute_path}/sched_$i/
-    cd ${execute_path}/sched_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-        --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-        --vocab_cache_size=300000 >sched_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-    rm -rf ${execute_path}/server_$i/
-    mkdir ${execute_path}/server_$i/
-    cd ${execute_path}/server_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-        --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-        --vocab_cache_size=300000 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-for((i=0;i<$MS_WORKER_NUM;i++));
-do
-    rm -rf ${execute_path}/worker_$i/
-    mkdir ${execute_path}/worker_$i/
-    cd ${execute_path}/worker_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-        --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-        --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
-done
diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_standalone_train.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_standalone_train.sh
deleted file mode 100644
index 7341cfa17b..0000000000
--- a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_standalone_train.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export EPOCH_SIZE=$1
-export DEVICE_TARGET=$2
-export DATASET=$3
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=1
-export MS_SERVER_NUM=$4
-export MS_SCHED_HOST=$5
-export MS_SCHED_PORT=$6
-
-export MS_ROLE=MS_SCHED
-rm -rf ${execute_path}/sched/
-mkdir ${execute_path}/sched/
-cd ${execute_path}/sched/ || exit
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
-    --parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-    rm -rf ${execute_path}/server_$i/
-    mkdir ${execute_path}/server_$i/
-    cd ${execute_path}/server_$i/ || exit
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
-        --parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-rm -rf ${execute_path}/worker/
-mkdir ${execute_path}/worker/
-cd ${execute_path}/worker/ || exit
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
-    --parameter_server=1 --vocab_cache_size=300000 \
-    --dropout_flag=1 >worker.log 2>&1 &
diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train.sh
deleted file mode 100644
index d7f8d41a52..0000000000
--- a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export RANK_SIZE=$1
-export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
-
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=$RANK_SIZE
-export MS_SERVER_NUM=$5
-export MS_SCHED_HOST=$6
-export MS_SCHED_PORT=$7
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-    rm -rf ${execute_path}/sched_$i/
-    mkdir ${execute_path}/sched_$i/
-    cd ${execute_path}/sched_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-    rm -rf ${execute_path}/server_$i/
-    mkdir ${execute_path}/server_$i/
-    cd ${execute_path}/server_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-for((i=0;i<$MS_WORKER_NUM;i++));
-do
-    rm -rf ${execute_path}/worker_$i/
-    mkdir ${execute_path}/worker_$i/
-    cd ${execute_path}/worker_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
-done
diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_cluster.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_cluster.sh
index d2d885d420..0015bf470d 100644
--- a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_cluster.sh
+++ b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_cluster.sh
@@ -14,17 +14,17 @@
 # limitations under the License.
 # ============================================================================
+
+#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
+#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
+#                                           SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
 execute_path=$(pwd)
 script_self=$(readlink -f "$0")
 self_path=$(dirname "${script_self}")
-
-#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
-#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
-#                                           SCHED_HOST SCHED_PORT ROLE
 export RANK_SIZE=$1
 export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
+export DEVICE_TARGET=$3
+export DATASET=$4
 
 export MS_COMM_TYPE=zmq
 export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@
 export MS_SERVER_NUM=$7
 export MS_SCHED_HOST=$8
 export MS_SCHED_PORT=$9
 export MS_ROLE=${10}
-echo "=====Role is $MS_ROLE======"
+export RANK_TABLE_FILE=${11}
+export VOCAB_CACHE_SIZE=${12}
+if [[ ! -n "${12}" ]]; then
+    export VOCAB_CACHE_SIZE=0
+fi
+
+echo "=====Role is $MS_ROLE======"
 
-if [ "$MS_ROLE" == "MS_SCHED" ];then
-for((i=0;i<1;i++));
-do
-    rm -rf ${execute_path}/sched_$i/
-    mkdir ${execute_path}/sched_$i/
-    cd ${execute_path}/sched_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
-done
+if [[ "$MS_ROLE" == "MS_SCHED" ]]; then
+    rm -rf ${execute_path}/sched/
+    mkdir ${execute_path}/sched/
+    cd ${execute_path}/sched/ || exit
+    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
+        --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+        --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
 fi
 
-if [ "$MS_ROLE" == "MS_PSERVER" ];then
-for((i=0;i<$LOCAL_SERVER_NUM;i++));
-do
-    rm -rf ${execute_path}/server_$i/
-    mkdir ${execute_path}/server_$i/
-    cd ${execute_path}/server_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
-done
+if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then
+    for((i=0;i<$LOCAL_SERVER_NUM;i++));
+    do
+        rm -rf ${execute_path}/server_$i/
+        mkdir ${execute_path}/server_$i/
+        cd ${execute_path}/server_$i/ || exit
+        python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
+            --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+            --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
+    done
 fi
 
-if [ "$MS_ROLE" == "MS_WORKER" ];then
-for((i=0;i<$LOCAL_WORKER_NUM;i++));
-do
-    rm -rf ${execute_path}/worker_$i/
-    mkdir ${execute_path}/worker_$i/
-    cd ${execute_path}/worker_$i/ || exit
-    export RANK_ID=$i
-    export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
-done
+if [[ "$MS_ROLE" == "MS_WORKER" ]]; then
+    if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
+        rm -rf ${execute_path}/worker/
+        mkdir ${execute_path}/worker/
+        cd ${execute_path}/worker/ || exit
+        mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \
+            python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+            --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+            --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
+    else
+        for((i=0;i<$LOCAL_WORKER_NUM;i++));
+        do
+            rm -rf ${execute_path}/worker_$i/
+            mkdir ${execute_path}/worker_$i/
+            cd ${execute_path}/worker_$i/ || exit
+            export RANK_ID=$i
+            export DEVICE_ID=$i
+            python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+                --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+                --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
+        done
+    fi
 fi
diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_distribute.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_distribute.sh
new file mode 100644
index 0000000000..10186751f3
--- /dev/null
+++ b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_distribute.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
+#                                              SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
+#                                              VOCAB_CACHE_SIZE
+execute_path=$(pwd)
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+export RANK_SIZE=$1
+export EPOCH_SIZE=$2
+export DEVICE_TARGET=$3
+export DATASET=$4
+export MS_COMM_TYPE=zmq
+export MS_SCHED_NUM=1
+export MS_WORKER_NUM=$RANK_SIZE
+export MS_SERVER_NUM=$5
+export MS_SCHED_HOST=$6
+export MS_SCHED_PORT=$7
+export RANK_TABLE_FILE=$8
+export VOCAB_CACHE_SIZE=$9
+
+if [[ ! -n "$9" ]]; then
+    export VOCAB_CACHE_SIZE=0
+fi
+
+export MS_ROLE=MS_SCHED
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+    --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
+
+export MS_ROLE=MS_PSERVER
+for((i=0;i<$MS_SERVER_NUM;i++));
+do
+    rm -rf ${execute_path}/server_$i/
+    mkdir ${execute_path}/server_$i/
+    cd ${execute_path}/server_$i/ || exit
+    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+        --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+        --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
+done
+
+export MS_ROLE=MS_WORKER
+if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
+    rm -rf ${execute_path}/worker/
+    mkdir ${execute_path}/worker/
+    cd ${execute_path}/worker/ || exit
+    mpirun --allow-run-as-root -n $RANK_SIZE \
+        python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+        --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+        --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
+else
+    for((i=0;i<$MS_WORKER_NUM;i++));
+    do
+        rm -rf ${execute_path}/worker_$i/
+        mkdir ${execute_path}/worker_$i/
+        cd ${execute_path}/worker_$i/ || exit
+        export RANK_ID=$i
+        export DEVICE_ID=$i
+        python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+            --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+            --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
+    done
+fi
+
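
For the single-launcher distribute script above, a hedged single-node GPU example (paths and endpoint are assumed placeholders; RANK_TABLE_FILE is only meaningful on Ascend, so an empty string is passed here to keep VOCAB_CACHE_SIZE in position $9):

    bash run_parameter_server_train_distribute.sh 8 10 GPU /path/to/criteo_mindrecord \
        1 127.0.0.1 8081 "" 300000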
diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_standalone.sh b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_standalone.sh
new file mode 100644
index 0000000000..78b4c64a03
--- /dev/null
+++ b/model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_standalone.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+
+#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
+#                                              SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
+execute_path=$(pwd)
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+export EPOCH_SIZE=$1
+export DEVICE_TARGET=$2
+export DATASET=$3
+export MS_COMM_TYPE=zmq
+export MS_SCHED_NUM=1
+export MS_WORKER_NUM=1
+export MS_SERVER_NUM=$4
+export MS_SCHED_HOST=$5
+export MS_SCHED_PORT=$6
+DEVICE_ID=$7
+export VOCAB_CACHE_SIZE=$8
+
+if [[ ! -n "$8" ]]; then
+    export VOCAB_CACHE_SIZE=0
+fi
+
+# Set device id
+if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
+    if [[ ! -n "$DEVICE_ID" ]]; then
+        export CUDA_VISIBLE_DEVICES=0
+    else
+        export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+    fi
+else
+    if [[ ! -n "$DEVICE_ID" ]]; then
+        export DEVICE_ID=0
+    else
+        export DEVICE_ID=$DEVICE_ID
+    fi
+fi
+
+export MS_ROLE=MS_SCHED
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
+    --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
+    --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
+
+export MS_ROLE=MS_PSERVER
+for((i=0;i<$MS_SERVER_NUM;i++));
+do
+    rm -rf ${execute_path}/server_$i/
+    mkdir ${execute_path}/server_$i/
+    cd ${execute_path}/server_$i/ || exit
+    python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
+        --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
+        --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
+done
+
+export MS_ROLE=MS_WORKER
+rm -rf ${execute_path}/worker/
+mkdir ${execute_path}/worker/
+cd ${execute_path}/worker/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
+    --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
+    --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
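
A corresponding sketch for the standalone script (again with placeholder values; leaving DEVICE_ID and VOCAB_CACHE_SIZE off falls back to device 0 and a cache size of 0 per the defaults above):

    bash run_parameter_server_train_standalone.sh 10 GPU /path/to/criteo_mindrecord \
        1 127.0.0.1 8081 0 300000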
diff --git a/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py b/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
index b0de6c14f3..0dda5d58e0 100644
--- a/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
+++ b/model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
         elif parameter_server:
             cache_enable = self.vocab_cache_size > 0
             target = 'DEVICE' if cache_enable else 'CPU'
+            if not cache_enable:
+                sparse = True
         if is_auto_parallel and config.full_batch and cache_enable:
             self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
                                                            slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,
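
The two added lines cover the parameter-server path without an embedding cache: when no cache is configured, the lookup target stays on 'CPU' (the server side), where the embedding appears to require sparse gradient updates, hence forcing sparse to True; with a cache, the hot rows live on 'DEVICE' and the existing dense path is kept.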
diff --git a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_cache_distribute.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py
similarity index 63%
rename from model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_cache_distribute.py
rename to model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py
index 7002dd07b5..b87352088e 100644
--- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_cache_distribute.py
+++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""train_multinpu."""
+"""train distribute on parameter server."""
 
 
 import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMoni
 from mindspore.context import ParallelMode
 from mindspore.communication.management import get_rank, get_group_size, init
 from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
+from mindspore.common import set_seed
 
 from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
 from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@
 from src.config import WideDeepConfig
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-
-def get_WideDeep_net(config):
+def get_wide_deep_net(config):
     """
     Get network of wide&deep model.
     """
-    WideDeep_net = WideDeepModel(config)
-    loss_net = NetWithLossClass(WideDeep_net, config)
-    loss_net = VirtualDatasetCellTriple(loss_net)
+    wide_deep_net = WideDeepModel(config)
+    loss_net = NetWithLossClass(wide_deep_net, config)
+    if cache_enable:
+        loss_net = VirtualDatasetCellTriple(loss_net)
     train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
-                              cache_enable=bool(config.vocab_cache_size > 0))
-    eval_net = PredictWithSigmoid(WideDeep_net)
-    eval_net = VirtualDatasetCellTriple(eval_net)
+                              cache_enable=(config.vocab_cache_size > 0))
+    eval_net = PredictWithSigmoid(wide_deep_net)
+    if cache_enable:
+        eval_net = VirtualDatasetCellTriple(eval_net)
     return train_net, eval_net
 
@@ -51,7 +53,6 @@
 class ModelBuilder():
     """
     ModelBuilder
     """
-
    def __init__(self):
        pass
 
@@ -67,13 +68,14 @@
         return hooks
 
     def get_net(self, config):
-        return get_WideDeep_net(config)
+        return get_wide_deep_net(config)
 
 
 def train_and_eval(config):
     """
     test_train_eval
     """
+    set_seed(1000)
     data_path = config.data_path
     batch_size = config.batch_size
     epochs = config.epochs
@@ -83,6 +85,9 @@
         dataset_type = DataType.MINDRECORD
     else:
         dataset_type = DataType.H5
+    parameter_server = bool(config.parameter_server)
+    if cache_enable:
+        config.full_batch = True
     print("epochs is {}".format(epochs))
     if config.full_batch:
         context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@
     train_net.set_train()
     auc_metric = AUCMetric()
 
-    model = Model(train_net, eval_network=eval_net,
-                  metrics={"auc": auc_metric})
+    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
 
-    eval_callback = EvalCallBack(
-        model, ds_eval, auc_metric, config)
+    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
 
-    callback = LossCallBack(config=config, per_print_times=20)
-    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
-                                  keep_checkpoint_max=5, integrated_save=False)
+    callback = LossCallBack(config=config)
+    if cache_enable:
+        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
+                                      keep_checkpoint_max=5, integrated_save=False)
+    else:
+        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
     ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
-    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
-    callback_list = [TimeMonitor(
-        ds_train.get_dataset_size()), eval_callback, callback]
-    callback_list.append(ckpoint_cb)
-    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
+                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
+                                 config=ckptconfig)
+    if cache_enable:
+        context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
+    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
+    if get_rank() == 0:
+        callback_list.append(ckpoint_cb)
+    model.train(epochs, ds_train,
+                callbacks=callback_list,
+                dataset_sink_mode=bool(parameter_server and cache_enable))
 
 
 if __name__ == "__main__":
     wide_deep_config = WideDeepConfig()
     wide_deep_config.argparse_init()
 
-    context.set_context(mode=context.GRAPH_MODE,
-                        device_target=wide_deep_config.device_target, save_graphs=True)
-    context.set_context(variable_memory_max_size="24GB")
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
+    cache_enable = wide_deep_config.vocab_cache_size > 0
+    if cache_enable and wide_deep_config.device_target != "GPU":
+        context.set_context(variable_memory_max_size="24GB")
     context.set_context(enable_sparse=True)
     context.set_ps_context(enable_ps=True)
     init()
     context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))
-    context.set_auto_parallel_context(
-        parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
+    if cache_enable:
+        context.set_auto_parallel_context(
+            parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
+    else:
+        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+                                          device_num=get_group_size())
+
     train_and_eval(wide_deep_config)
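
In this entry point, cache_enable is a module-level flag derived from vocab_cache_size in __main__ and consumed by get_wide_deep_net and train_and_eval: with the cache on, the script selects AUTO_PARALLEL with full-batch input and dataset sink mode; with it off, it falls back to DATA_PARALLEL without sinking. A hedged way to toggle the two modes through the wrapper script above (placeholder paths and endpoint):

    # cache on: AUTO_PARALLEL + dataset sink mode
    bash run_parameter_server_train_distribute.sh 8 10 GPU /path/to/criteo_mindrecord 1 127.0.0.1 8081 "" 300000
    # cache off: DATA_PARALLEL, no sinking
    bash run_parameter_server_train_distribute.sh 8 10 GPU /path/to/criteo_mindrecord 1 127.0.0.1 8081 "" 0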
diff --git a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_standalone.py
similarity index 72%
rename from model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py
rename to model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_standalone.py
index 5fefc1b36c..e195b1360b 100644
--- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py
+++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_standalone.py
@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""train_multinpu."""
+"""train standalone on parameter server."""
 
 
 import os
 import sys
 
 from mindspore import Model, context
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
-from mindspore.context import ParallelMode
-from mindspore.communication.management import get_rank, get_group_size, init
 from mindspore.common import set_seed
 
 from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 context.set_context(enable_sparse=True)
 
-def get_WideDeep_net(config):
+def get_wide_deep_net(config):
     """
     Get network of wide&deep model.
     """
-    WideDeep_net = WideDeepModel(config)
-    loss_net = NetWithLossClass(WideDeep_net, config)
+    wide_deep_net = WideDeepModel(config)
+    loss_net = NetWithLossClass(wide_deep_net, config)
     train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
-                              cache_enable=bool(config.vocab_cache_size > 0))
-    eval_net = PredictWithSigmoid(WideDeep_net)
+                              cache_enable=(config.vocab_cache_size > 0))
+    eval_net = PredictWithSigmoid(wide_deep_net)
     return train_net, eval_net
 
@@ -64,7 +62,7 @@
         return hooks
 
     def get_net(self, config):
-        return get_WideDeep_net(config)
+        return get_wide_deep_net(config)
 
 
 def train_and_eval(config):
@@ -82,14 +80,12 @@
     else:
         dataset_type = DataType.H5
     parameter_server = bool(config.parameter_server)
-    cache_enable = bool(config.vocab_cache_size > 0)
+    cache_enable = config.vocab_cache_size > 0
     print("epochs is {}".format(epochs))
     ds_train = create_dataset(data_path, train_mode=True, epochs=1,
-                              batch_size=batch_size, rank_id=get_rank(),
-                              rank_size=get_group_size(), data_type=dataset_type)
+                              batch_size=batch_size, data_type=dataset_type)
     ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
-                             batch_size=batch_size, rank_id=get_rank(),
-                             rank_size=get_group_size(), data_type=dataset_type)
+                             batch_size=batch_size, data_type=dataset_type)
 
     print("ds_train.size: {}".format(ds_train.get_dataset_size()))
     print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
@@ -102,15 +98,11 @@
     model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
 
     eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
-
     callback = LossCallBack(config=config)
     ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
-    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
-                                 config=ckptconfig)
-    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
-    if get_rank() == 0:
-        callback_list.append(ckpoint_cb)
+    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
+    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
+
     model.train(epochs, ds_train,
                 callbacks=callback_list,
                 dataset_sink_mode=(parameter_server and cache_enable))
@@ -120,10 +112,7 @@
     wide_deep_config = WideDeepConfig()
     wide_deep_config.argparse_init()
 
-    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
     context.set_ps_context(enable_ps=True)
-    init()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                      device_num=get_group_size())
 
     train_and_eval(wide_deep_config)
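
Since the standalone entry point no longer initializes a communication group, each role is an ordinary single process distinguished only by MS_ROLE and the MS_* endpoint variables. For a quick smoke test one could start the three roles by hand instead of via the wrapper script; everything below is an assumed illustration, not part of the patch:

    export MS_SCHED_HOST=127.0.0.1 MS_SCHED_PORT=8081
    export MS_SCHED_NUM=1 MS_WORKER_NUM=1 MS_SERVER_NUM=1
    MS_ROLE=MS_SCHED   python -s train_and_eval_parameter_server_standalone.py --parameter_server=1 &
    MS_ROLE=MS_PSERVER python -s train_and_eval_parameter_server_standalone.py --parameter_server=1 &
    MS_ROLE=MS_WORKER  python -s train_and_eval_parameter_server_standalone.py --parameter_server=1 \
        --data_path=/path/to/criteo_mindrecord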