parent
b8faa8293b
commit
b70bc5b9d0
@ -1,58 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Launch a parameter-server training job that runs
# train_and_eval_parameter_server_cache_distribute.py with device target GPU:
# one scheduler, MS_SERVER_NUM parameter servers, and RANK_SIZE workers
# started through mpirun. Every process is started in the background from a
# freshly recreated directory under the invocation directory and writes its
# output to a log file inside that directory.
#
# Positional arguments:
#   $1  RANK_SIZE   worker count (also exported as MS_WORKER_NUM)
#   $2  EPOCH_SIZE  value passed to --epochs
#   $3  DATASET     value passed to --data_path
#   $4  number of parameter servers (exported as MS_SERVER_NUM)
#   $5  scheduler host (exported as MS_SCHED_HOST)
#   $6  scheduler port (exported as MS_SCHED_PORT)
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server_cache_distribute.py"

export RANK_SIZE="$1"
export EPOCH_SIZE="$2"
export DATASET="$3"
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM="${RANK_SIZE}"
export MS_SERVER_NUM="$4"
export MS_SCHED_HOST="$5"
export MS_SCHED_PORT="$6"

# Arguments shared by every role. Kept in an array so that values containing
# whitespace (e.g. the dataset path) reach python as single words.
common_args=(
  --device_target='GPU'
  "--data_path=${DATASET}"
  "--epochs=${EPOCH_SIZE}"
  --parameter_server=1
  --vocab_cache_size=300000
)

# Recreate directory $1 from scratch and make it the working directory.
# Exits the script when the directory cannot be entered.
enter_fresh_dir() {
  local dir=$1
  # ':?' aborts instead of running 'rm -rf' on an empty path.
  rm -rf -- "${dir:?}"
  mkdir -- "${dir}"
  cd -- "${dir}" || exit
}

export MS_ROLE=MS_SCHED
# MS_SCHED_NUM is fixed to 1 above, so this creates sched_0 only.
for ((i = 0; i < MS_SCHED_NUM; i++)); do
  enter_fresh_dir "${execute_path}/sched_${i}/"
  python -s "${train_script}" "${common_args[@]}" >"sched_${i}.log" 2>&1 &
done

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  enter_fresh_dir "${execute_path}/server_${i}/"
  python -s "${train_script}" "${common_args[@]}" >"server_${i}.log" 2>&1 &
done

export MS_ROLE=MS_WORKER
enter_fresh_dir "${execute_path}/worker/"
# All workers are spawned by a single mpirun and share one log file.
mpirun --allow-run-as-root -n "${RANK_SIZE}" \
  python -s "${train_script}" "${common_args[@]}" \
  --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &
|
@ -1,68 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch a parameter-server training job that runs
# train_and_eval_parameter_server_cache_distribute.py: one scheduler,
# MS_SERVER_NUM parameter servers and RANK_SIZE workers. Each process is
# started in the background from its own freshly recreated directory under the
# invocation directory, with RANK_ID and DEVICE_ID exported as its index, and
# writes its output to a log file inside that directory.
#
# Positional arguments:
#   $1  RANK_SIZE        worker count (also exported as MS_WORKER_NUM)
#   $2  EPOCH_SIZE       value passed to --epochs
#   $3  DATASET          value passed to --data_path
#   $4  RANK_TABLE_FILE  exported unchanged
#   $5  number of parameter servers (exported as MS_SERVER_NUM)
#   $6  scheduler host (exported as MS_SCHED_HOST)
#   $7  scheduler port (exported as MS_SCHED_PORT)
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server_cache_distribute.py"

export RANK_SIZE="$1"
export EPOCH_SIZE="$2"
export DATASET="$3"
export RANK_TABLE_FILE="$4"
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM="${RANK_SIZE}"
export MS_SERVER_NUM="$5"
export MS_SCHED_HOST="$6"
export MS_SCHED_PORT="$7"

# Arguments shared by every role. Kept in an array so that values containing
# whitespace (e.g. the dataset path) reach python as single words.
common_args=(
  "--data_path=${DATASET}"
  "--epochs=${EPOCH_SIZE}"
  --parameter_server=1
  --vocab_cache_size=300000
)

# Start one background training process for the role currently in MS_ROLE.
# Arguments:
#   $1    directory/log name prefix (sched, server or worker)
#   $2    process index; exported as both RANK_ID and DEVICE_ID
#   $3..  extra arguments appended to the python command line
# Side effects: recreates ${execute_path}/<prefix>_<index>/, changes the
# working directory to it, and exits the script if that cd fails.
launch_node() {
  local prefix=$1
  local idx=$2
  shift 2
  local workdir="${execute_path}/${prefix}_${idx}/"
  # ':?' aborts instead of running 'rm -rf' on an empty path.
  rm -rf -- "${workdir:?}"
  mkdir -- "${workdir}"
  cd -- "${workdir}" || exit
  export RANK_ID="${idx}"
  export DEVICE_ID="${idx}"
  python -s "${train_script}" "${common_args[@]}" "$@" \
    >"${prefix}_${idx}.log" 2>&1 &
}

export MS_ROLE=MS_SCHED
# MS_SCHED_NUM is fixed to 1 above, so this launches sched_0 only.
for ((i = 0; i < MS_SCHED_NUM; i++)); do
  launch_node sched "${i}"
done

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  launch_node server "${i}"
done

export MS_ROLE=MS_WORKER
for ((i = 0; i < MS_WORKER_NUM; i++)); do
  launch_node worker "${i}" --full_batch=1 --dropout_flag=1
done
|
@ -1,56 +0,0 @@
|
||||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch a single-worker parameter-server training job that runs
# train_and_eval_parameter_server.py: one scheduler, MS_SERVER_NUM parameter
# servers and one worker. Each process is started in the background from its
# own freshly recreated directory under the invocation directory and writes
# its output to a log file inside that directory.
#
# Positional arguments:
#   $1  EPOCH_SIZE     value passed to --epochs
#   $2  DEVICE_TARGET  value passed to --device_target
#   $3  DATASET        value passed to --data_path
#   $4  number of parameter servers (exported as MS_SERVER_NUM)
#   $5  scheduler host (exported as MS_SCHED_HOST)
#   $6  scheduler port (exported as MS_SCHED_PORT)
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server.py"

export EPOCH_SIZE="$1"
export DEVICE_TARGET="$2"
export DATASET="$3"
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM="$4"
export MS_SCHED_HOST="$5"
export MS_SCHED_PORT="$6"

# Arguments shared by every role. Kept in an array so that values containing
# whitespace (e.g. the dataset path) reach python as single words.
common_args=(
  "--epochs=${EPOCH_SIZE}"
  "--device_target=${DEVICE_TARGET}"
  "--data_path=${DATASET}"
  --parameter_server=1
  --vocab_cache_size=300000
)

# Recreate directory $1 from scratch and make it the working directory.
# Exits the script when the directory cannot be entered.
enter_fresh_dir() {
  local dir=$1
  # ':?' aborts instead of running 'rm -rf' on an empty path.
  rm -rf -- "${dir:?}"
  mkdir -- "${dir}"
  cd -- "${dir}" || exit
}

export MS_ROLE=MS_SCHED
enter_fresh_dir "${execute_path}/sched/"
# The scheduler is not part of any loop, so there is no index to reuse;
# pin it to device 0 instead of exporting an unset loop counter.
export DEVICE_ID=0
python -s "${train_script}" "${common_args[@]}" >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  enter_fresh_dir "${execute_path}/server_${i}/"
  export DEVICE_ID="${i}"
  python -s "${train_script}" "${common_args[@]}" >"server_${i}.log" 2>&1 &
done

export MS_ROLE=MS_WORKER
enter_fresh_dir "${execute_path}/worker/"
# Do not inherit the server loop's final counter (which equals the server
# count); the single worker is pinned to device 0.
# NOTE(review): device 0 is an assumed intent - confirm against callers.
export DEVICE_ID=0
python -s "${train_script}" "${common_args[@]}" \
  --dropout_flag=1 >worker.log 2>&1 &
|
@ -1,63 +0,0 @@
|
||||
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

# Launch a parameter-server training job that runs
# train_and_eval_parameter_server.py: one scheduler, MS_SERVER_NUM parameter
# servers and RANK_SIZE workers. Each process is started in the background
# from its own freshly recreated directory under the invocation directory,
# with RANK_ID and DEVICE_ID exported as its index, and writes its output to a
# log file inside that directory.
#
# Positional arguments:
#   $1  RANK_SIZE        worker count (also exported as MS_WORKER_NUM)
#   $2  EPOCH_SIZE       value passed to --epochs
#   $3  DATASET          value passed to --data_path
#   $4  RANK_TABLE_FILE  exported unchanged
#   $5  number of parameter servers (exported as MS_SERVER_NUM)
#   $6  scheduler host (exported as MS_SCHED_HOST)
#   $7  scheduler port (exported as MS_SCHED_PORT)
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server.py"

export RANK_SIZE="$1"
export EPOCH_SIZE="$2"
export DATASET="$3"
export RANK_TABLE_FILE="$4"

export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM="${RANK_SIZE}"
export MS_SERVER_NUM="$5"
export MS_SCHED_HOST="$6"
export MS_SCHED_PORT="$7"

# Arguments shared by every role. Kept in an array so that values containing
# whitespace (e.g. the dataset path) reach python as single words.
common_args=(
  "--data_path=${DATASET}"
  "--epochs=${EPOCH_SIZE}"
  --parameter_server=1
)

# Start one background training process for the role currently in MS_ROLE.
# Arguments:
#   $1  directory/log name prefix (sched, server or worker)
#   $2  process index; exported as both RANK_ID and DEVICE_ID
# Side effects: recreates ${execute_path}/<prefix>_<index>/, changes the
# working directory to it, and exits the script if that cd fails.
launch_node() {
  local prefix=$1
  local idx=$2
  local workdir="${execute_path}/${prefix}_${idx}/"
  # ':?' aborts instead of running 'rm -rf' on an empty path.
  rm -rf -- "${workdir:?}"
  mkdir -- "${workdir}"
  cd -- "${workdir}" || exit
  export RANK_ID="${idx}"
  export DEVICE_ID="${idx}"
  python -s "${train_script}" "${common_args[@]}" \
    >"${prefix}_${idx}.log" 2>&1 &
}

export MS_ROLE=MS_SCHED
# MS_SCHED_NUM is fixed to 1 above, so this launches sched_0 only.
for ((i = 0; i < MS_SCHED_NUM; i++)); do
  launch_node sched "${i}"
done

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  launch_node server "${i}"
done

export MS_ROLE=MS_WORKER
for ((i = 0; i < MS_WORKER_NUM; i++)); do
  launch_node worker "${i}"
done
|
@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
|
||||
# Usage:
#   bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET \
#        SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE [VOCAB_CACHE_SIZE]
#
# Launches one scheduler, SERVER_NUM parameter servers and RANK_SIZE workers
# running train_and_eval_parameter_server_distribute.py. Each process is
# started in the background from a freshly recreated directory under the
# invocation directory and writes its output to a log file inside it.
# VOCAB_CACHE_SIZE defaults to 0 when the ninth argument is missing or empty.
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server_distribute.py"

export RANK_SIZE="$1"
export EPOCH_SIZE="$2"
export DEVICE_TARGET="$3"
export DATASET="$4"
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM="${RANK_SIZE}"
export MS_SERVER_NUM="$5"
export MS_SCHED_HOST="$6"
export MS_SCHED_PORT="$7"
export RANK_TABLE_FILE="$8"
export VOCAB_CACHE_SIZE="${9:-0}"

# Arguments shared by every role. Kept in an array so that values containing
# whitespace (e.g. the dataset path) reach python as single words.
common_args=(
  "--device_target=${DEVICE_TARGET}"
  "--data_path=${DATASET}"
  "--epochs=${EPOCH_SIZE}"
  --parameter_server=1
  "--vocab_cache_size=${VOCAB_CACHE_SIZE}"
)

# Recreate directory $1 from scratch and make it the working directory.
# Exits the script when the directory cannot be entered.
enter_fresh_dir() {
  local dir=$1
  # ':?' aborts instead of running 'rm -rf' on an empty path.
  rm -rf -- "${dir:?}"
  mkdir -- "${dir}"
  cd -- "${dir}" || exit
}

export MS_ROLE=MS_SCHED
enter_fresh_dir "${execute_path}/sched/"
python -s "${train_script}" "${common_args[@]}" >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  enter_fresh_dir "${execute_path}/server_${i}/"
  python -s "${train_script}" "${common_args[@]}" >"server_${i}.log" 2>&1 &
done

export MS_ROLE=MS_WORKER
if [[ "${DEVICE_TARGET}" == "GPU" ]]; then
  # GPU: a single mpirun spawns all workers, sharing one directory and log.
  enter_fresh_dir "${execute_path}/worker/"
  mpirun --allow-run-as-root -n "${RANK_SIZE}" \
    python -s "${train_script}" "${common_args[@]}" \
    --dropout_flag=1 >worker.log 2>&1 &
else
  # Other targets: one python process per worker, each in its own directory
  # with RANK_ID and DEVICE_ID set to the worker index.
  for ((i = 0; i < MS_WORKER_NUM; i++)); do
    enter_fresh_dir "${execute_path}/worker_${i}/"
    export RANK_ID="${i}"
    export DEVICE_ID="${i}"
    python -s "${train_script}" "${common_args[@]}" \
      --dropout_flag=1 >"worker_${i}.log" 2>&1 &
  done
fi
|
||||
|
@ -0,0 +1,79 @@
|
||||
#!/bin/bash
|
||||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
|
||||
# Usage:
#   bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM \
#        SCHED_HOST SCHED_PORT [DEVICE_ID] [VOCAB_CACHE_SIZE]
#
# Launches one scheduler, SERVER_NUM parameter servers and one worker running
# train_and_eval_parameter_server_standalone.py. Each process is started in
# the background from a freshly recreated directory under the invocation
# directory and writes its output to a log file inside it.
# DEVICE_ID and VOCAB_CACHE_SIZE both default to 0 when missing or empty.
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server_standalone.py"

export EPOCH_SIZE="$1"
export DEVICE_TARGET="$2"
export DATASET="$3"
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM="$4"
export MS_SCHED_HOST="$5"
export MS_SCHED_PORT="$6"
DEVICE_ID="$7"
export VOCAB_CACHE_SIZE="${8:-0}"

# Set device id: for GPU the choice is exported through CUDA_VISIBLE_DEVICES,
# for every other target through DEVICE_ID.
if [[ "${DEVICE_TARGET}" == "GPU" ]]; then
  export CUDA_VISIBLE_DEVICES="${DEVICE_ID:-0}"
else
  export DEVICE_ID="${DEVICE_ID:-0}"
fi

# Arguments shared by every role. Kept in an array so that values containing
# whitespace (e.g. the dataset path) reach python as single words.
common_args=(
  "--device_target=${DEVICE_TARGET}"
  "--epochs=${EPOCH_SIZE}"
  "--data_path=${DATASET}"
  --parameter_server=1
  "--vocab_cache_size=${VOCAB_CACHE_SIZE}"
)

# Recreate directory $1 from scratch and make it the working directory.
# Exits the script when the directory cannot be entered.
enter_fresh_dir() {
  local dir=$1
  # ':?' aborts instead of running 'rm -rf' on an empty path.
  rm -rf -- "${dir:?}"
  mkdir -- "${dir}"
  cd -- "${dir}" || exit
}

export MS_ROLE=MS_SCHED
enter_fresh_dir "${execute_path}/sched/"
python -s "${train_script}" "${common_args[@]}" >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  enter_fresh_dir "${execute_path}/server_${i}/"
  python -s "${train_script}" "${common_args[@]}" >"server_${i}.log" 2>&1 &
done

export MS_ROLE=MS_WORKER
enter_fresh_dir "${execute_path}/worker/"
python -s "${train_script}" "${common_args[@]}" \
  --dropout_flag=1 >worker.log 2>&1 &
|
Loading…
Reference in new issue