refine model zoo scripts for ps cache

pull/9901/head
lizhenyu 4 years ago
parent b8faa8293b
commit b70bc5b9d0

@@ -1,58 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &

@@ -1,68 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
done

@@ -1,56 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 \
--dropout_flag=1 >worker.log 2>&1 &

@@ -1,63 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done

@@ -14,17 +14,17 @@
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
# LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
# SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
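# A hypothetical invocation (values are illustrative only, not from this commit),
# launching the worker role on one machine of an 8-device Ascend cluster:
#   bash run_parameter_server_train_cluster.sh 8 10 Ascend /data/criteo_mindrecord \
#     8 1 1 10.0.0.1 8081 MS_WORKER /path/to/rank_table.json 300000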
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
# LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
# SCHED_HOST SCHED_PORT ROLE
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@ export MS_SERVER_NUM=$7
export MS_SCHED_HOST=$8
export MS_SCHED_PORT=$9
export MS_ROLE=${10}
echo "=====Role is $MS_ROLE======"
export RANK_TABLE_FILE=${11}
export VOCAB_CACHE_SIZE=${12}
if [[ ! -n "${12}" ]]; then
export VOCAB_CACHE_SIZE=0
fi
echo "=====Role is $MS_ROLE======"
if [ "$MS_ROLE" == "MS_SCHED" ];then
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done
if [[ "$MS_ROLE" == "MS_SCHED" ]]; then
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
fi
if [ "$MS_ROLE" == "MS_PSERVER" ];then
for((i=0;i<$LOCAL_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done
if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then
for((i=0;i<$LOCAL_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done
fi
if [ "$MS_ROLE" == "MS_WORKER" ];then
for((i=0;i<$LOCAL_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done
if [[ "$MS_ROLE" == "MS_WORKER" ]]; then
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
for((i=0;i<$LOCAL_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
done
fi
fi

@@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
# SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
# VOCAB_CACHE_SIZE
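# A hypothetical invocation (values are illustrative only, not from this commit):
#   bash run_parameter_server_train_distribute.sh 8 10 Ascend /data/criteo_mindrecord \
#     1 127.0.0.1 8081 /path/to/rank_table.json 300000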
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export RANK_TABLE_FILE=$8
export VOCAB_CACHE_SIZE=$9
if [[ ! -n "$9" ]]; then
export VOCAB_CACHE_SIZE=0
fi
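# Launch the scheduler first, then the parameter servers, then the workers;
# each process runs in its own working directory so logs do not collide.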
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
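# On GPU the workers are spawned through mpirun; on other targets (e.g. Ascend)
# one worker is launched per device with RANK_ID/DEVICE_ID set explicitly.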
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE \
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
done
fi

@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
# SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
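# A hypothetical invocation (values are illustrative only, not from this commit):
#   bash run_parameter_server_train_standalone.sh 10 GPU /data/criteo_mindrecord 1 127.0.0.1 8081 0 300000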
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
DEVICE_ID=$7
export VOCAB_CACHE_SIZE=$8
if [[ ! -n "$8" ]]; then
export VOCAB_CACHE_SIZE=0
fi
# Set device id
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
if [[ ! -n "$DEVICE_ID" ]]; then
export CUDA_VISIBLE_DEVICES=0
else
export CUDA_VISIBLE_DEVICES=$DEVICE_ID
fi
else
if [[ ! -n "$DEVICE_ID" ]]; then
export DEVICE_ID=0
else
export DEVICE_ID=$DEVICE_ID
fi
fi
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &

@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
elif parameter_server:
cache_enable = self.vocab_cache_size > 0
target = 'DEVICE' if cache_enable else 'CPU'
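# When the cache is disabled, the lookup stays on CPU and falls back to sparse gradients.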
if not cache_enable:
sparse = True
if is_auto_parallel and config.full_batch and cache_enable:
self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""train distribute on parameter server."""
import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@ from src.config import WideDeepConfig
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
def get_WideDeep_net(config):
def get_wide_deep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config)
loss_net = VirtualDatasetCellTriple(loss_net)
wide_deep_net = WideDeepModel(config)
loss_net = NetWithLossClass(wide_deep_net, config)
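# The virtual dataset wrapper is only needed on the cache path, where training
# runs under auto parallel (see the parallel-mode selection in __main__).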
if cache_enable:
loss_net = VirtualDatasetCellTriple(loss_net)
train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
cache_enable=bool(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(WideDeep_net)
eval_net = VirtualDatasetCellTriple(eval_net)
cache_enable=(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(wide_deep_net)
if cache_enable:
eval_net = VirtualDatasetCellTriple(eval_net)
return train_net, eval_net
@@ -51,7 +53,6 @@ class ModelBuilder():
"""
ModelBuilder
"""
def __init__(self):
pass
@@ -67,13 +68,14 @@ class ModelBuilder():
return hooks
def get_net(self, config):
return get_WideDeep_net(config)
return get_wide_deep_net(config)
def train_and_eval(config):
"""
test_train_eval
"""
set_seed(1000)
data_path = config.data_path
batch_size = config.batch_size
epochs = config.epochs
@@ -83,6 +85,9 @@ def train_and_eval(config):
dataset_type = DataType.MINDRECORD
else:
dataset_type = DataType.H5
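# With the embedding cache enabled, full-batch input is forced so that auto
# parallel sees the complete batch on every worker (assumption based on the
# full_batch handling below).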
parameter_server = bool(config.parameter_server)
if cache_enable:
config.full_batch = True
print("epochs is {}".format(epochs))
if config.full_batch:
context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@ def train_and_eval(config):
train_net.set_train()
auc_metric = AUCMetric()
model = Model(train_net, eval_network=eval_net,
metrics={"auc": auc_metric})
model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
eval_callback = EvalCallBack(
model, ds_eval, auc_metric, config)
eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
callback = LossCallBack(config=config, per_print_times=20)
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
keep_checkpoint_max=5, integrated_save=False)
callback = LossCallBack(config=config)
if cache_enable:
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
keep_checkpoint_max=5, integrated_save=False)
else:
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
callback_list = [TimeMonitor(
ds_train.get_dataset_size()), eval_callback, callback]
callback_list.append(ckpoint_cb)
model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
config=ckptconfig)
if cache_enable:
context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
if get_rank() == 0:
callback_list.append(ckpoint_cb)
model.train(epochs, ds_train,
callbacks=callback_list,
dataset_sink_mode=bool(parameter_server and cache_enable))
if __name__ == "__main__":
wide_deep_config = WideDeepConfig()
wide_deep_config.argparse_init()
context.set_context(mode=context.GRAPH_MODE,
device_target=wide_deep_config.device_target, save_graphs=True)
context.set_context(variable_memory_max_size="24GB")
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
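# The 24GB device memory cap and sparse mode apply only when the embedding
# cache runs on a non-GPU (i.e. Ascend) target.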
cache_enable = wide_deep_config.vocab_cache_size > 0
if cache_enable and wide_deep_config.device_target != "GPU":
context.set_context(variable_memory_max_size="24GB")
context.set_context(enable_sparse=True)
context.set_ps_context(enable_ps=True)
init()
context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))
context.set_auto_parallel_context(
parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
if cache_enable:
context.set_auto_parallel_context(
parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
else:
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=get_group_size())
train_and_eval(wide_deep_config)

@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""train standalone on parameter server."""
import os
import sys
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
context.set_context(enable_sparse=True)
def get_WideDeep_net(config):
def get_wide_deep_net(config):
"""
Get network of wide&deep model.
"""
WideDeep_net = WideDeepModel(config)
loss_net = NetWithLossClass(WideDeep_net, config)
wide_deep_net = WideDeepModel(config)
loss_net = NetWithLossClass(wide_deep_net, config)
train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
cache_enable=bool(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(WideDeep_net)
cache_enable=(config.vocab_cache_size > 0))
eval_net = PredictWithSigmoid(wide_deep_net)
return train_net, eval_net
@@ -64,7 +62,7 @@ class ModelBuilder():
return hooks
def get_net(self, config):
return get_WideDeep_net(config)
return get_wide_deep_net(config)
def train_and_eval(config):
@@ -82,14 +80,12 @@ def train_and_eval(config):
else:
dataset_type = DataType.H5
parameter_server = bool(config.parameter_server)
cache_enable = bool(config.vocab_cache_size > 0)
cache_enable = config.vocab_cache_size > 0
print("epochs is {}".format(epochs))
ds_train = create_dataset(data_path, train_mode=True, epochs=1,
batch_size=batch_size, rank_id=get_rank(),
rank_size=get_group_size(), data_type=dataset_type)
batch_size=batch_size, data_type=dataset_type)
ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
batch_size=batch_size, rank_id=get_rank(),
rank_size=get_group_size(), data_type=dataset_type)
batch_size=batch_size, data_type=dataset_type)
print("ds_train.size: {}".format(ds_train.get_dataset_size()))
print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
@@ -102,15 +98,11 @@ def train_and_eval(config):
model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
callback = LossCallBack(config=config)
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
config=ckptconfig)
callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
if get_rank() == 0:
callback_list.append(ckpoint_cb)
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
model.train(epochs, ds_train,
callbacks=callback_list,
dataset_sink_mode=(parameter_server and cache_enable))
@ -120,10 +112,7 @@ if __name__ == "__main__":
wide_deep_config = WideDeepConfig()
wide_deep_config.argparse_init()
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
context.set_ps_context(enable_ps=True)
init()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
device_num=get_group_size())
train_and_eval(wide_deep_config)