wide_and_deep gpu host_device

pull/4806/head
yao_yf 5 years ago
parent ddd9121968
commit da7c8cbae1

@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Usage: bash run_multigpu_train.sh RANK_SIZE EPOCH_SIZE DATASET VOCAB_SIZE EMB_DIM
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+RANK_SIZE=$1
+EPOCH_SIZE=$2
+DATASET=$3
+VOCAB_SIZE=$4
+EMB_DIM=$5
+mpirun --allow-run-as-root -n $RANK_SIZE \
+    python -s ${self_path}/../train_and_eval_auto_parallel.py \
+    --device_target="GPU" \
+    --data_path=$DATASET \
+    --epochs=$EPOCH_SIZE \
+    --vocab_size=$VOCAB_SIZE \
+    --emb_dim=$EMB_DIM \
+    --dropout_flag=1 \
+    --host_device_mix=1 > log.txt 2>&1 &
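
The launcher forwards five positional arguments to train_and_eval_auto_parallel.py as command-line flags. As an illustration of how such flags are typically consumed on the Python side, here is a minimal argparse sketch; it is not the repository's WideDeepConfig.argparse_init(), and the default values are placeholders, not the project's real defaults.

```python
# Illustrative only: a minimal parser mirroring the flags passed by the launcher.
import argparse

def parse_launcher_flags(argv=None):
    parser = argparse.ArgumentParser(description="wide_and_deep host-device training (sketch)")
    parser.add_argument("--device_target", type=str, default="GPU", choices=["GPU", "Ascend"])
    parser.add_argument("--data_path", type=str, required=True)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--vocab_size", type=int, default=100000)   # placeholder default
    parser.add_argument("--emb_dim", type=int, default=64)          # placeholder default
    parser.add_argument("--dropout_flag", type=int, default=0)
    parser.add_argument("--host_device_mix", type=int, default=0)
    return parser.parse_args(argv)

if __name__ == "__main__":
    args = parse_launcher_flags(["--data_path", "./data", "--host_device_mix", "1"])
    print(args.device_target, args.epochs, args.host_device_mix)
```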

@@ -18,6 +18,7 @@ import time
 from mindspore.train.callback import Callback
 from mindspore import context
 from mindspore.train import ParallelMode
+from mindspore.communication.management import get_rank

 def add_write(file_path, out_str):
     """
@@ -52,7 +53,14 @@ class LossCallBack(Callback):
         wide_loss, deep_loss = cb_params.net_outputs[0].asnumpy(), cb_params.net_outputs[1].asnumpy()
         cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
         cur_num = cb_params.cur_step_num
-        print("===loss===", cb_params.cur_epoch_num, cur_step_in_epoch, wide_loss, deep_loss, flush=True)
+        rank_id = 0
+        parallel_mode = context.get_auto_parallel_context("parallel_mode")
+        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL,
+                             ParallelMode.DATA_PARALLEL):
+            rank_id = get_rank()
+        print("===loss===", rank_id, cb_params.cur_epoch_num, cur_step_in_epoch,
+              wide_loss, deep_loss, flush=True)
         # raise ValueError
         if self._per_print_times != 0 and cur_num % self._per_print_times == 0 and self.config is not None:
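
This hunk tags each loss line with the rank that emitted it, so logs from an mpirun launch with several processes can be told apart. Below is a standalone sketch of the same pattern with the MindSpore calls replaced by stubs (get_parallel_mode and get_rank_safe are invented names, not framework APIs), so the snippet runs without any distributed setup.

```python
# Standalone sketch of the rank-tagged logging pattern above.
import os

DISTRIBUTED_MODES = {"semi_auto_parallel", "auto_parallel", "data_parallel"}

def get_parallel_mode():
    # Stand-in: a real run would query the framework's auto-parallel context.
    return os.environ.get("PARALLEL_MODE", "stand_alone")

def get_rank_safe():
    # Stand-in: a real run would ask the communication backend for the rank.
    return int(os.environ.get("RANK_ID", "0"))

def log_loss(epoch, step, wide_loss, deep_loss):
    rank_id = 0
    if get_parallel_mode() in DISTRIBUTED_MODES:
        rank_id = get_rank_safe()
    # Same field order as the callback: rank, epoch, step, losses.
    print("===loss===", rank_id, epoch, step, wide_loss, deep_loss, flush=True)

log_loss(1, 10, 0.52, 0.48)
```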
@@ -99,13 +107,18 @@ class EvalCallBack(Callback):
         if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
             context.set_auto_parallel_context(strategy_ckpt_save_file="",
                                               strategy_ckpt_load_file="./strategy_train.ckpt")
+        rank_id = 0
+        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL,
+                             ParallelMode.DATA_PARALLEL):
+            rank_id = get_rank()
         start_time = time.time()
         out = self.model.eval(self.eval_dataset, dataset_sink_mode=(not self.host_device_mix))
         end_time = time.time()
         eval_time = int(end_time - start_time)
         time_str = time.strftime("%Y-%m-%d %H:%M%S", time.localtime())
-        out_str = "{}==== EvalCallBack model.eval(): {}; eval_time: {}s".format(time_str, out.values(), eval_time)
+        out_str = "{} == Rank: {} == EvalCallBack model.eval(): {}; eval_time: {}s".\
+            format(time_str, rank_id, out.values(), eval_time)
         print(out_str)
         self.eval_values = out.values()
         add_write(self.eval_file_name, out_str)
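
The eval callback follows the same idea: it measures wall-clock evaluation time and prefixes the result line with the rank. A plain-Python sketch of that timing-and-formatting step is shown below; fake_eval and its metric dict are made up, and the timestamp here uses a fully delimited "%Y-%m-%d %H:%M:%S" pattern.

```python
# Plain-Python sketch of the eval timing/logging step above.
import time

def fake_eval():
    time.sleep(0.1)                      # pretend evaluation takes some time
    return {"auc": 0.803}                # made-up metric dict, like model.eval() output

rank_id = 0                              # would come from get_rank() in a distributed run
start_time = time.time()
out = fake_eval()
eval_time = int(time.time() - start_time)
time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
out_str = "{} == Rank: {} == EvalCallBack model.eval(): {}; eval_time: {}s".format(
    time_str, rank_id, out.values(), eval_time)
print(out_str)
```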

@@ -201,6 +201,7 @@ class WideDeepModel(nn.Cell):
         self.cast = P.Cast()
         if is_auto_parallel and host_device_mix:
             self.dense_layer_1.dropout.dropout_do_mask.set_strategy(((1, get_group_size()),))
+            self.dense_layer_1.dropout.dropout.set_strategy(((1, get_group_size()),))
             self.dense_layer_1.matmul.set_strategy(((1, get_group_size()), (get_group_size(), 1)))
             self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim,
                                                            slice_mode=nn.EmbeddingLookUpSplitMode.TABLE_COLUMN_SLICE)
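
The added line gives the dropout primitive the same (1, group_size) sharding strategy already applied to dropout_do_mask. For intuition about the neighbouring matmul strategy, ((1, n), (n, 1)) splits the reduction dimension across n devices: each device multiplies a column slice of the activation by a row slice of the weight. The NumPy sketch below (illustrative only, not MindSpore) shows that the per-device partial products sum to the full matmul.

```python
# NumPy illustration of a ((1, n), (n, 1)) matmul sharding strategy.
import numpy as np

n = 4                                    # number of model-parallel "devices"
x = np.random.rand(8, 16)                # activations: batch x in_features
w = np.random.rand(16, 32)               # weights: in_features x out_features

x_shards = np.split(x, n, axis=1)        # (1, n): split the second dim of x
w_shards = np.split(w, n, axis=0)        # (n, 1): split the first dim of w

partial = [xs @ ws for xs, ws in zip(x_shards, w_shards)]   # one partial product per device
y = sum(partial)                         # played by an all-reduce in a real parallel run

assert np.allclose(y, x @ w)
print("sharded matmul matches the full matmul:", np.allclose(y, x @ w))
```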

@@ -32,13 +32,6 @@ from src.metrics import AUCMetric
 from src.config import WideDeepConfig
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
-context.set_context(variable_memory_max_size="24GB")
-context.set_context(enable_sparse=True)
-cost_model_context.set_cost_model_context(multi_subgraphs=True)
-init()
 def get_WideDeep_net(config):
     """
@@ -131,6 +124,14 @@ def train_and_eval(config):
 if __name__ == "__main__":
     wide_deep_config = WideDeepConfig()
     wide_deep_config.argparse_init()
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
+    context.set_context(variable_memory_max_size="24GB")
+    context.set_context(enable_sparse=True)
+    cost_model_context.set_cost_model_context(multi_subgraphs=True)
+    if wide_deep_config.device_target == "Ascend":
+        init("hccl")
+    elif wide_deep_config.device_target == "GPU":
+        init("nccl")
     if wide_deep_config.host_device_mix == 1:
         context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, mirror_mean=True)
     else:
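
Moving the context setup from import time into __main__ lets the parsed device_target pick the collective backend: HCCL on Ascend, NCCL on GPU. A standalone sketch of that dispatch is below; init_hccl and init_nccl are hypothetical stubs standing in for mindspore.communication.management.init("hccl") / init("nccl").

```python
# Standalone sketch of the backend-selection pattern used in __main__ above.
def init_hccl():
    print("initializing HCCL collective backend (Ascend)")

def init_nccl():
    print("initializing NCCL collective backend (GPU)")

def init_communication(device_target):
    # Pick the collective-communication backend from the configured target.
    backends = {"Ascend": init_hccl, "GPU": init_nccl}
    try:
        backends[device_target]()
    except KeyError:
        raise ValueError("unsupported device_target: {}".format(device_target))

init_communication("GPU")
```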

@@ -16,6 +16,7 @@
 import os
 import sys
+import numpy as np
 from mindspore import Model, context
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
 from mindspore.train.callback import TimeMonitor
@@ -68,6 +69,7 @@ def train_and_eval(config):
     """
     train_and_eval
     """
+    np.random.seed(1000)
     data_path = config.data_path
     epochs = config.epochs
     print("epochs is {}".format(epochs))
