From 41715677e769a5a35aa3bf6799af746a9fa57e74 Mon Sep 17 00:00:00 2001 From: yao_yf Date: Thu, 11 Jun 2020 10:43:20 +0800 Subject: [PATCH] fix_modelzoo_widedeep_run_multinup_train --- model_zoo/wide_and_deep/README.md | 44 +++++++++++-------- model_zoo/wide_and_deep/{test.py => eval.py} | 0 .../script/run_auto_parallel_train.sh | 35 +++++++++++++++ .../script/run_multinpu_train.sh | 4 +- model_zoo/wide_and_deep/src/config.py | 4 +- model_zoo/wide_and_deep/src/wide_and_deep.py | 26 ++++++----- .../{train_and_test.py => train_and_eval.py} | 0 ...lel.py => train_and_eval_auto_parallel.py} | 8 ++-- ...multinpu.py => train_and_eval_multinpu.py} | 4 +- 9 files changed, 88 insertions(+), 37 deletions(-) rename model_zoo/wide_and_deep/{test.py => eval.py} (100%) create mode 100644 model_zoo/wide_and_deep/script/run_auto_parallel_train.sh rename model_zoo/wide_and_deep/{train_and_test.py => train_and_eval.py} (100%) rename model_zoo/wide_and_deep/{train_and_test_multinpu_auto_parallel.py => train_and_eval_auto_parallel.py} (94%) rename model_zoo/wide_and_deep/{train_and_test_multinpu.py => train_and_eval_multinpu.py} (98%) diff --git a/model_zoo/wide_and_deep/README.md b/model_zoo/wide_and_deep/README.md index 54da9f9f4a..54367ef173 100644 --- a/model_zoo/wide_and_deep/README.md +++ b/model_zoo/wide_and_deep/README.md @@ -13,26 +13,28 @@ The Criteo datasets are used for model training and evaluation. 
The entire code structure is as following: ``` |--- wide_and_deep/ - train_and_test.py "Entrance of Wide&Deep model training and evaluation" - test.py "Entrance of Wide&Deep model evaluation" - train.py "Entrance of Wide&Deep model training" - train_and_test_multinpu.py "Entrance of Wide&Deep model data parallel training and evaluation" - |--- src/ "entrance of training and evaluation" - config.py "parameters configuration" - dataset.py "Dataset loader class" - process_data.py "process dataset" - preprocess_data.py "pre_process dataset" - WideDeep.py "Model structure" - callbacks.py "Callback class for training and evaluation" - metrics.py "Metric class" - |--- script/ "run shell dir" - run_multinpu_train.sh "run data parallel" + train_and_eval.py "Entrance of Wide&Deep model training and evaluation" + eval.py "Entrance of Wide&Deep model evaluation" + train.py "Entrance of Wide&Deep model training" + train_and_eval_multinpu.py "Entrance of Wide&Deep model data parallel training and evaluation" + train_and_eval_auto_parallel.py "Entrance of Wide&Deep model auto parallel training and evaluation" + |--- src/ "Entrance of training and evaluation" + config.py "Parameters configuration" + dataset.py "Dataset loader class" + process_data.py "Process dataset" + preprocess_data.py "Preprocess dataset" + wide_and_deep.py "Model structure" + callbacks.py "Callback class for training and evaluation" + metrics.py "Metric class" + |--- script/ "Run shell dir" + run_multinpu_train.sh "Run data parallel" + run_auto_parallel_train.sh "Run auto parallel" ``` ### Train and evaluate model To train and evaluate the model, command as follows: ``` -python train_and_test.py +python train_and_eval.py ``` Arguments: * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. @@ -44,6 +46,7 @@ Arguments: * `--emb_dim`: The dense embedding dimension of sparse feature. * `--deep_layers_dim`: The dimension of all deep layers. * `--deep_layers_act`: The activation of all deep layers. 
+ * `--dropout_flag`: Whether to apply dropout (integer flag: 0 disables, non-zero enables; default 0). * `--keep_prob`: The rate to keep in dropout layer. * `--ckpt_path`:The location of the checkpoint file. * `--eval_file_name` : Eval output file. @@ -63,6 +66,7 @@ Arguments: * `--emb_dim`: The dense embedding dimension of sparse feature. * `--deep_layers_dim`: The dimension of all deep layers. * `--deep_layers_act`: The activation of all deep layers. + * `--dropout_flag`: Whether to apply dropout (integer flag: 0 disables, non-zero enables; default 0). * `--keep_prob`: The rate to keep in dropout layer. * `--ckpt_path`:The location of the checkpoint file. * `--eval_file_name` : Eval output file. @@ -70,13 +74,17 @@ Arguments: To train the model in distributed, command as follows: ``` -# configure environment path, RANK_TABLE_FILE, RANK_SIZE, MINDSPORE_HCCL_CONFIG_PATH before training -bash run_multinpu_train.sh +# configure environment path before training +bash run_multinpu_train.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE +``` +``` +# configure environment path before training +bash run_auto_parallel_train.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE ``` To evaluate the model, command as follows: ``` -python test.py +python eval.py ``` Arguments: * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. diff --git a/model_zoo/wide_and_deep/test.py b/model_zoo/wide_and_deep/eval.py similarity index 100% rename from model_zoo/wide_and_deep/test.py rename to model_zoo/wide_and_deep/eval.py diff --git a/model_zoo/wide_and_deep/script/run_auto_parallel_train.sh b/model_zoo/wide_and_deep/script/run_auto_parallel_train.sh new file mode 100644 index 0000000000..9e9226a23a --- /dev/null +++ b/model_zoo/wide_and_deep/script/run_auto_parallel_train.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# Usage: bash run_auto_parallel_train.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE +execute_path=$(pwd) +script_self=$(readlink -f "$0") +self_path=$(dirname "${script_self}") +export RANK_SIZE=$1 +export EPOCH_SIZE=$2 +export DATASET=$3 +export RANK_TABLE_FILE=$4 +export MINDSPORE_HCCL_CONFIG_PATH=$4 + +for((i=0;i<$RANK_SIZE;i++)); +do + rm -rf ${execute_path}/device_$i/ + mkdir ${execute_path}/device_$i/ + cd ${execute_path}/device_$i/ || exit + export RANK_ID=$i + export DEVICE_ID=$i + python -s ${self_path}/../train_and_eval_auto_parallel.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 & +done diff --git a/model_zoo/wide_and_deep/script/run_multinpu_train.sh b/model_zoo/wide_and_deep/script/run_multinpu_train.sh index ea3c711d1b..c05156ff7e 100644 --- a/model_zoo/wide_and_deep/script/run_multinpu_train.sh +++ b/model_zoo/wide_and_deep/script/run_multinpu_train.sh @@ -24,12 +24,12 @@ export DATASET=$3 export RANK_TABLE_FILE=$4 export MINDSPORE_HCCL_CONFIG_PATH=$4 -for((i=0;i<=$RANK_SIZE;i++)); +for((i=0;i<$RANK_SIZE;i++)); do rm -rf ${execute_path}/device_$i/ mkdir ${execute_path}/device_$i/ cd ${execute_path}/device_$i/ || exit export RANK_ID=$i export DEVICE_ID=$i - python -s ${self_path}/../train_and_test_multinpu.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 & + python -s ${self_path}/../train_and_eval_multinpu.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 & done diff --git a/model_zoo/wide_and_deep/src/config.py b/model_zoo/wide_and_deep/src/config.py index 
71031b7b95..f3488287af 100644 --- a/model_zoo/wide_and_deep/src/config.py +++ b/model_zoo/wide_and_deep/src/config.py @@ -31,7 +31,7 @@ def argparse_init(): parser.add_argument("--deep_layer_dim", type=int, nargs='+', default=[1024, 512, 256, 128]) parser.add_argument("--deep_layer_act", type=str, default='relu') parser.add_argument("--keep_prob", type=float, default=1.0) - + parser.add_argument("--dropout_flag", type=int, default=0) parser.add_argument("--output_path", type=str, default="./output/") parser.add_argument("--ckpt_path", type=str, default="./checkpoints/") parser.add_argument("--eval_file_name", type=str, default="eval.log") @@ -86,7 +86,7 @@ class WideDeepConfig(): self.weight_bias_init = ['normal', 'normal'] self.emb_init = 'normal' self.init_args = [-0.01, 0.01] - self.dropout_flag = False + self.dropout_flag = bool(args.dropout_flag) self.l2_coef = 8e-5 self.output_path = args.output_path diff --git a/model_zoo/wide_and_deep/src/wide_and_deep.py b/model_zoo/wide_and_deep/src/wide_and_deep.py index 9ddd9e12cd..1ba86dcc76 100644 --- a/model_zoo/wide_and_deep/src/wide_and_deep.py +++ b/model_zoo/wide_and_deep/src/wide_and_deep.py @@ -19,7 +19,7 @@ import mindspore.common.dtype as mstype from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.ops import operations as P -# from mindspore.nn import Dropout +from mindspore.nn import Dropout from mindspore.nn.optim import Adam, FTRL # from mindspore.nn.metrics import Metric from mindspore.common.initializer import Uniform, initializer @@ -82,7 +82,7 @@ class DenseLayer(nn.Cell): """ def __init__(self, input_dim, output_dim, weight_bias_init, act_str, - keep_prob=0.7, scale_coef=1.0, convert_dtype=True): + keep_prob=0.7, scale_coef=1.0, convert_dtype=True, drop_out=False): super(DenseLayer, self).__init__() weight_init, bias_init = weight_bias_init self.weight = init_method( @@ -92,11 +92,12 @@ class DenseLayer(nn.Cell): self.matmul = P.MatMul(transpose_b=False) 
self.bias_add = P.BiasAdd() self.cast = P.Cast() - #self.dropout = Dropout(keep_prob=keep_prob) + self.dropout = Dropout(keep_prob=keep_prob) self.mul = P.Mul() self.realDiv = P.RealDiv() self.scale_coef = scale_coef self.convert_dtype = convert_dtype + self.drop_out = drop_out def _init_activation(self, act_str): act_str = act_str.lower() @@ -110,8 +111,8 @@ class DenseLayer(nn.Cell): def construct(self, x): x = self.act_func(x) - # if self.training: - # x = self.dropout(x) + if self.training and self.drop_out: + x = self.dropout(x) x = self.mul(x, self.scale_coef) if self.convert_dtype: x = self.cast(x, mstype.float16) @@ -163,23 +164,28 @@ class WideDeepModel(nn.Cell): self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, - self.deep_layer_act, convert_dtype=True) + self.deep_layer_act, + convert_dtype=True, drop_out=config.dropout_flag) self.gather_v2 = P.GatherV2() self.mul = P.Mul() diff --git a/model_zoo/wide_and_deep/train_and_test.py b/model_zoo/wide_and_deep/train_and_eval.py similarity index 100% rename from model_zoo/wide_and_deep/train_and_test.py rename to 
model_zoo/wide_and_deep/train_and_eval.py diff --git a/model_zoo/wide_and_deep/train_and_test_multinpu_auto_parallel.py b/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py similarity index 94% rename from model_zoo/wide_and_deep/train_and_test_multinpu_auto_parallel.py rename to model_zoo/wide_and_deep/train_and_eval_auto_parallel.py index 9659d17223..780c95540c 100644 --- a/model_zoo/wide_and_deep/train_and_test_multinpu_auto_parallel.py +++ b/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py @@ -71,11 +71,10 @@ class ModelBuilder(): return get_WideDeep_net(config) -def test_train_eval(): +def train_and_eval(config): """ test_train_eval """ - config = WideDeepConfig() data_path = config.data_path batch_size = config.batch_size epochs = config.epochs @@ -109,9 +108,12 @@ def test_train_eval(): ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig) + context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt") model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]) if __name__ == "__main__": - test_train_eval() + wide_deep_config = WideDeepConfig() + wide_deep_config.argparse_init() + train_and_eval(wide_deep_config) diff --git a/model_zoo/wide_and_deep/train_and_test_multinpu.py b/model_zoo/wide_and_deep/train_and_eval_multinpu.py similarity index 98% rename from model_zoo/wide_and_deep/train_and_test_multinpu.py rename to model_zoo/wide_and_deep/train_and_eval_multinpu.py index e68bd47f39..37ade572bf 100644 --- a/model_zoo/wide_and_deep/train_and_test_multinpu.py +++ b/model_zoo/wide_and_deep/train_and_eval_multinpu.py @@ -66,7 +66,7 @@ class ModelBuilder(): return get_WideDeep_net(config) -def test_train_eval(config): +def train_and_eval(config): """ test_train_eval """ @@ -105,4 +105,4 @@ def test_train_eval(config): 
if __name__ == "__main__": wide_deep_config = WideDeepConfig() wide_deep_config.argparse_init() - test_train_eval(wide_deep_config) + train_and_eval(wide_deep_config)