From 172999fefa579249a7a7e800cf7dcaccd7505c5e Mon Sep 17 00:00:00 2001
From: panfengfeng
Date: Tue, 27 Oct 2020 19:01:01 +0800
Subject: [PATCH] fix nasnet & efficientnet scripts

---
 model_zoo/official/cv/efficientnet/src/config.py   |  1 +
 model_zoo/official/cv/efficientnet/train.py        | 14 +++++++++-----
 .../nasnet/scripts/run_distribute_train_for_gpu.sh |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/model_zoo/official/cv/efficientnet/src/config.py b/model_zoo/official/cv/efficientnet/src/config.py
index 09ea624716..092669d471 100644
--- a/model_zoo/official/cv/efficientnet/src/config.py
+++ b/model_zoo/official/cv/efficientnet/src/config.py
@@ -41,6 +41,7 @@ efficientnet_b0_config_gpu = edict({
     'smoothing': 0.1,
     #Use Tensorflow BatchNorm defaults for models that support it
     'bn_tf': False,
+    'save_checkpoint': True,
     'keep_checkpoint_max': 10,
     'loss_scale': 1024,
     'resume_start_epoch': 0,
diff --git a/model_zoo/official/cv/efficientnet/train.py b/model_zoo/official/cv/efficientnet/train.py
index 5c102648d4..9e3acd2fde 100644
--- a/model_zoo/official/cv/efficientnet/train.py
+++ b/model_zoo/official/cv/efficientnet/train.py
@@ -146,10 +146,14 @@ def main():
     loss_scale_manager = FixedLossScaleManager(
         cfg.loss_scale, drop_overflow_update=False)

-    config_ck = CheckpointConfig(
-        save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max)
-    ckpoint_cb = ModelCheckpoint(
-        prefix=cfg.model, directory=output_dir, config=config_ck)
+    callbacks = [time_cb, loss_cb]
+
+    if cfg.save_checkpoint:
+        config_ck = CheckpointConfig(
+            save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max)
+        ckpoint_cb = ModelCheckpoint(
+            prefix=cfg.model, directory=output_dir, config=config_ck)
+        callbacks += [ckpoint_cb]

     lr = Tensor(get_lr(base_lr=cfg.lr, total_epochs=cfg.epochs, steps_per_epoch=batches_per_epoch,
                        decay_steps=cfg.decay_epochs, decay_rate=cfg.decay_rate,
@@ -176,7 +180,7 @@ def main():
         amp_level=cfg.amp_level
     )

-    callbacks = [loss_cb, ckpoint_cb, time_cb] if is_master else []
+    callbacks = callbacks if is_master else []

     if args.resume:
         real_epoch = cfg.epochs - cfg.resume_start_epoch
diff --git a/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh
index 0ddfde76b9..ba6253c07d 100755
--- a/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh
+++ b/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh
@@ -14,5 +14,5 @@
 # limitations under the License.
 # ============================================================================
 DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &