From 3988376b67de327b9028e78fe1f4c9d1b1586642 Mon Sep 17 00:00:00 2001 From: hanhuifeng2020 Date: Fri, 18 Dec 2020 14:20:50 +0800 Subject: [PATCH] Performance optimization of Bert on GPU by the graph_kernel --- akg | 2 +- model_zoo/official/nlp/bert/run_pretrain.py | 6 ++-- .../run_distributed_pretrain_for_gpu.sh | 28 +++++++++---------- .../run_standalone_pretrain_for_gpu.sh | 2 +- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/akg b/akg index 72b359ad45..065ca53530 160000 --- a/akg +++ b/akg @@ -1 +1 @@ -Subproject commit 72b359ad457ed8f4f254c8a3bd2bde88967202fb +Subproject commit 065ca5353077903828bebc1baedd4d1c0b052bb6 diff --git a/model_zoo/official/nlp/bert/run_pretrain.py b/model_zoo/official/nlp/bert/run_pretrain.py index 13366f2553..ccbb34d57a 100644 --- a/model_zoo/official/nlp/bert/run_pretrain.py +++ b/model_zoo/official/nlp/bert/run_pretrain.py @@ -94,7 +94,8 @@ def _get_optimizer(args_opt, network): def _auto_enable_graph_kernel(device_target, graph_kernel_mode): """Judge whether is suitable to enable graph kernel.""" return graph_kernel_mode in ("auto", "true") and device_target == 'GPU' and \ - cfg.bert_network == 'base' and cfg.batch_size == 32 and cfg.optimizer == 'AdamWeightDecay' + cfg.bert_network == 'base' and (cfg.batch_size == 32 or cfg.batch_size == 64) and \ + cfg.optimizer == 'AdamWeightDecay' def run_pretrain(): @@ -148,7 +149,8 @@ def run_pretrain(): context.reset_auto_parallel_context() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) - _set_bert_all_reduce_split() + if args_opt.device_target == 'Ascend': + _set_bert_all_reduce_split() else: rank = 0 device_num = 1 diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh index 9fbc156b66..ff54a331db 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh @@ -27,18 +27,18 @@ DATA_DIR=$3 SCHEMA_DIR=$4 mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ - python run_pretrain.py \ - --device_target="GPU" \ - --distribute="true" \ - --epoch_size=$EPOCH_SIZE \ - --enable_save_ckpt="true" \ - --enable_lossscale="false" \ - --do_shuffle="true" \ - --enable_data_sink="true" \ - --data_sink_steps=1 \ - --load_checkpoint_path="" \ - --save_checkpoint_steps=10000 \ - --save_checkpoint_num=1 \ - --data_dir=$DATA_DIR \ - --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & + python run_pretrain.py \ + --device_target="GPU" \ + --distribute="true" \ + --epoch_size=$EPOCH_SIZE \ + --enable_save_ckpt="true" \ + --enable_lossscale="false" \ + --do_shuffle="true" \ + --enable_data_sink="true" \ + --data_sink_steps=20 \ + --load_checkpoint_path="" \ + --save_checkpoint_steps=10000 \ + --save_checkpoint_num=1 \ + --data_dir=$DATA_DIR \ + --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & diff --git a/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh index 1e9f1ec3e7..bd42ebf744 100644 --- a/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh +++ b/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh @@ -39,7 +39,7 @@ python run_pretrain.py \ --enable_lossscale="false" \ --do_shuffle="true" \ --enable_data_sink="true" \ - --data_sink_steps=1 \ + --data_sink_steps=20 \ --load_checkpoint_path="" \ --save_checkpoint_path="" \ --save_checkpoint_steps=10000 \