From 3988376b67de327b9028e78fe1f4c9d1b1586642 Mon Sep 17 00:00:00 2001
From: hanhuifeng2020 <hanhuifeng1@huawei.com>
Date: Fri, 18 Dec 2020 14:20:50 +0800
Subject: [PATCH] Optimize BERT performance on GPU with graph kernel

---
 akg                                           |  2 +-
 model_zoo/official/nlp/bert/run_pretrain.py   |  6 ++--
 .../run_distributed_pretrain_for_gpu.sh       | 28 +++++++++----------
 .../run_standalone_pretrain_for_gpu.sh        |  2 +-
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/akg b/akg
index 72b359ad45..065ca53530 160000
--- a/akg
+++ b/akg
@@ -1 +1 @@
-Subproject commit 72b359ad457ed8f4f254c8a3bd2bde88967202fb
+Subproject commit 065ca5353077903828bebc1baedd4d1c0b052bb6
diff --git a/model_zoo/official/nlp/bert/run_pretrain.py b/model_zoo/official/nlp/bert/run_pretrain.py
index 13366f2553..ccbb34d57a 100644
--- a/model_zoo/official/nlp/bert/run_pretrain.py
+++ b/model_zoo/official/nlp/bert/run_pretrain.py
@@ -94,7 +94,8 @@ def _get_optimizer(args_opt, network):
 def _auto_enable_graph_kernel(device_target, graph_kernel_mode):
     """Judge whether is suitable to enable graph kernel."""
     return graph_kernel_mode in ("auto", "true") and device_target == 'GPU' and \
-        cfg.bert_network == 'base' and cfg.batch_size == 32 and cfg.optimizer == 'AdamWeightDecay'
+        cfg.bert_network == 'base' and cfg.batch_size in (32, 64) and \
+        cfg.optimizer == 'AdamWeightDecay'
 
 
 def run_pretrain():
@@ -148,7 +149,8 @@ def run_pretrain():
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                           device_num=device_num)
-        _set_bert_all_reduce_split()
+        if args_opt.device_target == 'Ascend':
+            _set_bert_all_reduce_split()
     else:
         rank = 0
         device_num = 1
diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
index 9fbc156b66..ff54a331db 100644
--- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh
@@ -27,18 +27,18 @@ DATA_DIR=$3
 SCHEMA_DIR=$4
 
 mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
-	python run_pretrain.py				\
-		--device_target="GPU"			\
-		--distribute="true"				\
-		--epoch_size=$EPOCH_SIZE		\
-		--enable_save_ckpt="true"		\
-		--enable_lossscale="false"		\
-		--do_shuffle="true"				\
-		--enable_data_sink="true"		\
-		--data_sink_steps=1				\
-		--load_checkpoint_path=""			\
-		--save_checkpoint_steps=10000	\
-		--save_checkpoint_num=1			\
-		--data_dir=$DATA_DIR			\
-		--schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
+  python run_pretrain.py \
+    --device_target="GPU" \
+    --distribute="true" \
+    --epoch_size="$EPOCH_SIZE" \
+    --enable_save_ckpt="true" \
+    --enable_lossscale="false" \
+    --do_shuffle="true" \
+    --enable_data_sink="true" \
+    --data_sink_steps=20 \
+    --load_checkpoint_path="" \
+    --save_checkpoint_steps=10000 \
+    --save_checkpoint_num=1 \
+    --data_dir="$DATA_DIR" \
+    --schema_dir="$SCHEMA_DIR" > log.txt 2>&1 &
 
diff --git a/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh
index 1e9f1ec3e7..bd42ebf744 100644
--- a/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_standalone_pretrain_for_gpu.sh
@@ -39,7 +39,7 @@ python run_pretrain.py  \
     --enable_lossscale="false" \
     --do_shuffle="true" \
     --enable_data_sink="true" \
-    --data_sink_steps=1 \
+    --data_sink_steps=20 \
     --load_checkpoint_path="" \
     --save_checkpoint_path="" \
     --save_checkpoint_steps=10000 \