From e1f4c066b3cdcb94a2f7df92c1292ba380e82468 Mon Sep 17 00:00:00 2001
From: chenhaozhe
Date: Mon, 14 Sep 2020 14:31:03 +0800
Subject: [PATCH] fix loss print in bert and corresponding downstream tasks

---
 model_zoo/official/nlp/bert/README.md         |  3 ++
 model_zoo/official/nlp/bert/run_classifier.py |  2 +-
 model_zoo/official/nlp/bert/run_ner.py        |  2 +-
 model_zoo/official/nlp/bert/run_squad.py      |  2 +-
 model_zoo/official/nlp/bert/src/utils.py      | 12 ++++---
 model_zoo/official/nlp/tinybert/README.md     | 35 ++++++++++++++-----
 6 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index 6ca9162ddc..ae80e442a1 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -399,6 +399,9 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outpus are (Tensor(shape=[1
 ...
 ```
 
+> **Attention** This binds each training process to a set of processor cores, based on `device_num` and the total number of available cores. If you do not want to bind processor cores when running pretraining, remove the `taskset`-related operations in `scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py`.
+
+
 ## [Evaluation Process](#contents)
 ### Evaluation
 #### evaluation on cola dataset when running on Ascend
diff --git a/model_zoo/official/nlp/bert/run_classifier.py b/model_zoo/official/nlp/bert/run_classifier.py
index bb2cc5ec15..236e6947bb 100644
--- a/model_zoo/official/nlp/bert/run_classifier.py
+++ b/model_zoo/official/nlp/bert/run_classifier.py
@@ -78,7 +78,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)
 
 def eval_result_print(assessment_method="accuracy", callback=None):
diff --git a/model_zoo/official/nlp/bert/run_ner.py b/model_zoo/official/nlp/bert/run_ner.py
index 704be721ea..33d272c373 100644
--- a/model_zoo/official/nlp/bert/run_ner.py
+++ b/model_zoo/official/nlp/bert/run_ner.py
@@ -79,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)
 
 def eval_result_print(assessment_method="accuracy", callback=None):
diff --git a/model_zoo/official/nlp/bert/run_squad.py b/model_zoo/official/nlp/bert/run_squad.py
index 82859229d3..bd45ffcc1f 100644
--- a/model_zoo/official/nlp/bert/run_squad.py
+++ b/model_zoo/official/nlp/bert/run_squad.py
@@ -81,7 +81,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)
 
 
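The three call-site changes above share one root cause: `LossCallBack` used to default to `dataset_size=1`, so `math.modf(cur_step_num / 1)` returned a zero fractional part and reported the raw step count as the epoch number in the fine-tuning logs. A minimal sketch, using made-up step and dataset-size values, of what each variant prints:

```python
# Illustrative numbers only (hypothetical step counter and dataset size); the
# arithmetic mirrors LossCallBack.step_end as defined in src/utils.py.
import math

cur_step_num = 250   # hypothetical global step
dataset_size = 100   # hypothetical steps per epoch

# Old behaviour: LossCallBack() left dataset_size at its default of 1, so the
# "epoch" field was just the step count and the percentage was always 0.
percent, epoch_num = math.modf(cur_step_num / 1)
print("epoch: {}, current epoch percent: {:.3f}, step: {}".format(epoch_num, percent, cur_step_num))
# -> epoch: 250.0, current epoch percent: 0.000, step: 250

# New call sites pass dataset.get_dataset_size(), restoring the intended split.
percent, epoch_num = math.modf(cur_step_num / dataset_size)
print("epoch: {}, current epoch percent: {:.3f}, step: {}".format(epoch_num, percent, cur_step_num))
# -> epoch: 2.0, current epoch percent: 0.500, step: 250
```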
diff --git a/model_zoo/official/nlp/bert/src/utils.py b/model_zoo/official/nlp/bert/src/utils.py
index 56422a07df..77c71d2b88 100644
--- a/model_zoo/official/nlp/bert/src/utils.py
+++ b/model_zoo/official/nlp/bert/src/utils.py
@@ -141,14 +141,18 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, dataset_size=1):
+    def __init__(self, dataset_size=-1):
         super(LossCallBack, self).__init__()
         self._dataset_size = dataset_size
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
-        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
-              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+        if self._dataset_size > 0:
+            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+                  .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+        else:
+            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs)))
 
 def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
     """
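For reference, a self-contained sketch of the new control flow in `step_end`; the stub `run_context` object, its field values, and the class name are hypothetical, while the branch on `self._dataset_size` mirrors the patch above. Callers that construct `LossCallBack()` without an argument now fall back to the framework-tracked `cur_epoch_num` instead of dividing by a meaningless default.

```python
# Hypothetical stand-ins for MindSpore's RunContext/cb_params; only the
# dataset_size branching is taken from the patched LossCallBack.step_end.
import math
from types import SimpleNamespace

class LossCallBackSketch:
    def __init__(self, dataset_size=-1):
        self._dataset_size = dataset_size

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        if self._dataset_size > 0:
            # Finetune scripts pass dataset.get_dataset_size(), so epoch number
            # and intra-epoch progress can be derived from the step counter.
            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
                  .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, cb_params.net_outputs))
        else:
            # No dataset size supplied: report the epoch counter the framework
            # already tracks instead of a derived (and previously wrong) value.
            print("epoch: {}, step: {}, outputs are {}"
                  .format(cb_params.cur_epoch_num, cb_params.cur_step_num, cb_params.net_outputs))

# Fake run_context carrying the fields the callback reads.
params = SimpleNamespace(cur_step_num=250, cur_epoch_num=3, net_outputs=1.234)
ctx = SimpleNamespace(original_args=lambda: params)

LossCallBackSketch(dataset_size=100).step_end(ctx)  # epoch: 2.0, current epoch percent: 0.500, ...
LossCallBackSketch().step_end(ctx)                  # epoch: 3, step: 250, ...
```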
diff --git a/model_zoo/official/nlp/tinybert/README.md b/model_zoo/official/nlp/tinybert/README.md
index 3309fd773d..fcaaa42472 100644
--- a/model_zoo/official/nlp/tinybert/README.md
+++ b/model_zoo/official/nlp/tinybert/README.md
@@ -1,19 +1,34 @@
 # Contents
 
+- [Contents](#contents)
 - [TinyBERT Description](#tinybert-description)
 - [Model Architecture](#model-architecture)
 - [Dataset](#dataset)
 - [Environment Requirements](#environment-requirements)
 - [Quick Start](#quick-start)
 - [Script Description](#script-description)
-    - [Script and Sample Code](#script-and-sample-code)
-    - [Script Parameters](#script-parameters)
-    - [Dataset Preparation](#dataset-preparation)
-    - [Training Process](#training-process)
-    - [Evaluation Process](#evaluation-process)
-- [Model Description](#model-description)
-    - [Performance](#performance)
-        - [Training Performance](#training-performance)
-        - [Evaluation Performance](#evaluation-performance)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Script Parameters](#script-parameters)
+        - [General Distill](#general-distill)
+        - [Task Distill](#task-distill)
+    - [Options and Parameters](#options-and-parameters)
+        - [Options:](#options)
+        - [Parameters:](#parameters)
+    - [Training Process](#training-process)
+        - [Training](#training)
+            - [running on Ascend](#running-on-ascend)
+            - [running on GPU](#running-on-gpu)
+        - [Distributed Training](#distributed-training)
+            - [running on Ascend](#running-on-ascend-1)
+            - [running on GPU](#running-on-gpu-1)
+    - [Evaluation Process](#evaluation-process)
+        - [Evaluation](#evaluation)
+            - [evaluation on SST-2 dataset](#evaluation-on-sst-2-dataset)
+            - [evaluation on MNLI dataset](#evaluation-on-mnli-dataset)
+            - [evaluation on QNLI dataset](#evaluation-on-qnli-dataset)
+    - [Model Description](#model-description)
+    - [Performance](#performance)
+        - [training Performance](#training-performance)
+        - [Inference Performance](#inference-performance)
 - [Description of Random Situation](#description-of-random-situation)
 - [ModelZoo Homepage](#modelzoo-homepage)
@@ -244,6 +259,8 @@ epoch: 2, step: 200, outpus are (Tensor(shape=[1], dtype=Float32, 30.1724), Tens
 ...
 ```
 
+> **Attention** This binds each training process to a set of processor cores, based on `device_num` and the total number of available cores. If you do not want to bind processor cores when running pretraining, remove the `taskset`-related operations in `scripts/run_distributed_gd_ascend.sh`.
+
 #### running on GPU
 Before running the command below, please check `load_teacher_ckpt_path`, `data_dir` `schma_dir` and `device_target=GPU` has been set. Please set the path to be the absolute full path, e.g:"/username/checkpoint_100_300.ckpt".
 ```