From e1f4c066b3cdcb94a2f7df92c1292ba380e82468 Mon Sep 17 00:00:00 2001
From: chenhaozhe
Date: Mon, 14 Sep 2020 14:31:03 +0800
Subject: [PATCH] fix loss print in bert and corresponding downstream tasks

---
 model_zoo/official/nlp/bert/README.md         |  3 ++
 model_zoo/official/nlp/bert/run_classifier.py |  2 +-
 model_zoo/official/nlp/bert/run_ner.py        |  2 +-
 model_zoo/official/nlp/bert/run_squad.py      |  2 +-
 model_zoo/official/nlp/bert/src/utils.py      | 12 ++++---
 model_zoo/official/nlp/tinybert/README.md     | 35 ++++++++++++++-----
 6 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index 6ca9162ddc..ae80e442a1 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -399,6 +399,9 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outpus are (Tensor(shape=[1
 ...
 ```
 
+> **Attention** This binds each training process to a set of processor cores, based on `device_num` and the total number of available cores. If you do not want to bind processor cores when running pretraining, remove the `taskset`-related operations in `scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py`.
+
+
 ## [Evaluation Process](#contents)
 ### Evaluation
 #### evaluation on cola dataset when running on Ascend
diff --git a/model_zoo/official/nlp/bert/run_classifier.py b/model_zoo/official/nlp/bert/run_classifier.py
index bb2cc5ec15..236e6947bb 100644
--- a/model_zoo/official/nlp/bert/run_classifier.py
+++ b/model_zoo/official/nlp/bert/run_classifier.py
@@ -78,7 +78,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)
 
 def eval_result_print(assessment_method="accuracy", callback=None):
diff --git a/model_zoo/official/nlp/bert/run_ner.py b/model_zoo/official/nlp/bert/run_ner.py
index 704be721ea..33d272c373 100644
--- a/model_zoo/official/nlp/bert/run_ner.py
+++ b/model_zoo/official/nlp/bert/run_ner.py
@@ -79,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)
 
 def eval_result_print(assessment_method="accuracy", callback=None):
diff --git a/model_zoo/official/nlp/bert/run_squad.py b/model_zoo/official/nlp/bert/run_squad.py
index 82859229d3..bd45ffcc1f 100644
--- a/model_zoo/official/nlp/bert/run_squad.py
+++ b/model_zoo/official/nlp/bert/run_squad.py
@@ -81,7 +81,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)
 
 
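The three call-site changes above share one root cause: `LossCallBack` used to default to `dataset_size=1`, so `math.modf(cur_step_num / 1)` returned a zero fractional part and reported the raw step count as the epoch number in the fine-tuning logs. A minimal sketch, using made-up step and dataset-size values, of what each variant prints:

```python
# Illustrative numbers only (hypothetical step counter and dataset size); the
# arithmetic mirrors LossCallBack.step_end as defined in src/utils.py.
import math

cur_step_num = 250   # hypothetical global step
dataset_size = 100   # hypothetical steps per epoch

# Old behaviour: LossCallBack() left dataset_size at its default of 1, so the
# "epoch" field was just the step count and the percentage was always 0.
percent, epoch_num = math.modf(cur_step_num / 1)
print("epoch: {}, current epoch percent: {:.3f}, step: {}".format(epoch_num, percent, cur_step_num))
# -> epoch: 250.0, current epoch percent: 0.000, step: 250

# New call sites pass dataset.get_dataset_size(), restoring the intended split.
percent, epoch_num = math.modf(cur_step_num / dataset_size)
print("epoch: {}, current epoch percent: {:.3f}, step: {}".format(epoch_num, percent, cur_step_num))
# -> epoch: 2.0, current epoch percent: 0.500, step: 250
```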
diff --git a/model_zoo/official/nlp/bert/src/utils.py b/model_zoo/official/nlp/bert/src/utils.py
index 56422a07df..77c71d2b88 100644
--- a/model_zoo/official/nlp/bert/src/utils.py
+++ b/model_zoo/official/nlp/bert/src/utils.py
@@ -141,14 +141,18 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, dataset_size=1):
+    def __init__(self, dataset_size=-1):
         super(LossCallBack, self).__init__()
         self._dataset_size = dataset_size
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
-        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
-              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+        if self._dataset_size > 0:
+            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+                  .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+        else:
+            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs)))
 
 def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
     """
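For reference, a self-contained sketch of the new control flow in `step_end`; the stub `run_context` object, its field values, and the class name are hypothetical, while the branch on `self._dataset_size` mirrors the patch above. Callers that construct `LossCallBack()` without an argument now fall back to the framework-tracked `cur_epoch_num` instead of dividing by a meaningless default.

```python
# Hypothetical stand-ins for MindSpore's RunContext/cb_params; only the
# dataset_size branching is taken from the patched LossCallBack.step_end.
import math
from types import SimpleNamespace

class LossCallBackSketch:
    def __init__(self, dataset_size=-1):
        self._dataset_size = dataset_size

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        if self._dataset_size > 0:
            # Finetune scripts pass dataset.get_dataset_size(), so epoch number
            # and intra-epoch progress can be derived from the step counter.
            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
                  .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, cb_params.net_outputs))
        else:
            # No dataset size supplied: report the epoch counter the framework
            # already tracks instead of a derived (and previously wrong) value.
            print("epoch: {}, step: {}, outputs are {}"
                  .format(cb_params.cur_epoch_num, cb_params.cur_step_num, cb_params.net_outputs))

# Fake run_context carrying the fields the callback reads.
params = SimpleNamespace(cur_step_num=250, cur_epoch_num=3, net_outputs=1.234)
ctx = SimpleNamespace(original_args=lambda: params)

LossCallBackSketch(dataset_size=100).step_end(ctx)  # epoch: 2.0, current epoch percent: 0.500, ...
LossCallBackSketch().step_end(ctx)                  # epoch: 3, step: 250, ...
```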
diff --git a/model_zoo/official/nlp/tinybert/README.md b/model_zoo/official/nlp/tinybert/README.md
index 3309fd773d..fcaaa42472 100644
--- a/model_zoo/official/nlp/tinybert/README.md
+++ b/model_zoo/official/nlp/tinybert/README.md
@@ -1,19 +1,34 @@
 # Contents
 
+- [Contents](#contents)
 - [TinyBERT Description](#tinybert-description)
 - [Model Architecture](#model-architecture)
 - [Dataset](#dataset)
 - [Environment Requirements](#environment-requirements)
 - [Quick Start](#quick-start)
 - [Script Description](#script-description)
-    - [Script and Sample Code](#script-and-sample-code)
-    - [Script Parameters](#script-parameters)
-    - [Dataset Preparation](#dataset-preparation)
-    - [Training Process](#training-process)
-    - [Evaluation Process](#evaluation-process)
-- [Model Description](#model-description)
-    - [Performance](#performance)
-        - [Training Performance](#training-performance)
-        - [Evaluation Performance](#evaluation-performance)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Script Parameters](#script-parameters)
+        - [General Distill](#general-distill)
+        - [Task Distill](#task-distill)
+    - [Options and Parameters](#options-and-parameters)
+        - [Options:](#options)
+        - [Parameters:](#parameters)
+    - [Training Process](#training-process)
+        - [Training](#training)
+            - [running on Ascend](#running-on-ascend)
+            - [running on GPU](#running-on-gpu)
+        - [Distributed Training](#distributed-training)
+            - [running on Ascend](#running-on-ascend-1)
+            - [running on GPU](#running-on-gpu-1)
+    - [Evaluation Process](#evaluation-process)
+        - [Evaluation](#evaluation)
+            - [evaluation on SST-2 dataset](#evaluation-on-sst-2-dataset)
+            - [evaluation on MNLI dataset](#evaluation-on-mnli-dataset)
+            - [evaluation on QNLI dataset](#evaluation-on-qnli-dataset)
+    - [Model Description](#model-description)
+    - [Performance](#performance)
+        - [training Performance](#training-performance)
+        - [Inference Performance](#inference-performance)
 - [Description of Random Situation](#description-of-random-situation)
 - [ModelZoo Homepage](#modelzoo-homepage)
@@ -244,6 +259,8 @@ epoch: 2, step: 200, outpus are (Tensor(shape=[1], dtype=Float32, 30.1724), Tens
 ...
 ```
 
+> **Attention** This binds each training process to a set of processor cores, based on `device_num` and the total number of available cores. If you do not want to bind processor cores when running pretraining, remove the `taskset`-related operations in `scripts/run_distributed_gd_ascend.sh`.
+
 #### running on GPU
 Before running the command below, please check `load_teacher_ckpt_path`, `data_dir` `schma_dir` and `device_target=GPU` has been set. Please set the path to be the absolute full path, e.g:"/username/checkpoint_100_300.ckpt".
 ```