!6188 fix loss print of bert, add attention about bind cores in README

Merge pull request !6188 from chenhaozhe/fix-bert-loss-print
pull/6188/MERGE
mindspore-ci-bot 4 years ago committed by Gitee
commit 105422fbb9

@@ -399,6 +399,9 @@ epoch: 0.0, current epoch percent: 0.002, step: 200, outputs are (Tensor(shape=[1
 ...
 ```
+
+> **Attention** This will bind processor cores according to `device_num` and the total number of processor cores. If you do not want pretraining to bind processor cores, remove the `taskset` related operations in `scripts/ascend_distributed_launcher/get_distribute_pretrain_cmd.py`.
+
 ## [Evaluation Process](#contents)
 ### Evaluation
 #### evaluation on cola dataset when running on Ascend
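The core binding mentioned in the note above is easiest to see as arithmetic over the host's cores. Below is a minimal, hypothetical sketch (illustrative helper names, not the actual contents of `get_distribute_pretrain_cmd.py`) of how a launcher might split the available cores evenly across `device_num` ranks and prefix each per-device command with `taskset -c`; removing that prefix is what disables the binding.

```python
# Hypothetical sketch, not the real launcher code: split the host's cores
# evenly across `device_num` ranks and pin each rank's command with taskset.
import multiprocessing


def bind_cores_cmd(rank_id, device_num, base_cmd):
    """Return base_cmd wrapped with a taskset core range for this rank."""
    total_cores = multiprocessing.cpu_count()
    cores_per_rank = total_cores // device_num   # cores given to each device
    start = rank_id * cores_per_rank
    end = start + cores_per_rank - 1             # inclusive upper bound
    return "taskset -c {}-{} {}".format(start, end, base_cmd)


if __name__ == "__main__":
    # e.g. 96 cores and 8 devices -> rank 2 runs on cores 24-35
    print(bind_cores_cmd(rank_id=2, device_num=8, base_cmd="python run_pretrain.py"))
```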

@@ -78,7 +78,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)

 def eval_result_print(assessment_method="accuracy", callback=None):

@@ -79,7 +79,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)

 def eval_result_print(assessment_method="accuracy", callback=None):

@@ -81,7 +81,7 @@ def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoin
     update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
     netwithgrads = BertSquadCell(network, optimizer=optimizer, scale_update_cell=update_cell)
     model = Model(netwithgrads)
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
     model.train(epoch_num, dataset, callbacks=callbacks)

@@ -141,14 +141,18 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print the loss every given number of steps. Default: 1.
     """
-    def __init__(self, dataset_size=1):
+    def __init__(self, dataset_size=-1):
         super(LossCallBack, self).__init__()
         self._dataset_size = dataset_size
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
-        print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
-              .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+        if self._dataset_size > 0:
+            percent, epoch_num = math.modf(cb_params.cur_step_num / self._dataset_size)
+            print("epoch: {}, current epoch percent: {}, step: {}, outputs are {}"
+                  .format(epoch_num, "%.3f" % percent, cb_params.cur_step_num, str(cb_params.net_outputs)))
+        else:
+            print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
+                                                               str(cb_params.net_outputs)))

 def LoadNewestCkpt(load_finetune_checkpoint_dir, steps_per_epoch, epoch_num, prefix):
     """

@@ -1,19 +1,34 @@
 # Contents
 - [Contents](#contents)
 - [TinyBERT Description](#tinybert-description)
 - [Model Architecture](#model-architecture)
 - [Dataset](#dataset)
 - [Environment Requirements](#environment-requirements)
 - [Quick Start](#quick-start)
 - [Script Description](#script-description)
-- [Script and Sample Code](#script-and-sample-code)
-- [Script Parameters](#script-parameters)
-- [Dataset Preparation](#dataset-preparation)
-- [Training Process](#training-process)
-- [Evaluation Process](#evaluation-process)
-- [Model Description](#model-description)
-- [Performance](#performance)
-- [Training Performance](#training-performance)
-- [Evaluation Performance](#evaluation-performance)
+    - [Script and Sample Code](#script-and-sample-code)
+    - [Script Parameters](#script-parameters)
+        - [General Distill](#general-distill)
+        - [Task Distill](#task-distill)
+    - [Options and Parameters](#options-and-parameters)
+        - [Options:](#options)
+        - [Parameters:](#parameters)
+    - [Training Process](#training-process)
+        - [Training](#training)
+            - [running on Ascend](#running-on-ascend)
+            - [running on GPU](#running-on-gpu)
+        - [Distributed Training](#distributed-training)
+            - [running on Ascend](#running-on-ascend-1)
+            - [running on GPU](#running-on-gpu-1)
+    - [Evaluation Process](#evaluation-process)
+        - [Evaluation](#evaluation)
+            - [evaluation on SST-2 dataset](#evaluation-on-sst-2-dataset)
+            - [evaluation on MNLI dataset](#evaluation-on-mnli-dataset)
+            - [evaluation on QNLI dataset](#evaluation-on-qnli-dataset)
+- [Model Description](#model-description)
+    - [Performance](#performance)
+        - [training Performance](#training-performance)
+        - [Inference Performance](#inference-performance)
+- [Description of Random Situation](#description-of-random-situation)
+- [ModelZoo Homepage](#modelzoo-homepage)
@@ -244,6 +259,8 @@ epoch: 2, step: 200, outputs are (Tensor(shape=[1], dtype=Float32, 30.1724), Tens
 ...
 ```
+
+> **Attention** This will bind processor cores according to `device_num` and the total number of processor cores. If you do not want training to bind processor cores, remove the `taskset` related operations in `scripts/run_distributed_gd_ascend.sh`.
 #### running on GPU
 Before running the command below, please make sure `load_teacher_ckpt_path`, `data_dir`, `schma_dir` and `device_target=GPU` have been set. Please set the paths to absolute full paths, e.g. "/username/checkpoint_100_300.ckpt".
 ```
