diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index 326c527700..0fa73c5012 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -375,11 +375,11 @@ epoch: 5 step: 5004, loss is 3.3501816 ```bash # ========START RESNET50 GPU BENCHMARK======== -Epoch time: 12416.098 ms, fps: 412 img/sec. epoch: 1 step: 20, loss is 6.940182 -Epoch time: 3472.037 ms, fps: 1474 img/sec. epoch: 2 step: 20, loss is 7.078993 -Epoch time: 3469.523 ms, fps: 1475 img/sec. epoch: 3 step: 20, loss is 7.559594 -Epoch time: 3460.311 ms, fps: 1479 img/sec. epoch: 4 step: 20, loss is 6.920937 -Epoch time: 3460.543 ms, fps: 1479 img/sec. epoch: 5 step: 20, loss is 6.814013 +epoch: [0/1] step: [20/5004], loss is 6.940182 Epoch time: 12416.098 ms, fps: 412 img/sec. +epoch: [0/1] step: [40/5004], loss is 7.078993 Epoch time: 3438.972 ms, fps: 1488 img/sec. +epoch: [0/1] step: [60/5004], loss is 7.559594 Epoch time: 3431.516 ms, fps: 1492 img/sec. +epoch: [0/1] step: [80/5004], loss is 6.920937 Epoch time: 3435.777 ms, fps: 1490 img/sec. +epoch: [0/1] step: [100/5004], loss is 6.814013 Epoch time: 3437.154 ms, fps: 1489 img/sec. ... 
``` diff --git a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py index 577c8dbcec..9498ff8027 100644 --- a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py +++ b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py @@ -53,10 +53,11 @@ set_seed(1) class MyTimeMonitor(Callback): - def __init__(self, batch_size, sink_size): + def __init__(self, batch_size, sink_size, dataset_size): super(MyTimeMonitor, self).__init__() self.batch_size = batch_size self.size = sink_size + self.data_size = dataset_size def step_begin(self, run_context): self.step_time = time.time() @@ -72,14 +73,16 @@ class MyTimeMonitor(Callback): if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray): loss = np.mean(loss.asnumpy()) - cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1 + cur_epoch_num = int(cb_params.cur_epoch_num / (self.data_size / self.size)) + cur_step_in_epoch = int(self.size * (cb_params.cur_epoch_num % (self.data_size / self.size))) if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)): raise ValueError("epoch: {} step: {}. 
Invalid loss, terminating training.".format( - cb_params.cur_epoch_num, cur_step_in_epoch)) + cur_epoch_num, cur_step_in_epoch)) step_mseconds = (time.time() - self.step_time) * 1000 fps = self.batch_size / step_mseconds * 1000 * self.size - print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss), + print("epoch: [%s/%s] step: [%s/%s], loss is %s" % (cur_epoch_num, int(cb_params.epoch_num /\ + (self.data_size / self.size)), cur_step_in_epoch, self.data_size, loss), "Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True) @@ -214,7 +217,7 @@ def train(): # define callbacks if mode == context.PYNATIVE_MODE: print_per_steps = 1 - time_cb = MyTimeMonitor(total_batch, print_per_steps) + time_cb = MyTimeMonitor(total_batch, print_per_steps, step_size) cb = [time_cb] if save_ckpt: config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size, keep_checkpoint_max=5)