From 12cca89d3de9879cf057f7238808c2f1cd6f7bf9 Mon Sep 17 00:00:00 2001 From: zhanghuiyao <1814619459@qq.com> Date: Mon, 29 Mar 2021 17:54:33 +0800 Subject: [PATCH] Modify yolov4 readme.md file and fix profiler bug --- model_zoo/official/cv/yolov4/README.md | 58 +++++++++++++------------- model_zoo/official/cv/yolov4/train.py | 10 ++--- 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/model_zoo/official/cv/yolov4/README.md b/model_zoo/official/cv/yolov4/README.md index 279de66268..04cfc360ca 100644 --- a/model_zoo/official/cv/yolov4/README.md +++ b/model_zoo/official/cv/yolov4/README.md @@ -2,6 +2,7 @@ - [YOLOv4 Description](#YOLOv4-description) - [Model Architecture](#model-architecture) +- [Pretrain Model](#pretrain-model) - [Dataset](#dataset) - [Environment Requirements](#environment-requirements) - [Quick Start](#quick-start) @@ -33,10 +34,20 @@ Bochkovskiy A, Wang C Y, Liao H Y M. YOLOv4: Optimal Speed and Accuracy of Objec YOLOv4 choose CSPDarknet53 backbone, SPP additional module, PANet path-aggregation neck, and YOLOv4 (anchor based) head as the architecture of YOLOv4. +# [Pretrain Model](#contents) + +YOLOv4 needs a CSPDarknet53 backbone to extract image features for detection. You can get the CSPDarknet53 train script from our modelzoo and modify the backbone structure according to CSPDarknet53 in ```./src.cspdarknet53```. Finally, train it on imagenet2012 to get the CSPDarknet53 pretrain model. +Steps: + +1. Get the resnet50 train script from our modelzoo. +2. Modify the network architecture according to CSPDarknet53 in ```./src.cspdarknet53``` +3. Train CSPDarknet53 on imagenet2012. 
+ # [Dataset](#contents) -Dataset support: [MS COCO] or datasetd with the same format as MS COCO -Annotation support: [MS COCO] or annotation as the same format as MS COCO +Dataset used: [COCO2017](https://cocodataset.org/#download) +Dataset support: [COCO2017] or a dataset with the same format as MS COCO +Annotation support: [COCO2017] or annotation with the same format as MS COCO - The directory structure is as follows, the name of directory and file is user define: @@ -71,10 +82,18 @@ other datasets need to use the same format as MS COCO. # [Quick Start](#contents) -After installing MindSpore via the official website, you can start training and evaluation as follows: +- After installing MindSpore via the official website, you can start training and evaluation as follows: +- Prepare the CSPDarknet53.ckpt and hccl_8p.json files, before running the network. + - Please refer to [Pretrain Model] + + - Generating hccl_8p.json, run the script of model_zoo/utils/hccl_tools/hccl_tools.py. + The following parameter "[0,8)" indicates that the hccl_8p.json file of cards 0 to 7 is generated. + + ``` + python hccl_tools.py --device_num "[0,8)" + ``` ```text -# The cspdarknet53_backbone.ckpt in the follow script is got from cspdarknet53 training like paper. 
# The parameter of training_shape define image shape for network, default is [416, 416], [448, 448], @@ -91,7 +110,7 @@ After installing MindSpore via the official website, you can start training and ``` ```bash -#run training example(1p) by python command +#run training example(1p) by python command (Training with a single scale) python train.py \ --data_dir=./dataset/xxx \ --pretrained_backbone=cspdarknet53_backbone.ckpt \ @@ -105,12 +124,12 @@ python train.py \ ``` ```bash -# standalone training example(1p) by shell script +# standalone training example(1p) by shell script (Training with a single scale) sh run_standalone_train.sh dataset/xxx cspdarknet53_backbone.ckpt ``` ```bash -# For Ascend device, distributed training example(8p) by shell script +# For Ascend device, distributed training example(8p) by shell script (Training with multi scale) sh run_distribute_train.sh dataset/xxx cspdarknet53_backbone.ckpt rank_table_8p.json ``` @@ -119,7 +138,7 @@ sh run_distribute_train.sh dataset/xxx cspdarknet53_backbone.ckpt rank_table_8p. python eval.py \ --data_dir=./dataset/xxx \ --pretrained=yolov4.ckpt \ - --testing_shape=416 > log.txt 2>&1 & + --testing_shape=608 > log.txt 2>&1 & ``` ```bash @@ -280,27 +299,6 @@ sh run_distribute_train.sh dataset/coco2017 cspdarknet53_backbone.ckpt rank_tabl The above shell script will run distribute training in the background. You can view the results through the file train_parallel[X]/log.txt. The loss value will be achieved as follows: -```text -# distribute training result(8p, shape=416) -... 
-2020-10-16 14:58:25,142:INFO:epoch[0], iter[1000], loss:242.509259, 388.73 imgs/sec, lr:0.00032783843926154077 -2020-10-16 14:58:41,320:INFO:epoch[0], iter[1100], loss:228.137516, 395.61 imgs/sec, lr:0.0003605895326472819 -2020-10-16 14:58:57,607:INFO:epoch[0], iter[1200], loss:219.689884, 392.94 imgs/sec, lr:0.00039334059692919254 -2020-10-16 14:59:13,787:INFO:epoch[0], iter[1300], loss:216.173309, 395.56 imgs/sec, lr:0.00042609169031493366 -2020-10-16 14:59:29,969:INFO:epoch[0], iter[1400], loss:234.500610, 395.54 imgs/sec, lr:0.00045884278370067477 -2020-10-16 14:59:46,132:INFO:epoch[0], iter[1500], loss:209.420913, 396.00 imgs/sec, lr:0.0004915939061902463 -2020-10-16 15:00:02,416:INFO:epoch[0], iter[1600], loss:210.953930, 393.04 imgs/sec, lr:0.000524344970472157 -2020-10-16 15:00:18,651:INFO:epoch[0], iter[1700], loss:197.171296, 394.20 imgs/sec, lr:0.0005570960929617286 -2020-10-16 15:00:34,056:INFO:epoch[0], iter[1800], loss:203.928903, 415.47 imgs/sec, lr:0.0005898471572436392 -2020-10-16 15:00:53,680:INFO:epoch[1], iter[1900], loss:191.693561, 326.14 imgs/sec, lr:0.0006225982797332108 -2020-10-16 15:01:10,442:INFO:epoch[1], iter[2000], loss:196.632004, 381.82 imgs/sec, lr:0.0006553493440151215 -2020-10-16 15:01:27,180:INFO:epoch[1], iter[2100], loss:193.813570, 382.43 imgs/sec, lr:0.0006881004082970321 -2020-10-16 15:01:43,736:INFO:epoch[1], iter[2200], loss:176.996778, 386.59 imgs/sec, lr:0.0007208515307866037 -2020-10-16 15:02:00,294:INFO:epoch[1], iter[2300], loss:185.858901, 386.55 imgs/sec, lr:0.0007536025950685143 -... - -``` - ```text # distribute training result(8p, dynamic shape) ... 
@@ -450,7 +448,7 @@ YOLOv4 on 118K images(The annotation and data format must be the same as coco201 | Parameters | YOLOv4 | | -------------------------- | ----------------------------------------------------------- | -| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G | +| Resource | Ascend 910; CPU 2.60GHz, 192cores; Memory, 755G; System, Euleros 2.8;| | uploaded Date | 10/16/2020 (month/day/year) | | MindSpore Version | 1.0.0-alpha | | Dataset | 118K images | diff --git a/model_zoo/official/cv/yolov4/train.py b/model_zoo/official/cv/yolov4/train.py index 19ca07738a..adb4c08463 100644 --- a/model_zoo/official/cv/yolov4/train.py +++ b/model_zoo/official/cv/yolov4/train.py @@ -121,6 +121,9 @@ device_id = int(os.getenv('DEVICE_ID', '0')) context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, device_target=args.device_target, save_graphs=False, device_id=device_id) +if args.need_profiler: + profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True) + # init distributed if args.is_distributed: if args.device_target == "Ascend": @@ -163,9 +166,6 @@ class BuildTrainNetwork(nn.Cell): if __name__ == "__main__": - if args.need_profiler: - profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True) - loss_meter = AverageMeter('loss') context.reset_auto_parallel_context() @@ -224,7 +224,7 @@ if __name__ == "__main__": if args.rank_save_ckpt_flag: # checkpoint save - ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval + ckpt_max_num = 10 ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval, keep_checkpoint_max=ckpt_max_num) save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/') @@ -233,7 +233,7 @@ if __name__ == "__main__": prefix='{}'.format(args.rank)) cb_params = _InternalCallbackParam() cb_params.train_network = network - cb_params.epoch_num = ckpt_max_num + cb_params.epoch_num = args.max_epoch * args.steps_per_epoch 
// args.ckpt_interval cb_params.cur_epoch_num = 1 run_context = RunContext(cb_params) ckpt_cb.begin(run_context)