@@ -53,6 +53,7 @@ __all__ = [
     'yolo_box',
     'box_clip',
     'multiclass_nms',
+    'retinanet_detection_output',
    'distribute_fpn_proposals',
    'box_decoder_and_assign',
    'collect_fpn_proposals',
@@ -2548,6 +2549,113 @@ def box_clip(input, im_info, name=None):
     return output
 
 
+def retinanet_detection_output(bboxes,
+                               scores,
+                               anchors,
+                               im_info,
+                               score_threshold=0.05,
+                               nms_top_k=1000,
+                               keep_top_k=100,
+                               nms_threshold=0.3,
+                               nms_eta=1.):
+    """
+    **Detection Output Layer for RetinaNet.**
+
+    This operation gets the detection results by performing the following
+    steps:
+
+    1. Decode the top-scoring bounding box predictions per FPN level
+       according to the anchor boxes.
+    2. Merge the top predictions from all levels and apply multi-class
+       non-maximum suppression (NMS) to them to get the final detections.
+
+    Args:
+        bboxes(List): A list of tensors from multiple FPN levels. Each
+            element is a 3-D Tensor with shape [N, Mi, 4] representing the
+            predicted locations of Mi bounding boxes. N is the batch size,
+            Mi is the number of bounding boxes from the i-th FPN level, and
+            each bounding box has four coordinate values in the layout
+            [xmin, ymin, xmax, ymax].
+        scores(List): A list of tensors from multiple FPN levels. Each
+            element is a 3-D Tensor with shape [N, Mi, C] representing the
+            predicted confidence scores. N is the batch size, C is the
+            number of classes (excluding background), and Mi is the number
+            of bounding boxes from the i-th FPN level. Each bounding box
+            has C scores in total.
+        anchors(List): A list of tensors from multiple FPN levels. Each
+            element is a 2-D Tensor with shape [Mi, 4] representing the
+            locations of Mi anchor boxes from the i-th FPN level. Each
+            anchor box has four coordinate values in the layout
+            [xmin, ymin, xmax, ymax].
+        im_info(Variable): A 2-D LoDTensor with shape [N, 3] representing
+            the image information. N is the batch size, and each image's
+            information consists of its height, width and scale.
+        score_threshold(float): Threshold to filter out bounding boxes
+            whose confidence score is below this value.
+        nms_top_k(int): Maximum number of detections per FPN level to be
+            kept according to the confidence scores before NMS.
+        keep_top_k(int): Number of total bounding boxes to be kept per
+            image after the NMS step. -1 means keeping all bounding boxes
+            after the NMS step.
+        nms_threshold(float): The IoU threshold to be used in NMS.
+        nms_eta(float): The parameter for adaptive NMS.
+
+    Returns:
+        Variable:
+            The detection output is a LoDTensor with shape [No, 6].
+            Each row has six values: [label, confidence, xmin, ymin, xmax,
+            ymax]. `No` is the total number of detections in this
+            mini-batch. The offsets of the first dimension are the LoD,
+            which has N + 1 entries, where N is the batch size. The i-th
+            image has `LoD[i + 1] - LoD[i]` detected results; if the
+            difference is 0, the i-th image has no detected results. If no
+            image has any detected results, the LoD is set to 0 and the
+            output tensor is empty (None).
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+
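+            # Two FPN levels are simulated by reusing the same tensors.
+            # Shapes follow the Args section above: bboxes is [N, Mi, 4],
+            # scores is [N, Mi, C], anchors is [Mi, 4], im_info is [N, 3].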
+            bboxes = fluid.layers.data(name='bboxes', shape=[1, 21, 4],
+                append_batch_size=False, dtype='float32')
+            scores = fluid.layers.data(name='scores', shape=[1, 21, 10],
+                append_batch_size=False, dtype='float32')
+            anchors = fluid.layers.data(name='anchors', shape=[21, 4],
+                append_batch_size=False, dtype='float32')
+            im_info = fluid.layers.data(name="im_info", shape=[1, 3],
+                append_batch_size=False, dtype='float32')
+            nmsed_outs = fluid.layers.retinanet_detection_output(
+                bboxes=[bboxes, bboxes],
+                scores=[scores, scores],
+                anchors=[anchors, anchors],
+                im_info=im_info,
+                score_threshold=0.05,
+                nms_top_k=1000,
+                keep_top_k=100,
+                nms_threshold=0.3,
+                nms_eta=1.)
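+
+            # nmsed_outs is a LoDTensor with shape [No, 6]; each row is
+            # [label, confidence, xmin, ymin, xmax, ymax].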
+    """
+
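+    # Create the output variable; its dtype is inferred from the 'scores'
+    # input.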
+    helper = LayerHelper('retinanet_detection_output', **locals())
+    output = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('scores'))
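+    # Append the underlying 'retinanet_detection_output' op, which decodes
+    # the per-level predictions against the anchors and then applies
+    # multi-class NMS, as described in the docstring above.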
+    helper.append_op(
+        type="retinanet_detection_output",
+        inputs={
+            'BBoxes': bboxes,
+            'Scores': scores,
+            'Anchors': anchors,
+            'ImInfo': im_info
+        },
+        attrs={
+            'score_threshold': score_threshold,
+            'nms_top_k': nms_top_k,
+            'nms_threshold': nms_threshold,
+            'keep_top_k': keep_top_k,
+            'nms_eta': nms_eta,
+        },
+        outputs={'Out': output})
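+    # Detections are a post-processing result, so no gradient flows back.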
+    output.stop_gradient = True
+    return output
+
+
 def multiclass_nms(bboxes,
                    scores,
                    score_threshold,