add target assign operator for supporting retinanet (#17893)

* test=develop add target assign for retinanet * test=develop run ci * test=developp add test_layers * test=develop add APi.spec * test=develop alter round 1 * test=develop alter rpn_target_assign_op.cc * test=develop alter test_rpn_target_assign_op.py * test=develop alter rpn_target_assign_op.cc * test=develop alter API.spec * test=develop alter paddle/fluid/operators/detection/rpn_target_assign_op.cc * test=develop alter rpn_target_assign_op.cc * test=develop alter python/paddle/fluid/layers/detection.py * test=develop alter paddle/fluid/API.spec
6 years ago · 9ed2f936f1
parent 7faf095618
commit 9ed2f936f1
5 changed files with 826 additions and 9 deletions
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -348,6 +348,7 @@ paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'ne
 paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'efae414c1137c7944d6174dd08c5347a'))
 paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '6d5028fd09d01ab82d296adc0ea95aee'))
 paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1e164a56fe9376e18a56d22563d9f801'))
+paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
 paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '82b2aefeeb1b706bc4afec70928a259a'))
 paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'd1ddc75629fedee46f82e631e22c79dc'))
 paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)), ('document', '9c601df88b251f22e9311c52939948cd'))
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@ -39,6 +39,7 @@ __all__ = [
    'detection_output',
    'ssd_loss',
    'rpn_target_assign',
+    'retinanet_target_assign',
    'anchor_generator',
    'roi_perspective_transform',
    'generate_proposal_labels',
@ -57,6 +58,164 @@ __all__ = [
 ]


+def retinanet_target_assign(bbox_pred,
+                            cls_logits,
+                            anchor_box,
+                            anchor_var,
+                            gt_boxes,
+                            gt_labels,
+                            is_crowd,
+                            im_info,
+                            num_classes=1,
+                            positive_overlap=0.5,
+                            negative_overlap=0.4):
+    """
+    **Target Assign Layer for Retinanet .**
+
+    This layer can be, for given the Intersection-over-Union (IoU) overlap
+    between anchors and ground truth boxes, to assign classification and
+    regression targets to each anchor, these target labels are used for training
+    retinanet. Every anchor is assigned with a length :attr:`num_classes`
+    one-hot vector of classification targets, and a 4-vector of box regression
+    targets. The assignment rules are as followed:
+    
+    1. Anchors are assigned to ground-truth boxes when: (i) it has the highest
+    IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher
+    than positive_overlap(0.5) with any ground-truth box.
+    
+    2. Anchors are assigned to background when its IoU ratio is lower than
+    negative_overlap (0.4) for all ground-truth boxes.
+    
+    When an anchor is assigned with a ground-truth box which is the i-th category,
+    the i-th entry in its C vector of targets is set to 1 and all other entries
+    are set to 0. When an anchor is assigned with background, all entries are set
+    to 0. Anchors that are not assigned do not contribute to the training
+    objective. The regression targets are the encoded ground-truth boxes
+    associated with the assigned anchors.
+ 
+    Args:
+        bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+            predicted locations of M bounding bboxes. N is the batch size,
+            and each bounding box has four coordinate values and the layout
+            is [xmin, ymin, xmax, ymax].
+        cls_logits(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            predicted confidence predictions. N is the batch size, C is the
+            number of classes (excluding background), M is number of bounding boxes.
+        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
+            each box is represented as [xmin, ymin, xmax, ymax],
+            [xmin, ymin] is the left top coordinate of the anchor box,
+            if the input is image feature map, they are close to the origin
+            of the coordinate system. [xmax, ymax] is the right bottom
+            coordinate of the anchor box.
+        anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded 
+            variances of anchors.
+        gt_boxes(Variable): The ground-truth bounding boxes (bboxes) are a 2D
+            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            bboxes of mini-batch input.
+        gt_labels(variable): The ground-truth labels are a 2D LoDTensor with
+            shape [Ng, 1], Ng is the total number of ground-truth labels of
+            mini-batch input.
+        is_crowd(Variable): A 1-D LoDTensor which indicates ground-truth is crowd.
+        im_info(Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
+            3 is the height, width and scale.
+        num_classes(int32): The number of classes.
+        positive_overlap(float): Minimum overlap required between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a positive
+            example.
+        negative_overlap(float): Maximum overlap allowed between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a negative
+            examples.
+
+    Returns:
+        tuple:
+               A tuple(predicted_scores, predicted_location, target_label,
+               target_bbox, bbox_inside_weight, fg_num) is returned. The
+               predicted_scores and predicted_location are the predicted result
+               of the retinanet.The target_label and target_bbox are the ground
+               truth, respectively. The predicted_location is a 2D Tensor with
+               shape [F, 4], and the shape of target_bbox is same as the shape of
+               the predicted_location, F is the number of the foreground
+               anchors. The predicted_scores is a 2D Tensor with shape
+               [F + B, C], and the shape of target_label is [F + B, 1], B is the
+               number of the background anchors, the F and B is depends on the
+               input of this operator. Bbox_inside_weight represents whether the
+               predicted location is fake foreground or not and the shape is [F, 4].
+               Fg_num is the foreground number (including fake foreground) which
+               is needed by focal loss.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          bbox_pred = layers.data(name='bbox_pred', shape=[1, 100, 4],
+                            append_batch_size=False, dtype='float32')
+          cls_logits = layers.data(name='cls_logits', shape=[1, 100, 10],
+                            append_batch_size=False, dtype='float32')
+          anchor_box = layers.data(name='anchor_box', shape=[100, 4],
+                            append_batch_size=False, dtype='float32')
+          anchor_var = layers.data(name='anchor_var', shape=[100, 4],
+                            append_batch_size=False, dtype='float32')
+          gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
+                            append_batch_size=False, dtype='float32')
+          gt_labels = layers.data(name='gt_labels', shape=[10, 1],
+                            append_batch_size=False, dtype='float32')
+          is_crowd = fluid.layers.data(name='is_crowd', shape=[1],
+                            append_batch_size=False, dtype='float32')
+          im_info = fluid.layers.data(name='im_infoss', shape=[1, 3],
+                            append_batch_size=False, dtype='float32')
+          loc_pred, score_pred, loc_target, score_target, bbox_inside_weight, fg_num =
+                fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box,
+                anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10)
+
+    """
+
+    helper = LayerHelper('retinanet_target_assign', **locals())
+    # Assign target label to anchors
+    loc_index = helper.create_variable_for_type_inference(dtype='int32')
+    score_index = helper.create_variable_for_type_inference(dtype='int32')
+    target_label = helper.create_variable_for_type_inference(dtype='int32')
+    target_bbox = helper.create_variable_for_type_inference(
+        dtype=anchor_box.dtype)
+    bbox_inside_weight = helper.create_variable_for_type_inference(
+        dtype=anchor_box.dtype)
+    fg_num = helper.create_variable_for_type_inference(dtype='int32')
+    helper.append_op(
+        type="retinanet_target_assign",
+        inputs={
+            'Anchor': anchor_box,
+            'GtBoxes': gt_boxes,
+            'GtLabels': gt_labels,
+            'IsCrowd': is_crowd,
+            'ImInfo': im_info
+        },
+        outputs={
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': target_label,
+            'TargetBBox': target_bbox,
+            'BBoxInsideWeight': bbox_inside_weight,
+            'ForegroundNumber': fg_num
+        },
+        attrs={
+            'positive_overlap': positive_overlap,
+            'negative_overlap': negative_overlap
+        })
+
+    loc_index.stop_gradient = True
+    score_index.stop_gradient = True
+    target_label.stop_gradient = True
+    target_bbox.stop_gradient = True
+    bbox_inside_weight.stop_gradient = True
+    fg_num.stop_gradient = True
+
+    cls_logits = nn.reshape(x=cls_logits, shape=(-1, num_classes))
+    bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
+    predicted_cls_logits = nn.gather(cls_logits, score_index)
+    predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
+
+    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight, fg_num
+
+
 def rpn_target_assign(bbox_pred,
                      cls_logits,
                      anchor_box,
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@ -2024,6 +2024,53 @@ class TestBook(LayerTest):
                trans_std=0.1)
        return (out)

+    def test_retinanet_target_assign(self):
+        with program_guard(fluid.default_main_program(),
+                           fluid.default_startup_program()):
+            bbox_pred = layers.data(
+                name='bbox_pred',
+                shape=[1, 100, 4],
+                append_batch_size=False,
+                dtype='float32')
+            cls_logits = layers.data(
+                name='cls_logits',
+                shape=[1, 100, 10],
+                append_batch_size=False,
+                dtype='float32')
+            anchor_box = layers.data(
+                name='anchor_box',
+                shape=[100, 4],
+                append_batch_size=False,
+                dtype='float32')
+            anchor_var = layers.data(
+                name='anchor_var',
+                shape=[100, 4],
+                append_batch_size=False,
+                dtype='float32')
+            gt_boxes = layers.data(
+                name='gt_boxes',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            gt_labels = layers.data(
+                name='gt_labels',
+                shape=[10, 1],
+                append_batch_size=False,
+                dtype='float32')
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[1],
+                append_batch_size=False,
+                dtype='float32')
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
+                append_batch_size=False,
+                dtype='float32')
+            return (layers.retinanet_target_assign(
+                bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes,
+                gt_labels, is_crowd, im_info, 10))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@ -167,6 +167,105 @@ def rpn_target_assign_in_python(all_anchors,
    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights


+def retinanet_target_assign(anchor_by_gt_overlap, gt_labels, positive_overlap,
+                            negative_overlap):
+    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
+    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
+        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
+
+    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
+    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
+        anchor_by_gt_overlap.shape[1])]
+    anchors_with_max_overlap = np.where(
+        anchor_by_gt_overlap == gt_to_anchor_max)[0]
+
+    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
+    labels[anchors_with_max_overlap] = 1
+    labels[anchor_to_gt_max >= positive_overlap] = 1
+
+    fg_inds = np.where(labels == 1)[0]
+    bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32)
+
+    bg_inds = np.where(anchor_to_gt_max < negative_overlap)[0]
+    enable_inds = bg_inds
+
+    fg_fake_inds = np.array([], np.int32)
+    fg_value = np.array([fg_inds[0]], np.int32)
+    fake_num = 0
+    for bg_id in enable_inds:
+        if bg_id in fg_inds:
+            fake_num += 1
+            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
+    labels[enable_inds] = 0
+
+    bbox_inside_weight[fake_num:, :] = 1
+    fg_inds = np.where(labels == 1)[0]
+    bg_inds = np.where(labels == 0)[0]
+    loc_index = np.hstack([fg_fake_inds, fg_inds])
+    score_index = np.hstack([fg_inds, bg_inds])
+    score_index_tmp = np.hstack([fg_inds])
+    labels = labels[score_index]
+
+    gt_inds = anchor_to_gt_argmax[loc_index]
+    label_inds = anchor_to_gt_argmax[score_index_tmp]
+    labels[0:len(fg_inds)] = np.squeeze(gt_labels[label_inds])
+    fg_num = len(fg_fake_inds) + len(fg_inds) + 1
+    assert not np.any(labels == -1), "Wrong labels with -1"
+    return loc_index, score_index, labels, gt_inds, bbox_inside_weight, fg_num
+
+
+def retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels,
+                                      is_crowd, im_info, lod, positive_overlap,
+                                      negative_overlap):
+    anchor_num = all_anchors.shape[0]
+    batch_size = len(lod) - 1
+    for i in range(batch_size):
+        im_scale = im_info[i][2]
+
+        inds_inside = np.arange(all_anchors.shape[0])
+        inside_anchors = all_anchors
+        b, e = lod[i], lod[i + 1]
+        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
+        gt_labels_slice = gt_labels[b:e, :]
+        is_crowd_slice = is_crowd[b:e]
+
+        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
+        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
+        gt_labels_slice = gt_labels_slice[not_crowd_inds]
+        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
+
+        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight, fg_num = \
+                         retinanet_target_assign(iou, gt_labels_slice,
+                                                positive_overlap, negative_overlap)
+        # unmap to all anchor
+        loc_inds = inds_inside[loc_inds]
+        score_inds = inds_inside[score_inds]
+
+        sampled_gt = gt_boxes_slice[gt_inds]
+        sampled_anchor = all_anchors[loc_inds]
+        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
+
+        if i == 0:
+            loc_indexes = loc_inds
+            score_indexes = score_inds
+            tgt_labels = labels
+            tgt_bboxes = box_deltas
+            bbox_inside_weights = bbox_inside_weight
+            fg_nums = [[fg_num]]
+        else:
+            loc_indexes = np.concatenate(
+                [loc_indexes, loc_inds + i * anchor_num])
+            score_indexes = np.concatenate(
+                [score_indexes, score_inds + i * anchor_num])
+            tgt_labels = np.concatenate([tgt_labels, labels])
+            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
+            bbox_inside_weights = np.vstack([bbox_inside_weights, \
+                                             bbox_inside_weight])
+            fg_nums = np.concatenate([fg_nums, [[fg_num]]])
+
+    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights, fg_nums
+
+
 class TestRpnTargetAssignOp(OpTest):
    def setUp(self):
        n, c, h, w = 2, 4, 14, 14
@ -234,5 +333,65 @@ class TestRpnTargetAssignOp(OpTest):
        self.check_output()


+class TestRetinanetTargetAssignOp(OpTest):
+    def setUp(self):
+        n, c, h, w = 2, 4, 14, 14
+        all_anchors = get_anchor(n, c, h, w)
+        gt_num = 10
+        all_anchors = all_anchors.reshape(-1, 4)
+        anchor_num = all_anchors.shape[0]
+
+        images_shape = [[64, 64], [64, 64]]
+        groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
+        lod = [0, 4, 8]
+
+        im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            im_info[i, 0] = images_shape[i][0]
+            im_info[i, 1] = images_shape[i][1]
+            im_info[i, 2] = 0.8  #scale
+        gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
+        is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
+        gt_labels = np.vstack([
+            v['gt_classes'].reshape(len(v['gt_classes']), 1)
+            for v in groundtruth
+        ])
+        gt_labels = gt_labels.reshape(len(gt_labels), 1)
+        all_anchors = all_anchors.astype('float32')
+        gt_boxes = gt_boxes.astype('float32')
+        gt_labels = gt_labels.astype('int32')
+
+        positive_overlap = 0.5
+        negative_overlap = 0.4
+
+        loc_index, score_index, tgt_bbox, labels, bbox_inside_weights, fg_num = \
+            retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels, is_crowd,
+                                   im_info, lod, positive_overlap, negative_overlap)
+        labels = labels[:, np.newaxis]
+        self.op_type = "retinanet_target_assign"
+        self.inputs = {
+            'Anchor': all_anchors,
+            'GtBoxes': (gt_boxes, [[4, 4]]),
+            'GtLabels': (gt_labels, [[4, 4]]),
+            'IsCrowd': (is_crowd, [[4, 4]]),
+            'ImInfo': (im_info, [[1, 1]])
+        }
+        self.attrs = {
+            'positive_overlap': positive_overlap,
+            'negative_overlap': negative_overlap
+        }
+        self.outputs = {
+            'LocationIndex': loc_index.astype('int32'),
+            'ScoreIndex': score_index.astype('int32'),
+            'TargetBBox': tgt_bbox.astype('float32'),
+            'TargetLabel': labels.astype('int32'),
+            'BBoxInsideWeight': bbox_inside_weights.astype('float32'),
+            'ForegroundNumber': fg_num.astype('int32')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == '__main__':
    unittest.main()