You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Paddle/python/paddle/nn/functional/vision.py

383 lines
16 KiB

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...device import get_cudnn_version
from ...fluid.framework import core, in_dygraph_mode, Variable
from ...fluid.layer_helper import LayerHelper
from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid import dygraph_utils
import numpy as np
# TODO: define special functions used in computer vision tasks
# from ...fluid.layers import affine_channel #DEFINE_ALIAS
# from ...fluid.layers import anchor_generator #DEFINE_ALIAS
# from ...fluid.layers import bipartite_match #DEFINE_ALIAS
# from ...fluid.layers import box_clip #DEFINE_ALIAS
# from ...fluid.layers import box_coder #DEFINE_ALIAS
# from ...fluid.layers import box_decoder_and_assign #DEFINE_ALIAS
# from ...fluid.layers import collect_fpn_proposals #DEFINE_ALIAS
# from ...fluid.layers import deformable_roi_pooling #DEFINE_ALIAS
# from ...fluid.layers import density_prior_box #DEFINE_ALIAS
# from ...fluid.layers import detection_output #DEFINE_ALIAS
# from ...fluid.layers import distribute_fpn_proposals #DEFINE_ALIAS
# from ...fluid.layers import generate_mask_labels #DEFINE_ALIAS
# from ...fluid.layers import generate_proposal_labels #DEFINE_ALIAS
# from ...fluid.layers import generate_proposals #DEFINE_ALIAS
# from ...fluid.layers import image_resize #DEFINE_ALIAS
# from ...fluid.layers import prior_box #DEFINE_ALIAS
# from ...fluid.layers import prroi_pool #DEFINE_ALIAS
# from ...fluid.layers import psroi_pool #DEFINE_ALIAS
# from ...fluid.layers import resize_bilinear #DEFINE_ALIAS
# from ...fluid.layers import resize_nearest #DEFINE_ALIAS
# from ...fluid.layers import resize_trilinear #DEFINE_ALIAS
# from ...fluid.layers import roi_align #DEFINE_ALIAS
# from ...fluid.layers import roi_pool #DEFINE_ALIAS
# from ...fluid.layers import space_to_depth #DEFINE_ALIAS
# from ...fluid.layers import yolo_box #DEFINE_ALIAS
# from ...fluid.layers import yolov3_loss #DEFINE_ALIAS
# from ...fluid.layers import fsp_matrix #DEFINE_ALIAS
# from ...fluid.layers import image_resize_short #DEFINE_ALIAS
# from ...fluid.layers import pixel_shuffle #DEFINE_ALIAS
# from ...fluid.layers import retinanet_detection_output #DEFINE_ALIAS
# from ...fluid.layers import retinanet_target_assign #DEFINE_ALIAS
# from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS
# from ...fluid.layers import shuffle_channel #DEFINE_ALIAS
# Names exported by ``from <this module> import *``.
__all__ = ['affine_grid', 'grid_sample', 'pixel_shuffle']
def affine_grid(theta, out_shape, align_corners=True, name=None):
    """
    Build a sampling grid of (x, y) coordinates from a batch of affine
    transformation parameters. The grid describes the set of points at
    which the input feature map should be sampled in order to produce the
    transformed output feature map.

    Args:
        theta (Tensor): A tensor with shape [N, 2, 3] holding a batch of
            affine transform parameters. The data type can be float32 or
            float64.
        out_shape (Tensor | list | tuple): The shape of the target output,
            in the format [batch_size, channel, height, width]. It can be
            a Tensor, a list or a tuple. When it is a Tensor its data type
            must be int32.
        align_corners (bool): Whether to align the corners of the target
            feature map and the source feature map. Default: True.
        name (str|None): The default value is None. Normally there is no
            need for user to set this property. For more information,
            please refer to :ref:`api_guide_Name`.

    Returns:
        Tensor, A Tensor with shape [batch_size, H, W, 2] where 'H' and 'W'
        are the height and width of the feature map in the affine
        transformation. The data type is the same as `theta`.

    Raises:
        ValueError: If ``theta`` is not a Tensor, or if ``out_shape`` is
            not a list, tuple or Tensor.

    Examples:

        .. code-block:: python

            import paddle
            import paddle.nn.functional as F
            import numpy as np
            paddle.disable_static()
            # theta shape = [1, 2, 3]
            theta = np.array([[[-0.7, -0.4, 0.3],
                               [ 0.6,  0.5, 1.5]]]).astype("float32")
            theta_t = paddle.to_tensor(theta)
            y_t = F.affine_grid(
                    theta_t,
                    [1, 2, 3, 3],
                    align_corners=False)
            print(y_t.numpy())

            #[[[[ 1.0333333   0.76666665]
            #   [ 0.76666665  1.0999999 ]
            #   [ 0.5         1.4333333 ]]
            #
            #  [[ 0.5666667   1.1666666 ]
            #   [ 0.3         1.5       ]
            #   [ 0.03333333  1.8333334 ]]
            #
            #  [[ 0.10000002  1.5666667 ]
            #   [-0.16666666  1.9000001 ]
            #   [-0.43333334  2.2333333 ]]]]
    """
    layer_helper = LayerHelper('affine_grid')

    if not isinstance(theta, Variable):
        raise ValueError("The theta should be a Tensor.")
    check_variable_and_dtype(theta, 'theta', ['float32', 'float64'],
                             'affine_grid')

    # cuDNN is requested only when a version is reported, that version is
    # at least 6000, and corner alignment is enabled.
    cudnn_version = get_cudnn_version()
    use_cudnn = bool(cudnn_version is not None and cudnn_version >= 6000 and
                     align_corners)

    shape_is_tensor = isinstance(out_shape, Variable)
    if not (shape_is_tensor or isinstance(out_shape, (list, tuple))):
        raise ValueError("The out_shape should be a list, tuple or Tensor.")

    if in_dygraph_mode():
        # The dygraph op receives the shape as an attribute, so a Tensor
        # shape is converted to a plain Python list first.
        if shape_is_tensor:
            shape_attr = out_shape.numpy().tolist()
        else:
            shape_attr = out_shape
        return core.ops.affine_grid(theta, "output_shape", shape_attr,
                                    "align_corners", align_corners,
                                    "use_cudnn", use_cudnn)

    result = layer_helper.create_variable_for_type_inference(theta.dtype)
    op_inputs = {'Theta': theta}
    op_attrs = {"align_corners": align_corners, "use_cudnn": use_cudnn}
    if shape_is_tensor:
        # A Tensor shape is wired in as an op input ...
        op_inputs['OutputShape'] = out_shape
        check_variable_and_dtype(out_shape, 'out_shape', ['int32'],
                                 'affine_grid')
    else:
        # ... while a list/tuple shape is stored as an op attribute.
        op_attrs['output_shape'] = out_shape

    layer_helper.append_op(
        type='affine_grid',
        inputs=op_inputs,
        outputs={'Output': result},
        attrs=op_attrs or None)
    return result
def grid_sample(x,
                grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=True,
                name=None):
    """
    This operation samples input X by using bilinear interpolation or
    nearest interpolation based on flow field grid, which is usually
    generated by :code:`affine_grid` . The grid of shape [N, H, W, 2]
    is the concatenation of (x, y) coordinates with shape [N, H, W] each,
    where x is indexing the 4th dimension (in width dimension) of input
    data x and y is indexing the 3rd dimension (in height dimension).
    The final result is the bilinear interpolation of the 4 nearest corner
    points, or the value of the nearest point. The output tensor shape
    will be [N, C, H, W].

    Step 1:

    Get (x, y) grid coordinates and scale to [0, H-1/W-1].

    .. code-block:: text

        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)

    Step 2:

    Index input data X with grid (x, y) in each [H, W] area, and compute
    either the bilinear interpolation of the 4 nearest points or the value
    of the nearest point.

    .. code-block:: text

          wn ------- y_n ------- en
          |           |           |
          |          d_n          |
          |           |           |
         x_w --d_w-- grid--d_e-- x_e
          |           |           |
          |          d_s          |
          |           |           |
          ws ------- y_s ------- es

        For bilinear interpolation:

        x_w = floor(x)              // west side x coord
        x_e = x_w + 1               // east side x coord
        y_n = floor(y)              // north side y coord
        y_s = y_n + 1               // south side y coord

        d_w = grid_x - x_w          // distance to west side
        d_e = x_e - grid_x          // distance to east side
        d_n = grid_y - y_n          // distance to north side
        d_s = y_s - grid_y          // distance to south side

        wn = X[:, :, y_n, x_w]      // north-west point value
        en = X[:, :, y_n, x_e]      // north-east point value
        ws = X[:, :, y_s, x_w]      // south-west point value
        es = X[:, :, y_s, x_e]      // south-east point value

        output = wn * d_e * d_s + en * d_w * d_s
               + ws * d_e * d_n + es * d_w * d_n

    Args:
        x(Tensor): The input tensor, which is a 4-d tensor with shape
                     [N, C, H, W], N is the batch size, C is the channel
                     number, H and W is the feature height and width.
                     The data type is float32 or float64.
        grid(Tensor): Input grid tensor of shape [N, grid_H, grid_W, 2]. The
                        data type is float32 or float64.
        mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
                         Default: 'bilinear'.
        padding_mode(str, optional): The padding method used when source index
                   is out of input images. It can be 'zeros', 'reflection' and 'border'.
                   Default: zeros.
        align_corners(bool, optional): If `align_corners` is true, it will project
                   -1 and 1 to the centers of the corner pixels. Otherwise, it will
                   project -1 and 1 to the image edges. Default: True.
        name(str, optional): For detailed information, please refer
                             to :ref:`api_guide_Name`. Usually name is no need to set and
                             None by default.

    Returns:
        Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor.

    Raises:
        ValueError: If ``mode`` is not 'bilinear' or 'nearest', if
            ``padding_mode`` is not 'zeros', 'reflection' or 'border', or
            if ``align_corners`` is not a bool.

    Examples:

        .. code-block:: python

            import paddle
            import paddle.nn.functional as F
            import numpy as np

            # shape=[1, 1, 3, 3]
            x = np.array([[[[-0.6,  0.8, -0.5],
                            [-0.5,  0.2,  1.2],
                            [ 1.4,  0.3, -0.2]]]]).astype("float64")

            # grid shape = [1, 3, 4, 2]
            grid = np.array(
                         [[[[ 0.2,  0.3],
                            [-0.4, -0.3],
                            [-0.9,  0.3],
                            [-0.9, -0.6]],
                           [[ 0.4,  0.1],
                            [ 0.9, -0.8],
                            [ 0.4,  0.5],
                            [ 0.5, -0.2]],
                           [[ 0.1, -0.8],
                            [-0.3, -1. ],
                            [ 0.7,  0.4],
                            [ 0.2,  0.8]]]]).astype("float64")

            x = paddle.to_tensor(x)
            grid = paddle.to_tensor(grid)
            y_t = F.grid_sample(
                x,
                grid,
                mode='bilinear',
                padding_mode='border',
                align_corners=True)
            print(y_t.numpy())

            # output shape = [1, 1, 3, 4]
            # [[[[ 0.34   0.016  0.086 -0.448]
            #    [ 0.55  -0.076  0.35   0.59 ]
            #    [ 0.596  0.38   0.52   0.24 ]]]]
    """
    # Keep this as the first statement: ``locals()`` must contain only the
    # call arguments when they are forwarded to the helper.
    helper = LayerHelper("grid_sample", **locals())
    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sample')
    check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
                             'grid_sample')

    # Kept as lists because they are interpolated into the error messages.
    _modes = ['bilinear', 'nearest']
    _padding_modes = ['zeros', 'reflection', 'border']

    if mode not in _modes:
        raise ValueError(
            "The mode of grid sample function should be in {}, but got: {}".
            format(_modes, mode))
    if padding_mode not in _padding_modes:
        raise ValueError(
            "The padding mode of grid sample function should be in {}, but got: {}".
            format(_padding_modes, padding_mode))
    if not isinstance(align_corners, bool):
        raise ValueError("The align corners should be bool, but got: {}".format(
            align_corners))

    # cuDNN is requested only for the bilinear / zero-padding /
    # aligned-corners combination, and only when a cuDNN version is reported.
    cudnn_version = get_cudnn_version()
    use_cudnn = (cudnn_version is not None and align_corners and
                 mode == 'bilinear' and padding_mode == 'zeros')

    if in_dygraph_mode():
        # The dygraph op takes attributes as a flat name/value sequence.
        return core.ops.grid_sampler(x, grid, 'mode', mode, 'padding_mode',
                                     padding_mode, 'align_corners',
                                     align_corners, 'use_cudnn', use_cudnn)

    # Static graph: the inputs/attrs dictionaries are only needed here.
    out = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='grid_sampler',
        inputs={'X': x,
                'Grid': grid},
        attrs={
            'mode': mode,
            'padding_mode': padding_mode,
            'align_corners': align_corners,
            'use_cudnn': use_cudnn,
        },
        outputs={'Output': out})
    return out
def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
    """
    This API implements pixel shuffle operation.
    See more details in :ref:`api_nn_vision_PixelShuffle` .

    For example, with the default "NCHW" layout, an input of shape
    [2, 9, 4, 4] and ``upscale_factor=3`` produces an output of shape
    [2, 1, 12, 12].

    Parameters:
        x(Tensor): 4-D tensor, the data type should be float32 or float64.
        upscale_factor(int): factor to increase spatial resolution.
        data_format (str): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width].
        name (str, optional): The default value is None. Normally there is no need for user to set this property.

    Returns:
        Out(tensor): Reshaped tensor according to the new dimension.

    Raises:
        TypeError: If ``upscale_factor`` is not an int.
        ValueError: If ``data_format`` is neither "NCHW" nor "NHWC".
        ValueError: If the square of upscale_factor cannot divide the channels of input.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.nn.functional as F
            import numpy as np
            x = np.random.randn(2, 9, 4, 4).astype(np.float32)
            paddle.disable_static()
            x_var = paddle.to_tensor(x)
            out_var = F.pixel_shuffle(x_var, 3)
            out = out_var.numpy()
            print(out.shape)
            # (2, 1, 12, 12)
    """
    # The input dtype check is applied in static graph mode only.
    if not in_dygraph_mode():
        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
                                 'pixel_shuffle')

    if not isinstance(upscale_factor, int):
        raise TypeError("upscale factor must be int type")

    if data_format not in ["NCHW", "NHWC"]:
        raise ValueError(
            "Attr(data_format) should be 'NCHW' or 'NHWC'. "
            "But received Attr(data_format): {}".format(data_format))

    if in_dygraph_mode():
        return core.ops.pixel_shuffle(x, "upscale_factor", upscale_factor,
                                      "data_format", data_format)

    # No new local variables may be introduced above this line:
    # ``locals()`` forwards the call arguments to the helper.
    helper = LayerHelper("pixel_shuffle", **locals())

    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(
        type="pixel_shuffle",
        inputs={"X": x},
        outputs={"Out": out},
        attrs={"upscale_factor": upscale_factor,
               "data_format": data_format})
    return out