Move dygraph amp api to paddle-2.0 (#27681)
* move dygraph amp api to paddle
* refine code and add unit test
parent a0f1dba37f
commit 69a3339aaa
@@ -0,0 +1,18 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .auto_cast import auto_cast
from .grad_scaler import GradScaler

__all__ = ['auto_cast', 'GradScaler']
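With this `__init__` in place, the two entry points are re-exported under the `paddle.amp` namespace. A minimal sketch of the resulting import surface (not part of the diff; it only assumes the `paddle` package is importable):

import paddle
from paddle.amp import auto_cast, GradScaler  # direct imports from the subpackage

# The same objects are reachable through the package namespace as well,
# thanks to the re-exports above.
assert auto_cast is paddle.amp.auto_cast
assert GradScaler is paddle.amp.GradScaler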
@@ -0,0 +1,52 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.dygraph.amp import amp_guard

__all__ = ['auto_cast']


def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
    """
    Create a context which enables auto-mixed-precision (AMP) for operators executed in dynamic graph mode.
    If enabled, the input data type (float32 or float16) of each operator is decided
    by the autocast algorithm for better performance.

    Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
    imperative mode.

    Args:
        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
        custom_white_list(set|list, optional): The custom white_list. Operators in this list run in float16.
        custom_black_list(set|list, optional): The custom black_list. Operators in this list are kept in float32.

    Examples:

        .. code-block:: python

            import paddle

            conv2d = paddle.nn.Conv2d(3, 2, 3, bias_attr=False)
            data = paddle.rand([10, 3, 32, 32])

            with paddle.amp.auto_cast():
                conv = conv2d(data)
                print(conv.dtype)  # FP16

            with paddle.amp.auto_cast(enable=False):
                conv = conv2d(data)
                print(conv.dtype)  # FP32

    """
    return amp_guard(enable, custom_white_list, custom_black_list)
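Beyond the defaults, `custom_white_list` and `custom_black_list` let a caller pin specific operators to float16 or float32 under `auto_cast`. A minimal sketch of black-listing the convolution, assuming `'conv2d'` is the operator name the AMP lists use for `paddle.nn.Conv2d` (the op-name string is an assumption here, not taken from this diff):

import paddle

conv2d = paddle.nn.Conv2d(3, 2, 3, bias_attr=False)
data = paddle.rand([10, 3, 32, 32])

# Black-listed ops are kept in float32 even while AMP is enabled,
# so the convolution below should produce an FP32 result.
with paddle.amp.auto_cast(custom_black_list={'conv2d'}):
    conv = conv2d(data)
    print(conv.dtype)  # expected FP32 because 'conv2d' is black-listed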
@@ -0,0 +1,136 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid.dygraph.amp import AmpScaler

__all__ = ['GradScaler']


class GradScaler(AmpScaler):
    """
    GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph
    mode. It controls the scaling of the loss and helps avoid numerical overflow.
    An object of this class has two methods: `scale()` and `minimize()`.

    `scale()` is used to multiply the loss by a scale ratio.
    `minimize()` is similar to `Optimizer.minimize()` and performs the parameter update.

    Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
    dynamic graph mode.

    Args:
        enable(bool, optional): Enable loss scaling or not. Default is True.
        init_loss_scaling(float, optional): The initial loss scaling factor. Default is 2**15.
        incr_ratio(float, optional): The multiplier to use when increasing the loss
            scaling. Default is 2.0.
        decr_ratio(float, optional): The less-than-one multiplier to use when decreasing
            the loss scaling. Default is 0.5.
        incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
            steps with finite gradients. Default is 1000.
        decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
            accumulated steps with nan or inf gradients. Default is 1.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling.
            If False, the fixed loss_scaling is used. If True, the loss scaling is
            updated dynamically. Default is True.
    Returns:
        A GradScaler object.

    Examples:

        .. code-block:: python

            import paddle

            model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
            optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
            data = paddle.rand([10, 3, 32, 32])
            with paddle.amp.auto_cast():
                conv = model(data)
                loss = paddle.reduce_mean(conv)
            scaled = scaler.scale(loss)  # scale the loss
            scaled.backward()  # do backward
            scaler.minimize(optimizer, scaled)  # update parameters
    """

    def __init__(self,
                 enable=True,
                 init_loss_scaling=2.**15,
                 incr_ratio=2.0,
                 decr_ratio=0.5,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=1,
                 use_dynamic_loss_scaling=True):
        super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio,
                                         decr_ratio, incr_every_n_steps,
                                         decr_every_n_nan_or_inf,
                                         use_dynamic_loss_scaling)

    def scale(self, var):
        """
        Multiplies a Tensor by the scale factor and returns the scaled output.
        If this instance of :class:`GradScaler` is not enabled, the output is returned unmodified.

        Args:
            var (Tensor): The Tensor to scale.
        Returns:
            The scaled Tensor or the original Tensor.

        Examples:

            .. code-block:: python

                import paddle

                model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])
                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.reduce_mean(conv)
                scaled = scaler.scale(loss)  # scale the loss
                scaled.backward()  # do backward
                scaler.minimize(optimizer, scaled)  # update parameters
        """
        return super(GradScaler, self).scale(var)

    def minimize(self, optimizer, *args, **kwargs):
        """
        This function is similar to `Optimizer.minimize()`; it performs the parameter update.

        If the scaled gradients of the parameters contain NaN or Inf, the parameter update is skipped.
        Otherwise, it first unscales the scaled gradients of the parameters and then updates the parameters.

        Finally, the loss scaling ratio is updated.

        Args:
            optimizer(Optimizer): The optimizer used to update the parameters.
            args: Arguments, which will be forwarded to `Optimizer.minimize()`.
            kwargs: Keyword arguments, which will be forwarded to `Optimizer.minimize()`.

        Examples:

            .. code-block:: python

                import paddle

                model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
                data = paddle.rand([10, 3, 32, 32])
                with paddle.amp.auto_cast():
                    conv = model(data)
                    loss = paddle.reduce_mean(conv)
                scaled = scaler.scale(loss)  # scale the loss
                scaled.backward()  # do backward
                scaler.minimize(optimizer, scaled)  # update parameters
        """
        return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
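Taken together, a typical dynamic-graph AMP training loop wraps the forward pass in `paddle.amp.auto_cast`, scales the loss with `GradScaler.scale()` before calling `backward()`, and hands the scaled loss to `GradScaler.minimize()`, which unscales the gradients, skips the update when they contain NaN/Inf, and adjusts the loss scale. A minimal end-to-end sketch built from the docstring examples above (the loop, its step count, and the `clear_gradients()` call are illustrative additions, not part of this diff):

import paddle

model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                 parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

for step in range(10):  # arbitrary number of training steps
    data = paddle.rand([10, 3, 32, 32])
    with paddle.amp.auto_cast():          # forward pass runs under AMP
        conv = model(data)
        loss = paddle.reduce_mean(conv)
    scaled = scaler.scale(loss)           # multiply the loss by the current scale
    scaled.backward()                     # gradients are scaled as well
    scaler.minimize(optimizer, scaled)    # unscale, maybe skip, update the scale
    model.clear_gradients()               # reset gradients for the next step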