@@ -19,8 +19,7 @@ import paddle.fluid as fluid
 import contextlib
 import unittest
 import numpy as np
-from paddle.static.amp import cast_model_to_fp16
-from paddle.static.amp import cast_parameters_to_fp16
+from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_model_to_fp16
 
 paddle.enable_static()
@@ -65,38 +64,19 @@ def resnet_cifar10(input, depth=32):
     n = (depth - 2) // 6
     conv1 = conv_bn_layer(
         input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    with paddle.static.amp.fp16_guard():
+        res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+        res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+        res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
     pool = fluid.layers.pool2d(
         input=res3, pool_size=8, pool_type='avg', pool_stride=1)
     return pool
 
 
-def compile(program, loss_name=None):
-    build_strategy = paddle.static.BuildStrategy()
-    exec_strategy = paddle.static.ExecutionStrategy()
-
-    exec_strategy.num_threads = 1
-    exec_strategy.num_iteration_per_drop_scope = 10000
-
-    build_strategy.fuse_bn_act_ops = True
-    build_strategy.fuse_elewise_add_act_ops = True
-    build_strategy.fuse_bn_add_act_ops = True
-
-    compiled_program = paddle.static.CompiledProgram(
-        program).with_data_parallel(
-            loss_name=loss_name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
-
-    return compiled_program
-
-
-def train(use_pure_fp16=True, use_nesterov=False):
+def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
     classdim = 10
     data_shape = [3, 32, 32]
-    BATCH_SIZE = 128
+    BATCH_SIZE = 32
     PASS_NUM = 1
 
     train_program = fluid.Program()
@@ -107,28 +87,35 @@ def train(use_pure_fp16=True, use_nesterov=False):
         images = fluid.layers.data(
             name='pixel', shape=data_shape, dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-        net = resnet_cifar10(images, 32)
+        net = resnet_cifar10(images)
         logits = fluid.layers.fc(input=net, size=classdim, act="softmax")
-        if use_pure_fp16:
-            cast_model_to_fp16(fluid.default_main_program())
-            logits_fp32 = fluid.layers.cast(x=logits, dtype="float32")
-        else:
-            logits_fp32 = logits
         cost = fluid.layers.softmax_with_cross_entropy(
-            logits_fp32, label, return_softmax=False)
+            logits, label, return_softmax=False)
         sum_cost = fluid.layers.reduce_sum(cost)
 
         # Test program
         test_program = train_program.clone(for_test=True)
 
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.001,
-            momentum=0.9,
-            use_nesterov=use_nesterov,
-            weight_decay=fluid.regularizer.L2Decay(1e-4),
-            multi_precision=use_pure_fp16,
-            rescale_grad=1.0 / BATCH_SIZE)
+        if use_adam:
+            optimizer = paddle.optimizer.Adam(
+                learning_rate=0.001,
+                epsilon=1e-8,
+                weight_decay=0.0,
+                multi_precision=True)
+        else:
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                momentum=0.9,
+                use_nesterov=use_nesterov,
+                weight_decay=fluid.regularizer.L2Decay(1e-4),
+                multi_precision=use_pure_fp16)
+
+        if use_pure_fp16:
+            optimizer = paddle.static.amp.decorate(
+                optimizer,
+                init_loss_scaling=128.0,
+                use_dynamic_loss_scaling=True,
+                use_pure_fp16=True)
 
         optimizer.minimize(sum_cost)
@@ -146,13 +133,13 @@ def train(use_pure_fp16=True, use_nesterov=False):
     def train_loop(main_program):
         exe.run(startup_prog)
         if use_pure_fp16:
-            cast_parameters_to_fp16(place, train_program, fluid.global_scope())
-        compiled_program = compile(train_program, sum_cost.name)
+            optimizer.amp_init(
+                place, test_program=test_program, use_fp16_test=True)
         loss = 0.0
         for pass_id in range(PASS_NUM):
             train_loss_list = []
             for batch_id, data in enumerate(train_reader()):
-                loss, = exe.run(compiled_program,
+                loss, = exe.run(train_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[sum_cost])
                 loss_v = loss[0] if isinstance(loss, np.ndarray) else loss
@@ -182,18 +169,25 @@ class TestImageMultiPrecision(unittest.TestCase):
         if not fluid.core.is_compiled_with_cuda():
             return
 
-        def do_test(use_nesterov=False):
-            suffix = "with Nesterov" if use_nesterov else "without Nesterov"
+        def do_test(use_nesterov=False, use_adam=False):
+            if use_adam:
+                suffix = "use Adam"
+            else:
+                suffix = "with Nesterov" if use_nesterov else "without Nesterov"
             with self.scope_prog_guard():
                 print("-----------------FP16 Train {}-----------------".format(
                     suffix))
                 train_loss_fp16, test_loss_fp16 = train(
-                    use_pure_fp16=True, use_nesterov=use_nesterov)
+                    use_pure_fp16=True,
+                    use_nesterov=use_nesterov,
+                    use_adam=use_adam)
             with self.scope_prog_guard():
                 print("-----------------FP32 Train {}-----------------".format(
                     suffix))
                 train_loss_fp32, test_loss_fp32 = train(
-                    use_pure_fp16=False, use_nesterov=use_nesterov)
+                    use_pure_fp16=False,
+                    use_nesterov=use_nesterov,
+                    use_adam=use_adam)
 
             self.assertTrue(
                 np.allclose(
@@ -214,6 +208,7 @@ class TestImageMultiPrecision(unittest.TestCase):
         do_test(use_nesterov=False)
         do_test(use_nesterov=True)
+        do_test(use_adam=True)
 
     @contextlib.contextmanager
     def scope_prog_guard(self):
@ -260,7 +255,7 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase):
op . _set_attr ( ' out_dtype ' , fluid . core . VarDesc . VarType . FP32 )
op . _set_attr ( ' out_dtype ' , fluid . core . VarDesc . VarType . FP32 )
op . _set_attr ( ' dtype ' , fluid . core . VarDesc . VarType . FP32 )
op . _set_attr ( ' dtype ' , fluid . core . VarDesc . VarType . FP32 )
cast_model_to_fp16 ( main_prog )
cast_model_to_fp16 ( main_prog , use_fp16_guard = False )
def test_non_iterable_dataloader ( self ) :
def test_non_iterable_dataloader ( self ) :
self . decorate_with_data_loader ( )
self . decorate_with_data_loader ( )
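
For reference, the pure-FP16 pattern this diff moves the test to (decorate the optimizer with use_pure_fp16=True, run the startup program, then call amp_init before training) can be sketched in isolation roughly as below. This is a minimal sketch, not part of the test: it assumes a CUDA-enabled Paddle build from roughly the same era as this change, where paddle.static.amp.decorate accepts use_pure_fp16 and the decorated optimizer exposes amp_init; the tiny fc network and the tensor names 'x' and 'y' are illustrative only.

import numpy as np
import paddle

paddle.enable_static()

place = paddle.CUDAPlace(0)  # pure FP16 training requires a GPU
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()

with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 32], dtype='float32')
    y = paddle.static.data(name='y', shape=[None, 1], dtype='int64')
    logits = paddle.static.nn.fc(x, size=10)
    loss = paddle.mean(paddle.nn.functional.cross_entropy(logits, y))

    optimizer = paddle.optimizer.Momentum(
        learning_rate=0.001, momentum=0.9, multi_precision=True)
    # Rewrites the program so FP16-safe ops run in float16 and adds
    # dynamic loss scaling around the backward pass.
    optimizer = paddle.static.amp.decorate(
        optimizer,
        init_loss_scaling=128.0,
        use_dynamic_loss_scaling=True,
        use_pure_fp16=True)
    optimizer.minimize(loss)

exe = paddle.static.Executor(place)
exe.run(startup_prog)
# Casts the freshly initialized parameters to float16 while keeping
# float32 master weights; this is what replaces the old
# cast_parameters_to_fp16 call in the test.
optimizer.amp_init(place)

x_np = np.random.rand(4, 32).astype('float32')
y_np = np.random.randint(0, 10, size=(4, 1)).astype('int64')
loss_val, = exe.run(main_prog,
                    feed={'x': x_np, 'y': y_np},
                    fetch_list=[loss])
print(loss_val)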