remove params in Tracer object (in dygraph) (#20815)

* remove params in Tracer object, test=develop

* Repair failed optest, test=develop

* remove build_once & name_scope (Conv2D)
test=develop

* fix unittest
test=develop

* Conv2DTranspose

* Conv3D & Conv3DTranspose
test=develop

* Pool2D & BatchNorm

* Embedding

* LayerNorm

* GRUUnit & NCE

* PRelu

* BilinearTensorProduct

* GroupNorm & SpectralNorm

* TreeConv
test=develop

* fix LayerNorm in transformer unittest
test=develop

* disable LayerNorm or BatchNorm in multicard
test=develop

* refine Layer.create_parameter api
test=develop

* refine LayerNorm, remove begin_norm_axis param, add normed shape check
test=develop

* LayerNorm bug fix
test=develop

* fix optest, test=develop

* fix optest, test=develop

* fix optest for pass parameter_list when constructing an Optimizer class instance, test=develop

* polish code for better code style, test=develop

* fix se_resnext optest, test=develop

* polish code for better code style, test=develop

Co-authored-by: songyouwei <youwei0314@gmail.com>
Branch: release/1.7
Author: zhongpu (committed by hong)
Parent: c3e1954918
Commit: dca075839b
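The central user-facing change: with the parameter registry removed from the dygraph Tracer, an optimizer can no longer discover parameters globally, so dygraph code must pass parameter_list explicitly. A minimal sketch of the new calling pattern (layer sizes and data below are illustrative, assuming the Paddle 1.7 fluid.dygraph API):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    fc = Linear(10, 3)
    # New style: parameters come from the layer itself rather than
    # from the (removed) Tracer-level variable registry.
    sgd = fluid.optimizer.SGD(learning_rate=1e-3,
                              parameter_list=fc.parameters())

    x = to_variable(np.random.random((4, 10)).astype('float32'))
    loss = fluid.layers.reduce_mean(fc(x))
    loss.backward()
    sgd.minimize(loss)
    fc.clear_gradients()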

@ -154,14 +154,14 @@ def guard(place=None):
yield
def _print_debug_msg(limit=5, is_test=False):
def _print_debug_msg(parameter_list, limit=5, is_test=False):
if not core._is_dygraph_debug_enabled():
logging.warn(
'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
)
return
unique_name_size = len(framework.unique_name.generator.ids)
tracer_var_size = len(framework._dygraph_tracer()._vars)
tracer_var_size = len(parameter_list)
alive_cpp_var_size = len(core.VarBase._alive_vars())
if not is_test:
logging.warn(

@ -53,7 +53,8 @@ def save_dygraph(state_dict, model_path):
state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
parameter_list = emb.parameters() )
state_dict = adam.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
@ -96,7 +97,8 @@ def load_dygraph(model_path):
state_dict = emb.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000) )
adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000),
parameter_list = emb.parameters() )
state_dict = adam.state_dict()
fluid.save_dygraph( state_dict, "paddle_dy")
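Putting the two updated docstrings together, a save/load round trip now looks roughly like the sketch below. This is a hedged example: the embedding size is illustrative, and load_dygraph / set_dict are assumed to mirror the state_dict calls shown above by returning and restoring the layer and optimizer states.

import paddle.fluid as fluid

with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding([10, 10])
    fluid.save_dygraph(emb.state_dict(), "paddle_dy")

    adam = fluid.optimizer.Adam(
        learning_rate=fluid.layers.noam_decay(100, 10000),
        parameter_list=emb.parameters())
    fluid.save_dygraph(adam.state_dict(), "paddle_dy")

    # Assumed: load_dygraph returns the layer state and the optimizer state.
    para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
    emb.set_dict(para_state_dict)
    adam.set_dict(opti_state_dict)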

@ -145,9 +145,13 @@ class Layer(core.Layer):
list of :ref:`api_guide_Variable_en` : a list of Parameters.
"""
ret = [p for p in self._parameters.values()]
parameters_set = set(ret)
if include_sublayers:
for l in self._sub_layers.values():
for p in l.parameters(include_sublayers):
if p in parameters_set:
continue
parameters_set.add(p)
ret.append(p)
return ret
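The extra bookkeeping above makes Layer.parameters() return each parameter once, even when a sublayer is reachable through more than one attribute. A hedged sketch of the resulting behavior (the toy layer below is illustrative):

import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear

class SharedNet(fluid.dygraph.Layer):
    def __init__(self):
        super(SharedNet, self).__init__()
        self.fc = Linear(4, 4)
        # The same sublayer registered under a second name shares its parameters.
        self.alias = self.fc

    def forward(self, x):
        return self.alias(self.fc(x))

with fluid.dygraph.guard():
    net = SharedNet()
    # With the de-duplication, weight and bias each appear once: 2, not 4.
    print(len(net.parameters()))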
@ -261,11 +265,6 @@ class Layer(core.Layer):
value.set_value(self._loaddict_holder[value.name])
if name in params:
# remove unused param in tracer
if framework._dygraph_tracer_ is not None:
framework._dygraph_tracer_._vars.pop(params[name].name,
None)
params[name] = value
elif isinstance(value, core.Layer):
layers = self.__dict__.get('_sub_layers', None)

@ -104,8 +104,10 @@ class PiecewiseDecay(LearningRateDecay):
boundaries = [10000, 20000]
values = [1.0, 0.5, 0.1]
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding( [10, 10] )
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) )
learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0),
parameter_list = emb.parameters() )
"""
def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
@ -323,12 +325,14 @@ class InverseTimeDecay(LearningRateDecay):
import paddle.fluid as fluid
base_lr = 0.1
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding([10, 10])
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.dygraph.InverseTimeDecay(
learning_rate=base_lr,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
staircase=True),
parameter_list = emb.parameters())
"""
@ -404,9 +408,11 @@ class PolynomialDecay(LearningRateDecay):
total_step = 5000
end_lr = 0
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding( [10, 10])
optimizer = fluid.optimizer.SGD(
learning_rate = fluid.dygraph.PolynomialDecay(
start_lr, total_step, end_lr, power=1.0) )
start_lr, total_step, end_lr, power=1.0),
parameter_list = emb.parameters())
"""
@ -536,10 +542,12 @@ class NoamDecay(LearningRateDecay):
warmup_steps = 100
learning_rate = 0.01
with fluid.dygraph.guard():
emb = fluid.dygraph.Embedding([10, 10])
optimizer = fluid.optimizer.SGD(
learning_rate = fluid.dygraph.NoamDecay(
1/(warmup_steps *(learning_rate ** 2)),
warmup_steps) )
warmup_steps),
parameter_list = emb.parameters())
"""
def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
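The docstring updates above all follow the same pattern: a dygraph LearningRateDecay object is still passed as learning_rate, but the optimizer additionally needs parameter_list. A hedged sketch using PiecewiseDecay, with the embedding shape mirroring the docstring examples:

import paddle.fluid as fluid

boundaries = [10000, 20000]
values = [1.0, 0.5, 0.1]
with fluid.dygraph.guard():
    emb = fluid.dygraph.Embedding([10, 10])
    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0),
        parameter_list=emb.parameters())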

@ -31,16 +31,8 @@ class Tracer(core.Tracer):
def __init__(self):
super(Tracer, self).__init__()
self._vars = defaultdict()
self._train_mode = True
def trace_var(self, name, var):
self._vars[name] = var
def all_parameters(self):
return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter)))
def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False):
self.trace(type, inputs, outputs, attrs,
framework._current_expected_place(), self._train_mode and

@ -4676,8 +4676,6 @@ class ParamBase(core.VarBase):
# self.block = default_main_program().global_block()
_dygraph_tracer().trace_var(name, self)
def __str__(self):
return self.to_string(True)
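With Tracer._vars and Tracer.all_parameters() gone, and ParamBase no longer registering itself with the tracer, code that used to collect parameters from the global tracer should ask the layer hierarchy instead. A hedged before/after sketch:

import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear

with fluid.dygraph.guard():
    model = Linear(8, 2)
    # Old (removed): fluid.framework._dygraph_tracer().all_parameters()
    # New: collect parameters directly from the layer.
    for p in model.parameters():
        print(p.name, p.shape)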

File diff suppressed because it is too large.

@ -26,7 +26,7 @@ import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable
from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
@ -79,8 +79,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
def __init__(self):
super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
@ -88,19 +88,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
self.pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(
self.pool_2_shape,
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs, label):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
cost = self._fc(x)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
@ -109,10 +111,11 @@ class MNIST(fluid.dygraph.Layer):
class TestMnist(TestParallelDyGraphRunnerBase):
def get_model(self):
model = MNIST("mnist")
model = MNIST()
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
opt = fluid.optimizer.Adam(learning_rate=1e-3)
opt = fluid.optimizer.Adam(
learning_rate=1e-3, parameter_list=model.parameters())
return model, train_reader, opt
def run_one_loop(self, model, opt, data):
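The MNIST changes above illustrate the general FC → Linear migration used throughout these tests: Linear takes explicit input and output sizes and does not flatten its input, so the caller reshapes before the matmul. A hedged, self-contained sketch with illustrative sizes:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable

with fluid.dygraph.guard():
    pool_2_shape = 50 * 4 * 4
    # Old: FC(self.full_name(), 10, act="softmax") flattened its input itself.
    fc = Linear(pool_2_shape, 10, act="softmax")

    feature_map = to_variable(
        np.random.random((2, 50, 4, 4)).astype('float32'))
    flat = fluid.layers.reshape(feature_map, shape=[-1, pool_2_shape])
    out = fc(flat)  # shape [2, 10]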

@ -27,7 +27,7 @@ import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.layer_helper import LayerHelper
import math
@ -54,7 +54,7 @@ train_parameters = {
}
def optimizer_setting(params):
def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"]
if "total_images" not in params:
total_images = 6149
@ -66,11 +66,19 @@ def optimizer_setting(params):
bd = [step * e for e in ls["epochs"]]
lr = params["lr"]
num_epochs = params["num_epochs"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay),
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(
learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
return optimizer
@ -107,27 +115,29 @@ class ConvBNLayer(fluid.dygraph.Layer):
class SqueezeExcitation(fluid.dygraph.Layer):
def __init__(self, name_scope, num_channels, reduction_ratio):
def __init__(self, num_channels, reduction_ratio):
super(SqueezeExcitation, self).__init__(name_scope)
super(SqueezeExcitation, self).__init__()
self._num_channels = num_channels
self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self._squeeze = FC(
self.full_name(),
size=num_channels // reduction_ratio,
self._squeeze = Linear(
num_channels,
num_channels // reduction_ratio,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='relu')
stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0)
self._excitation = FC(
self.full_name(),
size=num_channels,
self._excitation = Linear(
num_channels // reduction_ratio,
num_channels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)),
act='sigmoid')
def forward(self, input):
y = self._pool(input)
y = fluid.layers.reshape(y, shape=[-1, self._num_channels])
y = self._squeeze(y)
y = self._excitation(y)
y = fluid.layers.elementwise_mul(x=input, y=y, axis=0)
@ -163,9 +173,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
act=None)
self.scale = SqueezeExcitation(
self.full_name(),
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
num_channels=num_filters * 2, reduction_ratio=reduction_ratio)
if not shortcut:
self.short = ConvBNLayer(
@ -194,8 +202,8 @@ class BottleneckBlock(fluid.dygraph.Layer):
class SeResNeXt(fluid.dygraph.Layer):
def __init__(self, name_scope, layers=50, class_dim=102):
super(SeResNeXt, self).__init__(name_scope)
def __init__(self, layers=50, class_dim=102):
super(SeResNeXt, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
@ -276,10 +284,13 @@ class SeResNeXt(fluid.dygraph.Layer):
pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(),
size=class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1
self.out = Linear(
self.pool2d_avg_output,
class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
if self.layers == 50 or self.layers == 101:
@ -294,18 +305,20 @@ class SeResNeXt(fluid.dygraph.Layer):
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y
class TestSeResNeXt(TestParallelDyGraphRunnerBase):
def get_model(self):
model = SeResNeXt("se-resnext")
model = SeResNeXt()
train_reader = paddle.batch(
paddle.dataset.flowers.test(use_xmap=False),
batch_size=train_parameters["batch_size"],
drop_last=True)
optimizer = optimizer_setting(train_parameters)
optimizer = optimizer_setting(
train_parameters, parameter_list=model.parameters())
return model, train_reader, optimizer
def run_one_loop(self, model, opt, data):

@ -23,7 +23,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
@ -75,8 +75,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
def __init__(self):
super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
@ -84,19 +84,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
self.pool_2_shape = 50 * 4 * 4
SIZE = 100 #10
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(
self.pool_2_shape,
SIZE,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
x = self._fc(x)
return x
@ -109,8 +111,9 @@ class TestDygraphMultiForward(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
mnist = MNIST()
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist.parameters())
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
@ -145,7 +148,7 @@ class TestDygraphMultiForward(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

@ -258,7 +258,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
fc2_origin = fc2._w.numpy()
fc2._w.stop_gradient = True
out2.backward()
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
optimizer = fluid.optimizer.SGD(
learning_rate=0.003,
parameter_list=(fc.parameters() + fc2.parameters()))
optimizer.minimize(out2)
self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
self.assertFalse(np.array_equal(fc_origin, fc._w.numpy()))
@ -279,7 +281,9 @@ class TestImperativeAutoPrune(unittest.TestCase):
fc2_origin = fc2._w.numpy()
out2.stop_gradient = True
out2.backward()
optimizer = fluid.optimizer.SGD(learning_rate=0.003)
optimizer = fluid.optimizer.SGD(
learning_rate=0.003,
parameter_list=(fc.parameters() + fc2.parameters()))
optimizer.minimize(out2)
self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
self.assertTrue(np.array_equal(fc_origin, fc._w.numpy()))
@ -320,7 +324,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = MyLayer("mylayer", vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)
@ -338,7 +343,8 @@ class TestImperativeAutoPrune(unittest.TestCase):
with fluid.dygraph.guard(place):
model = MyLayer2("mylayer", vocab_size, size)
optimizer = fluid.optimizer.AdamOptimizer(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters())
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(0.001)
indices = fluid.dygraph.to_variable(indices)

@ -58,7 +58,7 @@ class TestDygraphDebugString(unittest.TestCase):
out.backward()
mlp.clear_gradients()
unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg(
is_test=True)
mlp.parameters(), is_test=True)
if i > 0:
self.assertGreaterEqual(unique_name, unique_name_tmp)
self.assertGreaterEqual(trace_var, trace_var_tmp)
@ -68,7 +68,7 @@ class TestDygraphDebugString(unittest.TestCase):
trace_var = trace_var_tmp
alive_var = alive_var_tmp
try:
fluid.dygraph.base._print_debug_msg()
fluid.dygraph.base._print_debug_msg(mlp.parameters())
except Exception as e:
raise RuntimeError(
"No Exception is accepted in _print_debug_msg, but we got: {}".

@ -23,6 +23,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
from test_imperative_base import new_program_scope
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph import Linear
# Can use Amusic dataset as the DeepCF describes.
DATA_PATH = os.environ.get('DATA_PATH', '')
@ -33,10 +34,10 @@ NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
class DMF(fluid.Layer):
def __init__(self, name_scope):
super(DMF, self).__init__(name_scope)
self._user_latent = fluid.FC(self.full_name(), 256)
self._item_latent = fluid.FC(self.full_name(), 256)
def __init__(self):
super(DMF, self).__init__()
self._user_latent = Linear(1000, 256)
self._item_latent = Linear(100, 256)
self._user_layers = []
self._item_layers = []
@ -45,11 +46,17 @@ class DMF(fluid.Layer):
self._user_layers.append(
self.add_sublayer(
'user_layer_%d' % i,
fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
Linear(
256 if i == 0 else self._hid_sizes[i - 1],
self._hid_sizes[i],
act='relu')))
self._item_layers.append(
self.add_sublayer(
'item_layer_%d' % i,
fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
Linear(
256 if i == 0 else self._hid_sizes[i - 1],
self._hid_sizes[i],
act='relu')))
def forward(self, users, items):
users = self._user_latent(users)
@ -62,17 +69,20 @@ class DMF(fluid.Layer):
class MLP(fluid.Layer):
def __init__(self, name_scope):
super(MLP, self).__init__(name_scope)
self._user_latent = fluid.FC(self.full_name(), 256)
self._item_latent = fluid.FC(self.full_name(), 256)
def __init__(self):
super(MLP, self).__init__()
self._user_latent = Linear(1000, 256)
self._item_latent = Linear(100, 256)
self._match_layers = []
self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)):
self._match_layers.append(
self.add_sublayer(
'match_layer_%d' % i,
fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
Linear(
256 * 2 if i == 0 else self._hid_sizes[i - 1],
self._hid_sizes[i],
act='relu')))
def forward(self, users, items):
users = self._user_latent(users)
@ -85,8 +95,8 @@ class MLP(fluid.Layer):
class DeepCF(fluid.Layer):
def __init__(self, name_scope, num_users, num_items, matrix):
super(DeepCF, self).__init__(name_scope)
def __init__(self, num_users, num_items, matrix):
super(DeepCF, self).__init__()
self._num_users = num_users
self._num_items = num_items
self._rating_matrix = self.create_parameter(
@ -97,9 +107,9 @@ class DeepCF(fluid.Layer):
default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
self._rating_matrix.stop_gradient = True
self._mlp = MLP(self.full_name())
self._dmf = DMF(self.full_name())
self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')
self._mlp = MLP()
self._dmf = DMF()
self._match_fc = Linear(128, 1, act='sigmoid')
def forward(self, users, items):
# users_emb = self._user_emb(users)
@ -208,7 +218,7 @@ class TestDygraphDeepCF(unittest.TestCase):
items = fluid.layers.data('items', [1], dtype='int32')
labels = fluid.layers.data('labels', [1], dtype='float32')
deepcf = DeepCF('deepcf', num_users, num_items, matrix)
deepcf = DeepCF(num_users, num_items, matrix)
prediction = deepcf(users, items)
loss = fluid.layers.reduce_sum(
fluid.layers.log_loss(prediction, labels))
@ -237,8 +247,9 @@ class TestDygraphDeepCF(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf = DeepCF('deepcf', num_users, num_items, matrix)
adam = fluid.optimizer.AdamOptimizer(0.01)
deepcf = DeepCF(num_users, num_items, matrix)
adam = fluid.optimizer.AdamOptimizer(
0.01, parameter_list=deepcf.parameters())
for e in range(NUM_EPOCHES):
sys.stderr.write('epoch %d\n' % e)
for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
@ -261,8 +272,9 @@ class TestDygraphDeepCF(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
deepcf2 = DeepCF('deepcf', num_users, num_items, matrix)
adam2 = fluid.optimizer.AdamOptimizer(0.01)
deepcf2 = DeepCF(num_users, num_items, matrix)
adam2 = fluid.optimizer.AdamOptimizer(
0.01, parameter_list=deepcf2.parameters())
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
for e in range(NUM_EPOCHES):

@ -22,33 +22,35 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid import Conv2D, Pool2D, FC
from paddle.fluid import Conv2D, Pool2D, Linear
from test_imperative_base import new_program_scope
from paddle.fluid.dygraph.base import to_variable
class Discriminator(fluid.Layer):
def __init__(self, name_scope):
super(Discriminator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=32, act='elu')
self._fc2 = FC(self.full_name(), size=1)
def __init__(self):
super(Discriminator, self).__init__()
self._fc1 = Linear(1, 32, act='elu')
self._fc2 = Linear(32, 1)
def forward(self, inputs):
x = self._fc1(inputs)
return self._fc2(x)
x = self._fc2(x)
return x
class Generator(fluid.Layer):
def __init__(self, name_scope):
super(Generator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=64, act='elu')
self._fc2 = FC(self.full_name(), size=64, act='elu')
self._fc3 = FC(self.full_name(), size=1)
def __init__(self):
super(Generator, self).__init__()
self._fc1 = Linear(2, 64, act='elu')
self._fc2 = Linear(64, 64, act='elu')
self._fc3 = Linear(64, 1)
def forward(self, inputs):
x = self._fc1(inputs)
x = self._fc2(x)
return self._fc3(x)
x = self._fc3(x)
return x
class TestDygraphGAN(unittest.TestCase):
@ -65,8 +67,8 @@ class TestDygraphGAN(unittest.TestCase):
scope = fluid.core.Scope()
with new_program_scope(
main=discriminate_p, startup=startup, scope=scope):
discriminator = Discriminator("d")
generator = Generator("g")
discriminator = Discriminator()
generator = Generator()
img = fluid.layers.data(
name="img", shape=[2, 1], append_batch_size=False)
@ -93,8 +95,8 @@ class TestDygraphGAN(unittest.TestCase):
sgd.minimize(d_loss)
with new_program_scope(main=generate_p, startup=startup, scope=scope):
discriminator = Discriminator("d")
generator = Generator("g")
discriminator = Discriminator()
generator = Generator()
noise = fluid.layers.data(
name="noise", shape=[2, 2], append_batch_size=False)
@ -134,9 +136,12 @@ class TestDygraphGAN(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
discriminator = Discriminator("d")
generator = Generator("g")
sgd = SGDOptimizer(learning_rate=1e-3)
discriminator = Discriminator()
generator = Generator()
sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=(
discriminator.parameters() + generator.parameters()))
d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
d_loss_real = fluid.layers.reduce_mean(
@ -177,9 +182,12 @@ class TestDygraphGAN(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
discriminator2 = Discriminator("d")
generator2 = Generator("g")
sgd2 = SGDOptimizer(learning_rate=1e-3)
discriminator2 = Discriminator()
generator2 = Generator()
sgd2 = SGDOptimizer(
learning_rate=1e-3,
parameter_list=(
discriminator2.parameters() + generator2.parameters()))
d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
d_loss_real2 = fluid.layers.reduce_mean(
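When one optimizer updates several layers, as in the GAN test above, the parameter lists are simply concatenated, since parameters() returns a plain Python list. A hedged sketch in which the two Linear layers stand in for the real discriminator and generator:

import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    discriminator = Linear(1, 32)
    generator = Linear(2, 64)
    sgd = SGDOptimizer(
        learning_rate=1e-3,
        parameter_list=(discriminator.parameters() + generator.parameters()))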

@ -131,7 +131,8 @@ class TestDygraphGNN(unittest.TestCase):
to_variable(labels))
loss = fluid.layers.reduce_sum(loss)
loss.backward()
adam = AdamOptimizer(learning_rate=1e-3)
adam = AdamOptimizer(
learning_rate=1e-3, parameter_list=model.parameters())
adam.minimize(loss)
model.clear_gradients()
@ -156,7 +157,8 @@ class TestDygraphGNN(unittest.TestCase):
logits2, to_variable(labels2))
loss2 = fluid.layers.reduce_sum(loss2)
loss2.backward()
adam2 = AdamOptimizer(learning_rate=1e-3)
adam2 = AdamOptimizer(
learning_rate=1e-3, parameter_list=model2.parameters())
adam2.minimize(loss2)
model2.clear_gradients()
loss2_value = loss2.numpy()

@ -105,7 +105,9 @@ class TestDygraphSimpleNet(unittest.TestCase):
is_sparse=is_sparse,
dtype=dtype)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3,
parameter_list=simple_net.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None

@ -23,7 +23,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
@ -77,8 +77,8 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
def __init__(self):
super(MNIST, self).__init__()
self._simple_img_conv_pool_1 = SimpleImgConvPool(
1, 20, 5, 2, 2, act="relu")
@ -86,19 +86,21 @@ class MNIST(fluid.dygraph.Layer):
self._simple_img_conv_pool_2 = SimpleImgConvPool(
20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
self.pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(
self.pool_2_shape,
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
x = self._fc(x)
return x
@ -125,8 +127,9 @@ class TestImperativeMnist(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
mnist = MNIST()
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
@ -189,7 +192,7 @@ class TestImperativeMnist(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(),

@ -39,8 +39,9 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
mnist2 = MNIST("mnist")
sgd2 = SGDOptimizer(learning_rate=1e-3)
mnist2 = MNIST()
sgd2 = SGDOptimizer(
learning_rate=1e-3, parameter_list=mnist2.parameters())
train_reader2 = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
@ -85,7 +86,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
mnist = MNIST()
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

@ -18,7 +18,7 @@ import numpy as np
import six
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
@ -27,6 +27,8 @@ class Config(object):
'''
config for training
'''
# encoder rnn hidden_size
encoder_size = 200
# decoder size for decoder stage
decoder_size = 128
# size for word embedding
@ -118,8 +120,8 @@ class ConvBNPool(fluid.dygraph.Layer):
class OCRConv(fluid.dygraph.Layer):
def __init__(self, name_scope, is_test=False, use_cudnn=True):
super(OCRConv, self).__init__(name_scope)
def __init__(self, is_test=False, use_cudnn=True):
super(OCRConv, self).__init__()
self.conv_bn_pool_1 = ConvBNPool(
2, [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn)
self.conv_bn_pool_2 = ConvBNPool(
@ -143,7 +145,6 @@ class OCRConv(fluid.dygraph.Layer):
class DynamicGRU(fluid.dygraph.Layer):
def __init__(self,
scope_name,
size,
param_attr=None,
bias_attr=None,
@ -152,7 +153,7 @@ class DynamicGRU(fluid.dygraph.Layer):
candidate_activation='tanh',
h_0=None,
origin_mode=False):
super(DynamicGRU, self).__init__(scope_name)
super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit(
size * 3,
@ -164,6 +165,7 @@ class DynamicGRU(fluid.dygraph.Layer):
self.h_0 = h_0
self.is_reverse = is_reverse
self.size = size
def forward(self, inputs):
hidden = self.h_0
@ -188,11 +190,10 @@ class DynamicGRU(fluid.dygraph.Layer):
class EncoderNet(fluid.dygraph.Layer):
def __init__(self,
scope_name,
rnn_hidden_size=200,
rnn_hidden_size=Config.encoder_size,
is_test=False,
use_cudnn=True):
super(EncoderNet, self).__init__(scope_name)
super(EncoderNet, self).__init__()
self.rnn_hidden_size = rnn_hidden_size
para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
0.02))
@ -207,28 +208,19 @@ class EncoderNet(fluid.dygraph.Layer):
shape=[Config.batch_size, rnn_hidden_size],
dtype='float32',
value=0)
self.ocr_convs = OCRConv(
self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
self.fc_1_layer = FC(self.full_name(),
rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=False,
num_flatten_dims=2)
self.fc_2_layer = FC(self.full_name(),
rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=False,
num_flatten_dims=2)
self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)
self.fc_1_layer = Linear(
768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
self.fc_2_layer = Linear(
768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False)
self.gru_forward_layer = DynamicGRU(
self.full_name(),
size=rnn_hidden_size,
h_0=h_0,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu')
self.gru_backward_layer = DynamicGRU(
self.full_name(),
size=rnn_hidden_size,
h_0=h_0,
param_attr=para_attr,
@ -236,10 +228,8 @@ class EncoderNet(fluid.dygraph.Layer):
candidate_activation='relu',
is_reverse=True)
self.encoded_proj_fc = FC(self.full_name(),
Config.decoder_size,
bias_attr=False,
num_flatten_dims=2)
self.encoded_proj_fc = Linear(
rnn_hidden_size * 2, Config.decoder_size, bias_attr=False)
def forward(self, inputs):
conv_features = self.ocr_convs(inputs)
@ -272,18 +262,12 @@ class EncoderNet(fluid.dygraph.Layer):
class SimpleAttention(fluid.dygraph.Layer):
def __init__(self, scope_name, decoder_size):
super(SimpleAttention, self).__init__(scope_name)
self.fc_1 = FC(self.full_name(),
decoder_size,
act=None,
bias_attr=False)
self.fc_2 = FC(self.full_name(),
1,
num_flatten_dims=2,
act=None,
bias_attr=False)
def __init__(self, decoder_size):
super(SimpleAttention, self).__init__()
self.fc_1 = Linear(
decoder_size, decoder_size, act=None, bias_attr=False)
self.fc_2 = Linear(decoder_size, 1, act=None, bias_attr=False)
def forward(self, encoder_vec, encoder_proj, decoder_state):
@ -311,22 +295,18 @@ class SimpleAttention(fluid.dygraph.Layer):
class GRUDecoderWithAttention(fluid.dygraph.Layer):
def __init__(self, scope_name, decoder_size, num_classes):
super(GRUDecoderWithAttention, self).__init__(scope_name)
self.simple_attention = SimpleAttention(self.full_name(), decoder_size)
self.fc_1_layer = FC(self.full_name(),
size=decoder_size * 3,
bias_attr=False)
self.fc_2_layer = FC(self.full_name(),
size=decoder_size * 3,
bias_attr=False)
def __init__(self, decoder_size, num_classes):
super(GRUDecoderWithAttention, self).__init__()
self.simple_attention = SimpleAttention(decoder_size)
self.fc_1_layer = Linear(
Config.encoder_size * 2, decoder_size * 3, bias_attr=False)
self.fc_2_layer = Linear(
decoder_size, decoder_size * 3, bias_attr=False)
self.gru_unit = GRUUnit(
size=decoder_size * 3, param_attr=None, bias_attr=None)
self.out_layer = FC(self.full_name(),
size=num_classes + 2,
bias_attr=None,
act='softmax')
self.out_layer = Linear(
decoder_size, num_classes + 2, bias_attr=None, act='softmax')
self.decoder_size = decoder_size
@ -357,17 +337,18 @@ class GRUDecoderWithAttention(fluid.dygraph.Layer):
class OCRAttention(fluid.dygraph.Layer):
def __init__(self, scope_name):
super(OCRAttention, self).__init__(scope_name)
self.encoder_net = EncoderNet(self.full_name())
self.fc = FC(self.full_name(),
size=Config.decoder_size,
bias_attr=False,
act='relu')
def __init__(self):
super(OCRAttention, self).__init__()
self.encoder_net = EncoderNet()
self.fc = Linear(
Config.encoder_size,
Config.decoder_size,
bias_attr=False,
act='relu')
self.embedding = Embedding(
[Config.num_classes + 2, Config.word_vector_dim], dtype='float32')
self.gru_decoder_with_attention = GRUDecoderWithAttention(
self.full_name(), Config.decoder_size, Config.num_classes)
Config.decoder_size, Config.num_classes)
def forward(self, inputs, label_in):
gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
@ -425,14 +406,15 @@ class TestDygraphOCRAttention(unittest.TestCase):
fluid.default_main_program().random_seed = seed
backward_strategy = fluid.dygraph.BackwardStrategy()
backward_strategy.sort_sum_gradient = True
ocr_attention = OCRAttention("ocr_attention")
ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay(
[50000], [Config.LR, Config.LR * 0.01])
else:
learning_rate = Config.LR
optimizer = fluid.optimizer.SGD(learning_rate=0.001)
optimizer = fluid.optimizer.SGD(
learning_rate=0.001, parameter_list=ocr_attention.parameters())
dy_param_init_value = {}
for param in ocr_attention.parameters():
dy_param_init_value[param.name] = param.numpy()
@ -478,7 +460,7 @@ class TestDygraphOCRAttention(unittest.TestCase):
# print("static start")
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
ocr_attention = OCRAttention("ocr_attention")
ocr_attention = OCRAttention()
if Config.learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay(

@ -23,17 +23,17 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.dygraph.nn import FC
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
class MLP(fluid.Layer):
def __init__(self, name_scope, param_attr=None, bias_attr=None):
super(MLP, self).__init__(name_scope)
def __init__(self, param_attr=None, bias_attr=None):
super(MLP, self).__init__()
self._fc1 = FC(self.full_name(), 10)
self._fc2 = FC(self.full_name(), 10)
self._fc1 = Linear(784, 10)
self._fc2 = Linear(10, 10)
def forward(self, inputs):
y = self._fc1(inputs)
@ -45,13 +45,16 @@ class TestImperativeOptimizerBase(unittest.TestCase):
def setUp(self):
self.batch_num = 20
def get_optimizer_dygraph(self, parameter_list):
raise NotImplementedError()
def get_optimizer(self):
raise NotImplementedError()
def reader_decorator(self, reader):
def _reader_imple():
for item in reader():
image = np.array(item[0]).reshape(1, 28, 28)
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label
@ -65,8 +68,9 @@ class TestImperativeOptimizerBase(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mlp = MLP('mlp')
optimizer = self.get_optimizer()
mlp = MLP()
optimizer = self.get_optimizer_dygraph(
parameter_list=mlp.parameters())
batch_py_reader = fluid.io.PyReader(capacity=1)
batch_py_reader.decorate_sample_list_generator(
@ -85,6 +89,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
label = data[1]
label.stop_gradient = True
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
dy_out = avg_loss.numpy()
@ -107,7 +112,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mlp = MLP('mlp')
mlp = MLP()
optimizer = self.get_optimizer()
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
@ -115,6 +120,7 @@ class TestImperativeOptimizerBase(unittest.TestCase):
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
img = fluid.layers.reshape(img, shape=[batch_size, -1])
cost = mlp(img)
avg_loss = fluid.layers.reduce_mean(cost)
optimizer.minimize(avg_loss)
@ -162,6 +168,15 @@ class TestImperativeOptimizerBase(unittest.TestCase):
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
bd = [3, 6, 9]
optimizer = SGDOptimizer(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd,
values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
@ -173,6 +188,16 @@ class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
@ -186,6 +211,16 @@ class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
@ -199,6 +234,16 @@ class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = Adam(
learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
@ -212,6 +257,13 @@ class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle))
@ -227,6 +279,13 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120))
@ -237,6 +296,13 @@ class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer_dygraph(self, parameter_list):
optimizer = SGDOptimizer(
learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000),
parameter_list=parameter_list)
return optimizer
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000))

@ -38,7 +38,8 @@ class TestImperativePartitialBackward(unittest.TestCase):
for param in fc2.parameters():
self.assertIsNone(param._grad_ivar())
optimizer = fluid.optimizer.AdamOptimizer()
optimizer = fluid.optimizer.AdamOptimizer(parameter_list=(
fc1.parameters() + fc2.parameters()))
_, params_grads = optimizer.minimize(loss)
self.assertListEqual(

@ -30,13 +30,12 @@ from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
class SimpleLSTMRNN(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleLSTMRNN, self).__init__(name_scope)
super(SimpleLSTMRNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
@ -45,8 +44,9 @@ class SimpleLSTMRNN(fluid.Layer):
self._num_steps = num_steps
self.cell_array = []
self.hidden_array = []
self._create_parameter()
def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
def _create_parameter(self):
self.weight_1_arr = []
self.weight_2_arr = []
self.bias_arr = []
@ -135,7 +135,6 @@ class SimpleLSTMRNN(fluid.Layer):
class PtbModel(fluid.Layer):
def __init__(self,
name_scope,
hidden_size,
vocab_size,
num_layers=2,
@ -143,7 +142,7 @@ class PtbModel(fluid.Layer):
init_scale=0.1,
is_sparse=False,
dropout=None):
super(PtbModel, self).__init__(name_scope)
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
@ -151,7 +150,6 @@ class PtbModel(fluid.Layer):
self.num_steps = num_steps
self.dropout = dropout
self.simple_lstm_rnn = SimpleLSTMRNN(
self.full_name(),
hidden_size,
num_steps,
num_layers=num_layers,
@ -231,7 +229,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
@ -239,7 +236,8 @@ class TestDygraphPtbRnn(unittest.TestCase):
init_scale=init_scale,
is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
@ -298,7 +296,6 @@ class TestDygraphPtbRnn(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,

@ -49,7 +49,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
backward_strategy.sort_sum_gradient = True
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
@ -57,7 +56,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
init_scale=init_scale,
is_sparse=is_sparse)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=ptb_model.parameters())
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
@ -97,7 +97,6 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
ptb_model = PtbModel(
"ptb_model",
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,

@ -86,7 +86,8 @@ class TestImperativeMnist(unittest.TestCase):
loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
loss = fluid.layers.reduce_sum(loss_probs)
sgd = SGDOptimizer(learning_rate=1e-3)
sgd = SGDOptimizer(
learning_rate=1e-3, parameter_list=policy.parameters())
dy_param_init_value = {}

@ -21,7 +21,7 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from utils import DyGraphProgramDescTracerTestHelper, is_equal_program
@ -44,7 +44,7 @@ train_parameters = {
}
def optimizer_setting(params):
def optimizer_setting(params, parameter_list=None):
ls = params["learning_strategy"]
if ls["name"] == "piecewise_decay":
if "total_images" not in params:
@ -58,14 +58,18 @@ def optimizer_setting(params):
base_lr = params["lr"]
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
if fluid.in_dygraph_mode():
optimizer = fluid.optimizer.SGD(learning_rate=0.01,
parameter_list=parameter_list)
else:
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
# TODO(minqiyang): Add learning rate scheduler support to dygraph mode
# optimizer = fluid.optimizer.Momentum(
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
# learning_rate=params["lr"],
# learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
# momentum=0.9,
# regularization=fluid.regularizer.L2Decay(1e-4))
return optimizer
@ -147,8 +151,8 @@ class BottleneckBlock(fluid.Layer):
class ResNet(fluid.Layer):
def __init__(self, name_scope, layers=50, class_dim=102):
super(ResNet, self).__init__(name_scope)
def __init__(self, layers=50, class_dim=102):
super(ResNet, self).__init__()
self.layers = layers
supported_layers = [50, 101, 152]
@ -187,14 +191,17 @@ class ResNet(fluid.Layer):
self.pool2d_avg = Pool2D(
pool_size=7, pool_type='avg', global_pooling=True)
self.pool2d_avg_output = num_filters[-1] * 4 * 1 * 1
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = FC(self.full_name(),
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.out = Linear(
self.pool2d_avg_output,
class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
y = self.conv(inputs)
@ -202,6 +209,7 @@ class ResNet(fluid.Layer):
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y
@ -228,8 +236,9 @@ class TestDygraphResnet(unittest.TestCase):
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
resnet = ResNet("resnet")
optimizer = optimizer_setting(train_parameters)
resnet = ResNet()
optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
np.random.seed(seed)
import random
random.seed = seed
@ -315,7 +324,7 @@ class TestDygraphResnet(unittest.TestCase):
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
resnet = ResNet("resnet")
resnet = ResNet()
optimizer = optimizer_setting(train_parameters)
np.random.seed(seed)

Some files were not shown because too many files have changed in this diff.
