Optimizer use init program (#5275)

* optimizer uses init_program

* create persistable variable

* add create_persistable_var to Block

* optimizer uses create_persistable_var

* fix prefix

* move create_global_persistable_var from Block to LayerHelper

* polish Optimizer initialization code

* use the LayerHelper to create initialization operators and variables

* add_accumulator should use an independent data type

* default to the param data type for the accumulator

* fix typo
Author: Qiao Longfei (committed via GitHub)
commit f48159ade0
parent 90f4d5e904
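The user-visible change, exercised by the updated tests below, is that an optimizer's minimize() now also receives the init_program, so the operators that initialize the learning rate and the accumulators are appended there instead of to the training program. A minimal sketch of the intended call pattern (names are taken from the test diffs; the network-building code is elided):

    import paddle.v2.framework.framework as framework
    import paddle.v2.framework.optimizer as optimizer

    init_program = framework.Program()  # receives fill_constant / initializer ops
    program = framework.Program()       # receives forward, backward and optimize ops

    # ... build the network and avg_cost inside `program` ...

    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
    opts = sgd_optimizer.minimize(avg_cost, init_program)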

@@ -7,6 +7,11 @@ import copy
 __all__ = ['Block', 'Variable', 'Program', 'Operator']
 
 
+def unique_name(prefix):
+    uid = core.unique_integer(prefix)  # unique during whole process.
+    return "_".join([prefix, str(uid)])
+
+
 class Variable(object):
     def __init__(self,
                  block,
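For reference, the relocated unique_name helper just appends a process-wide counter to the given prefix, so repeated calls with the same prefix always produce distinct names (the exact integers depend on what has already been created in the process):

    from paddle.v2.framework.framework import unique_name

    name_a = unique_name("fc")  # e.g. "fc_0"
    name_b = unique_name("fc")  # e.g. "fc_1", guaranteed different from name_a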

@ -1,19 +1,12 @@
import copy import copy
import itertools import itertools
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import Variable, g_program, \ from paddle.v2.framework.framework import Variable, g_program, \
g_init_program g_init_program, unique_name, Program
from paddle.v2.framework.initializer import ConstantInitializer, \ from paddle.v2.framework.initializer import ConstantInitializer, \
UniformInitializer UniformInitializer
def unique_name(prefix):
uid = core.unique_integer(prefix) # unique during whole process.
return "_".join([prefix, str(uid)])
class LayerHelper(object): class LayerHelper(object):
def __init__(self, layer_type, **kwargs): def __init__(self, layer_type, **kwargs):
self.kwargs = kwargs self.kwargs = kwargs
@@ -138,9 +131,19 @@ class LayerHelper(object):
     def create_variable(self, *args, **kwargs):
         return self.program.current_block().create_var(*args, **kwargs)
 
-    def create_global_variable(self, *args, **kwargs):
+    def create_global_variable(self, persistable=False, *args, **kwargs):
         return self.program.global_block().create_var(
-            *args, persistable=False, **kwargs)
+            *args, persistable=persistable, **kwargs)
+
+    def set_variable_initializer(self, var, initializer):
+        assert isinstance(var, Variable)
+        self.init_program.global_block().create_var(
+            name=var.name,
+            type=var.type,
+            dtype=var.data_type,
+            shape=var.shape,
+            persistable=True,
+            initializer=initializer)
 
     def append_bias_op(self, input_var, num_flatten_dims=None):
         """

(File diff suppressed because it is too large.)
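The suppressed diff is presumably the optimizer module itself. A hedged sketch of how an optimizer can use the new LayerHelper machinery to put its learning-rate initialization into init_program; the helper reference, attribute names and the ConstantInitializer usage are assumptions inferred from the tests below, which expect fill_constant ops whose value equals the learning rate:

    from paddle.v2.framework.initializer import ConstantInitializer

    # Sketch (hypothetical code inside an optimizer): create the learning-rate
    # variable in the training program's global block, then register its
    # initializer so that a fill_constant op ends up in init_program.
    lr_var = self.helper.create_global_variable(
        name=unique_name("learning_rate"),
        dtype="float32",
        shape=[1],
        persistable=True)
    self.helper.set_variable_initializer(
        var=lr_var, initializer=ConstantInitializer(self._learning_rate))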

@@ -36,7 +36,7 @@ cost = layers.square_error_cost(
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+opts = sgd_optimizer.minimize(avg_cost, init_program)
 
 BATCH_SIZE = 20
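For context, the book tests run the two programs with the same Executor: init_program is executed once to fill parameters and optimizer state, then program is executed per mini-batch. Roughly (a sketch of the surrounding test code, which is not part of this hunk; feed names follow the other book tests in this PR):

    place = core.CPUPlace()
    exe = executor.Executor(place)

    exe.run(init_program, feed={}, fetch_list=[])  # one-time initialization
    for pass_id in range(PASS_NUM):
        outs = exe.run(program,
                       feed={'x': tensor_x, 'y': tensor_y},
                       fetch_list=[avg_cost])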

@@ -208,7 +208,7 @@ cost = layers.cross_entropy(
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+opts = sgd_optimizer.minimize(avg_cost, init_program)
 
 BATCH_SIZE = 128
 PASS_NUM = 1

@@ -44,7 +44,7 @@ class TestBook(unittest.TestCase):
             x=cost, program=program, init_program=init_program)
 
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-        opts = sgd_optimizer.minimize(avg_cost)
+        opts = sgd_optimizer.minimize(avg_cost, init_program)
 
         place = core.CPUPlace()
         exe = executor.Executor(place)

@@ -7,6 +7,7 @@ from paddle.v2.framework.backward import append_backward_ops
 
 class TestOptimizer(unittest.TestCase):
     def test_sgd_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -22,12 +23,13 @@ class TestOptimizer(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-        opts = sgd_optimizer.minimize(mul_out)
+        opts = sgd_optimizer.minimize(mul_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
 
     def test_sgd_optimizer_with_global_step(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -44,15 +46,22 @@ class TestOptimizer(unittest.TestCase):
             attrs={"x_num_col_dims": 1})
         global_step = block.create_var(
             dtype="float32", shape=[1], lod_level=0, name="step")
+        learning_rate = 0.01
         sgd_optimizer = optimizer.SGDOptimizer(
-            learning_rate=0.01, global_step=global_step)
-        opts = sgd_optimizer.minimize(mul_out)
+            learning_rate=learning_rate, global_step=global_step)
+        opts = sgd_optimizer.minimize(mul_out, init_program)
         self.assertEqual(len(opts), 2)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
         increment_op = opts[1]
         self.assertEqual(increment_op.type, "increment")
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 1)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
 
 class TestMomentumOptimizer(unittest.TestCase):
     class MockMomentum(optimizer.MomentumOptimizer):
@@ -63,6 +72,7 @@ class TestMomentumOptimizer(unittest.TestCase):
             return self._velocity_acc_str
 
     def test_vanilla_momentum_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -77,12 +87,14 @@ class TestMomentumOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
+        learning_rate = 0.01
+        momentum_optimizer = self.MockMomentum(
+            learning_rate=learning_rate, momentum=0.2)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(params_grads,
-                                                           mul_out)
+        opts = momentum_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "momentum")
@@ -96,7 +108,16 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(velocity_acc), 1)
         self.assertTrue(mul_x.name in velocity_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
     def test_nesterov_momentum_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -111,13 +132,14 @@ class TestMomentumOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
-            learning_rate=0.01, momentum=0.2, use_nesterov=True)
+            learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(params_grads,
-                                                           mul_out)
+        opts = momentum_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "momentum")
@@ -131,6 +153,14 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(velocity_acc), 1)
         self.assertTrue(mul_x.name in velocity_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
 
 class TestAdagradOptimizer(unittest.TestCase):
     class MockAdagrad(optimizer.AdagradOptimizer):
@@ -141,6 +171,7 @@ class TestAdagradOptimizer(unittest.TestCase):
             return self._moment_acc_str
 
     def test_adagrad_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -155,11 +186,14 @@ class TestAdagradOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        adagrad_optimizer = self.MockAdagrad(learning_rate=0.01, epsilon=1.0e-6)
+        learning_rate = 0.01
+        adagrad_optimizer = self.MockAdagrad(
+            learning_rate=learning_rate, epsilon=1.0e-6)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out)
+        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                          init_program)
         self.assertEqual(len(opts), 1)
         adagrad_op = opts[0]
         self.assertEqual(adagrad_op.type, "adagrad")
@@ -172,6 +206,14 @@ class TestAdagradOptimizer(unittest.TestCase):
         self.assertEqual(len(moment_acc), 1)
         self.assertTrue(mul_x.name in moment_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
 
 class TestAdamOptimizer(unittest.TestCase):
     class MockAdam(optimizer.AdamOptimizer):
@@ -185,6 +227,7 @@ class TestAdamOptimizer(unittest.TestCase):
             return self._moment2_acc_str
 
     def test_adam_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -199,12 +242,14 @@ class TestAdamOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
         adam_optimizer = self.MockAdam(
-            learning_rate=0.01, beta1=0.9, beta2=0.999)
+            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out)
+        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
         self.assertEqual(len(opts), 3)
         adam_op = opts[0]
         self.assertEqual(adam_op.type, "adam")
@@ -221,6 +266,12 @@ class TestAdamOptimizer(unittest.TestCase):
         self.assertTrue(mul_x.name in moment1_acc)
         self.assertTrue(mul_x.name in moment2_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 5)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
 
 class TestAdamaxOptimizer(unittest.TestCase):
     class MockAdamax(optimizer.AdamaxOptimizer):
@@ -234,6 +285,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
             return self._inf_norm_acc_str
 
     def test_adamax_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -248,12 +300,14 @@ class TestAdamaxOptimizer(unittest.TestCase):
                     "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
-            learning_rate=0.01, beta1=0.9, beta2=0.999)
+            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
         params_grads = append_backward_ops(mul_out)
        self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out)
+        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                         init_program)
         self.assertEqual(len(opts), 2)
         adam_op = opts[0]
         self.assertEqual(adam_op.type, "adamax")
@@ -270,6 +324,12 @@ class TestAdamaxOptimizer(unittest.TestCase):
         self.assertTrue(mul_x.name in moment_acc)
         self.assertTrue(mul_x.name in inf_norm_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 4)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
 
 if __name__ == '__main__':
     unittest.main()
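Summing up what the assertions above expect in init_program: one fill_constant for the learning rate, plus one per accumulator-like state variable. A small reference sketch of the totals; only the totals are asserted by the tests, while the per-optimizer breakdown in the comments is an assumption based on the mock accessor names:

    # Number of init ops asserted per optimizer in test_optimizer.py.
    expected_init_op_count = {
        "sgd": 1,       # learning rate
        "momentum": 2,  # learning rate + velocity accumulator
        "adagrad": 2,   # learning rate + moment accumulator
        "adam": 5,      # learning rate + moment1 + moment2 + beta-power vars (assumed)
        "adamax": 4,    # learning rate + moment + inf_norm + beta-power var (assumed)
    }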

@@ -54,8 +54,10 @@ avg_cost = layers.mean(x=cost, program=program)
 accuracy = layers.accuracy(
     input=predict, label=label, program=program, init_program=init_program)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
+#                                         momentum=0.9)
+optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
+opts = optimizer.minimize(avg_cost, init_program)
 
 BATCH_SIZE = 50
 PASS_NUM = 3

@@ -58,8 +58,8 @@ cost = layers.cross_entropy(
     input=predict, label=label, program=program, init_program=init_program)
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+opts = optimizer.minimize(avg_cost, init_program)
 
 train_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -89,6 +89,7 @@ for pass_id in range(PASS_NUM):
                              'y': tensor_y},
                       fetch_list=[avg_cost])
        out = np.array(outs[0])
+
        if out[0] < 5.0:
            exit(0)  # if avg cost less than 5.0, we think our code is good.
 exit(1)

@@ -109,7 +109,7 @@ cost = layers.cross_entropy(
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+opts = sgd_optimizer.minimize(avg_cost, init_program)
 
 train_reader = paddle.batch(
     paddle.dataset.imikolov.train(word_dict, N), batch_size)
