Merge pull request #6297 from typhoonzero/simple_dist_train_api
[Done] API for dist train
commit
8d6db25167
@@ -0,0 +1,238 @@
import framework
from framework import Program, default_main_program, Parameter, Variable
import optimizer
from layer_helper import LayerHelper


def hash_name_to_server(params_grads, pserver_endpoints):
    """
    Split (param, grad) pairs among parameter servers by hashing the
    parameter name.

    :param params_grads: list of (parameter, gradient) pairs, the return
                         value of Optimizer.minimize
    :param pserver_endpoints: list of parameter server endpoints
    :return: a map of pserver endpoint ->
        params -> [param list]
        grads -> [grad list]
    """

    def _hash_param(param_name, total):
        return hash(param_name) % total

    param_grad_map = dict()
    for param, grad in params_grads:
        if param.trainable is True and grad is not None:
            server_id = _hash_param(param.name, len(pserver_endpoints))
            server_for_param = pserver_endpoints[server_id]
            if not param_grad_map.has_key(server_for_param):
                param_grad_map[server_for_param] = {"params": [], "grads": []}
            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(grad)

    return param_grad_map


def round_robin(params_grads, pserver_endpoints):
    """
    Split (param, grad) pairs among parameter servers in cyclic order.
    """
    assert (len(params_grads) > len(pserver_endpoints))

    param_grad_map = dict()
    pserver_idx = 0
    for param, grad in params_grads:
        if param.trainable is True:
            server_for_param = pserver_endpoints[pserver_idx]
            if not param_grad_map.has_key(server_for_param):
                param_grad_map[server_for_param] = {"params": [], "grads": []}

            param_grad_map[server_for_param]["params"].append(param)
            param_grad_map[server_for_param]["grads"].append(grad)

            pserver_idx += 1
            if pserver_idx >= len(pserver_endpoints):
                pserver_idx = 0
    return param_grad_map
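
The two placement policies above decide which parameter server owns each trainable parameter: hash_name_to_server buckets by hash(param.name) % len(pserver_endpoints), while round_robin cycles through the endpoints in order. Below is a minimal, self-contained sketch of the round-robin placement; it is not part of this diff, and the stand-in Param tuple, parameter names, and endpoint addresses are hypothetical.

import collections

# Hypothetical stand-ins for framework Parameter objects; only the
# attributes the split functions look at are modeled here.
Param = collections.namedtuple("Param", ["name", "trainable"])

params_grads = [(Param("conv2d_0.w_0", True), "conv2d_0.w_0@GRAD"),
                (Param("conv2d_0.b_0", True), "conv2d_0.b_0@GRAD"),
                (Param("fc_0.w_0", True), "fc_0.w_0@GRAD"),
                (Param("fc_0.b_0", True), "fc_0.b_0@GRAD")]
pserver_endpoints = ["192.168.0.1:6174", "192.168.0.2:6174"]

# Mirror of the round_robin logic above: walk the (param, grad) pairs and
# assign them to endpoints in cyclic order.
param_grad_map = {}
pserver_idx = 0
for param, grad in params_grads:
    if param.trainable:
        ep = pserver_endpoints[pserver_idx]
        param_grad_map.setdefault(ep, {"params": [], "grads": []})
        param_grad_map[ep]["params"].append(param.name)
        param_grad_map[ep]["grads"].append(grad)
        pserver_idx = (pserver_idx + 1) % len(pserver_endpoints)

# Expected placement: the two weights land on the first endpoint, the two
# biases on the second.
for ep in pserver_endpoints:
    print("%s -> %s" % (ep, param_grad_map[ep]["params"]))

The gradients assigned to each endpoint are exactly the tensors that the send op added by the transpiler (see _optimize_distributed below) will ship to that endpoint.
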
class DistributeTranspiler:
    def transpile(self,
                  optimize_ops,
                  params_grads,
                  program=None,
                  pservers="127.0.0.1:6174",
                  trainers=1,
                  split_method=round_robin):
        """
        Transpile the program into distributed data-parallel programs.

        The main_program will be transformed to use a remote parameter
        server to do parameter optimization, and the optimization graph
        will be put into a parameter server program.

        Use different methods to split trainable variables among the
        parameter servers.

        Example to run:

        exe = fluid.Executor(place)
        t = fluid.DistributeTranspiler()
        t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)

        pserver_endpoint = os.getenv("PSERVER")
        if pserver_endpoint:
            pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
            exe.run(fluid.default_startup_program())
            exe.run(pserver_prog)
        else:
            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
            exe.run(fluid.default_startup_program())

            for pass_id in range(PASS_NUM):
                ...

        :param optimize_ops: op list of optimization, should be the
                             return value of Optimizer.minimize
        :type optimize_ops: list
        :param params_grads: list of (parameter, gradient) pairs, also
                             returned by Optimizer.minimize
        :param program: program to optimize, default is default_main_program
        :param pservers: parameter server endpoints like "m1:6174,m2:6174"
        :type pservers: string
        :param trainers: total number of trainer processes
        :param split_method: function used to place parameters on pservers

        :return: None. The trainer program is modified in place; use
                 get_pserver_program() to build each parameter server program.
        """
        if program is None:
            program = default_main_program()
        self.trainers = trainers
        self._optimize_distributed(
            optimize_ops,
            program,
            params_grads,
            pservers=pservers,
            trainers=trainers,
            split_method=split_method)

    def _clone_param(self, block, v):
        assert isinstance(v, Parameter)
        new_p = Parameter(
            block=block,
            shape=v.shape,
            dtype=v.dtype,
            type=v.type,
            lod_level=v.lod_level,
            stop_gradient=v.stop_gradient,
            trainable=v.trainable,
            optimize_attr=v.optimize_attr,
            regularizer=v.regularizer,
            name=v.name)
        block.vars[new_p.name] = new_p

    def _clone_var(self, block, var):
        assert isinstance(var, Variable)
        return block.create_var(
            name=var.name,
            shape=var.shape,
            dtype=var.dtype,
            type=var.type,
            lod_level=var.lod_level,
            persistable=var.persistable)

    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
                              **kwargs):
        if kwargs.has_key("split_method"):
            split_method = kwargs["split_method"]
        else:
            split_method = round_robin

        assert (callable(split_method))
        pserver_endpoints = kwargs["pservers"].split(",")
        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)

        # Append a single send op that ships every gradient to the pserver
        # it was assigned to; epmap[i] is the endpoint for the i-th input.
        send_op_ordered_inputs = []
        epmap = []
        for ep, v in self.param_grad_map.iteritems():
            send_op_ordered_inputs.extend(v["grads"])
            for i in v["grads"]:
                epmap.append(ep)
        send_op = program.global_block().append_op(
            type="send",
            inputs={"X": send_op_ordered_inputs
                    },  # inputs is a list of tensors to be sent
            outputs={},
            attrs={"endpoints": pserver_endpoints,
                   "epmap": epmap})
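
_optimize_distributed appends one send op whose X input is the concatenation of every endpoint's gradient list, with a parallel epmap attribute: epmap[i] names the endpoint that should receive the i-th gradient. A short sketch of that flattening, not part of this diff, using hypothetical gradient names and endpoints:

# Hypothetical per-endpoint assignment, as produced by a split_method.
param_grad_map = {
    "192.168.0.1:6174": {"grads": ["conv2d_0.w_0@GRAD", "fc_0.w_0@GRAD"]},
    "192.168.0.2:6174": {"grads": ["conv2d_0.b_0@GRAD", "fc_0.b_0@GRAD"]},
}

# Same flattening as in _optimize_distributed: the i-th entry of epmap is
# the endpoint that receives the i-th tensor in send_op_ordered_inputs.
send_op_ordered_inputs = []
epmap = []
for ep, v in sorted(param_grad_map.items()):
    send_op_ordered_inputs.extend(v["grads"])
    epmap.extend([ep] * len(v["grads"]))

for grad, ep in zip(send_op_ordered_inputs, epmap):
    print("%s -> %s" % (grad, ep))
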
    def get_trainer_program(self, optimize_ops, program):
        # remove optimize ops and add a send op to main_program
        program.global_block().delete_ops(optimize_ops)

    def _create_var_for_trainers(self, block, var, trainers):
        # create one copy of var per trainer, named "<var>.trainer_<i>"
        var_list = []
        for i in xrange(trainers):
            var_each = block.create_var(
                name="%s.trainer_%d" % (var.name, i),
                persistable=var.persistable,
                dtype=var.dtype,
                shape=var.shape)
            var_list.append(var_each)
        return var_list

    def get_pserver_program(self, endpoint, optimize_ops):
        pserver_program = Program()
        for v in self.param_grad_map[endpoint]["params"]:
            self._clone_param(pserver_program.global_block(), v)

        optimize_sub_program = Program()
        grad_var_names = [
            var.name for var in self.param_grad_map[endpoint]["grads"]
        ]
        for opt_op in optimize_ops:
            for _, var in opt_op.inputs.iteritems():
                # NOTE: append operators to merge gradients from multiple
                # trainers. If trainers == 1, this is not needed.
                if self.trainers > 1 and var.name in grad_var_names:
                    vars2merge = self._create_var_for_trainers(
                        optimize_sub_program.global_block(), var, self.trainers)
                    merged_var = optimize_sub_program.global_block().create_var(
                        name=var.name,
                        persistable=var.persistable,
                        dtype=var.dtype,
                        shape=var.shape)
                    # sum the per-trainer gradient copies, then scale by
                    # 1/trainers so the merged result is their average
                    optimize_sub_program.global_block().append_op(
                        type="sum",
                        inputs={"X": vars2merge},
                        outputs={"Out": merged_var})
                    optimize_sub_program.global_block().append_op(
                        type="scale",
                        inputs={"X": merged_var},
                        outputs={"Out": merged_var},
                        attrs={"scale": 1.0 / float(self.trainers)})
                else:
                    optimize_sub_program.global_block().create_var(
                        name=var.name,
                        persistable=var.persistable,
                        dtype=var.dtype,
                        shape=var.shape)

            if opt_op.inputs.has_key("Grad"):
                # only keep optimize ops whose gradient is owned by this
                # parameter server
                if opt_op.inputs["Grad"].name in grad_var_names:
                    print "appending ", opt_op.type, opt_op.inputs
                    optimize_sub_program.global_block().append_op(
                        type=opt_op.type,
                        inputs=opt_op.inputs,
                        outputs=opt_op.outputs,
                        attrs=opt_op.attrs)
            else:
                optimize_sub_program.global_block().append_op(
                    type=opt_op.type,
                    inputs=opt_op.inputs,
                    outputs=opt_op.outputs,
                    attrs=opt_op.attrs)
        pserver_program.global_block().append_op(
            type="recv",
            inputs={"RX":
                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
            outputs={},
            attrs={
                "OptimizeProgram": optimize_sub_program.desc,
                "endpoint": endpoint,
                "ParamList":
                [p.name for p in self.param_grad_map[endpoint]["params"]],
                "GradList":
                [p.name for p in self.param_grad_map[endpoint]["grads"]],
                "Trainers": self.trainers
            })
        pserver_program.sync_with_cpp()
        return pserver_program
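
When trainers > 1, get_pserver_program does not apply each trainer's gradient separately: it creates one copy per trainer ("<grad>.trainer_<i>"), sums them, and scales by 1.0/trainers before running the optimize op, i.e. it averages the gradients. A small numpy sketch, not part of this diff, of what the appended sum and scale ops compute:

import numpy as np

trainers = 3
# Stand-ins for the per-trainer gradient copies "<grad>.trainer_<i>".
per_trainer_grads = [np.random.rand(2, 3) for _ in range(trainers)]

merged = sum(per_trainer_grads)            # the appended "sum" op
merged = merged * (1.0 / float(trainers))  # the "scale" op, scale = 1/trainers

# The merged gradient is the element-wise mean across trainers.
assert np.allclose(merged, np.mean(per_trainer_grads, axis=0))
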
@@ -0,0 +1,72 @@
from __future__ import print_function
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import os

images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
conv_pool_1 = fluid.nets.simple_img_conv_pool(
    input=images,
    filter_size=5,
    num_filters=20,
    pool_size=2,
    pool_stride=2,
    act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
    input=conv_pool_1,
    filter_size=5,
    num_filters=50,
    pool_size=2,
    pool_stride=2,
    act="relu")

predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimize_ops, params_grads = optimizer.minimize(avg_cost)

accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

BATCH_SIZE = 50
PASS_NUM = 3
train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
    batch_size=BATCH_SIZE)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
t = fluid.DistributeTranspiler()
pserver_endpoints = os.getenv("PSERVERS")
training_role = os.getenv("TRAINING_ROLE",
                          "TRAINER")  # get the training role: trainer/pserver
t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1)

if training_role == "PSERVER":
    pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops)
    exe.run(fluid.default_startup_program())
    exe.run(pserver_prog)
elif training_role == "TRAINER":
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    exe.run(fluid.default_startup_program())

    for pass_id in range(PASS_NUM):
        accuracy.reset(exe)
        for data in train_reader():
            loss, acc = exe.run(fluid.default_main_program(),
                                feed=feeder.feed(data),
                                fetch_list=[avg_cost] + accuracy.metrics)
            pass_acc = accuracy.eval(exe)
            # print loss, acc
            if loss < 10.0 and pass_acc > 0.9:
                # if avg cost is less than 10.0 and accuracy is larger than
                # 0.9, we think our code is good.
                exit(0)

        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
else:
    print("environment variable TRAINING_ROLE should be TRAINER or PSERVER")

    exit(1)
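
The example above selects its role at runtime from the PSERVERS and TRAINING_ROLE environment variables, so the same script is started once per parameter server and once per trainer. A hedged launcher sketch, not part of this diff; the script file name dist_mnist_example.py is an assumption standing in for wherever the example above is saved:

import os
import subprocess

# Hypothetical file name for the MNIST example above; adjust as needed.
SCRIPT = "dist_mnist_example.py"

base_env = dict(os.environ, PSERVERS="127.0.0.1:6174")

# Start one parameter server process ...
pserver = subprocess.Popen(
    ["python", SCRIPT], env=dict(base_env, TRAINING_ROLE="PSERVER"))

# ... and one trainer process (matching trainers=1 in the transpile call).
trainer = subprocess.Popen(
    ["python", SCRIPT], env=dict(base_env, TRAINING_ROLE="TRAINER"))

trainer.wait()
pserver.terminate()
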