1. merge simple_dist_transpiler into distribute_transpiler

2. add align_var_to_block argument to the transpile function
3. remove concat and split if align_var_to_block is False
4. unit tests for simple_dist_transpiler
wangkuiyi-patch-1
minqiyang 7 years ago
parent 580340eeb2
commit a2c017da9b

@@ -0,0 +1,120 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
from paddle.fluid.transpiler.distribute_transpiler import delete_ops
import numpy as np


class TestSimpleDistTranspiler(unittest.TestCase):
    def setUp(self):
        self.trainer_id = 0
        self.trainers = 2
        self.pservers = 2
        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
        self.current_pserver_ep = "127.0.0.1:6175"

    def net_conf(self):
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
        y_predict = fluid.layers.fc(input=x,
                                    size=1000,
                                    act=None,
                                    param_attr=fluid.ParamAttr(name='fc_w'))
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
        avg_cost = fluid.layers.mean(cost)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
        optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
        return optimize_ops, params_grads

    def test_simple_transpiler(self):
        np.random.seed(1)

        trainer = self.get_trainer()
        pserver, startup = self.get_pserver(self.current_pserver_ep)
        self.assertEqual([op.type for op in trainer.global_block().ops],
                         self.get_expect_trainer_ops())

        self.assertEqual(len(pserver.blocks), 2)
        # block0: listen_and_serv
        self.assertEqual([op.type for op in pserver.blocks[0].ops],
                         ["listen_and_serv"])
        # block1: optimize pass
        self.assertEqual([op.type for op in pserver.blocks[1].ops],
                         ["sum", "scale", "sgd"])

        print("xxx", [op.output_arg_names for op in startup.global_block().ops])
        # confirm startup program
        self.assertEqual([op.type for op in startup.global_block().ops],
                         ["fill_constant", "uniform_random", "uniform_random"])

        # the variable fc_w will NOT be split
        fc_w_var = startup.global_block().var("fc_w@GRAD")
        self.assertEqual(fc_w_var.shape, (1000, 1000))

        fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0")
        self.assertEqual(fc_w_var.shape, (1000, 1000))

    def get_main_program(self):
        main = fluid.Program()
        with fluid.program_guard(main):
            self.net_conf()
        return main

    def get_expect_trainer_ops(self):
        trainer = fluid.Program()
        with fluid.program_guard(trainer):
            optimize_ops, params_grads = self.net_conf()

        delete_ops(trainer.global_block(), optimize_ops)
        ops = [op.type for op in trainer.global_block().ops] + [
            "send_vars", "send_barrier", "recv", "recv", "fetch_barrier"
        ]
        ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
        return ops

    def get_trainer(self):
        return self._transpiler_instance().get_trainer_program()

    def get_pserver(self, ep):
        t = self._transpiler_instance()
        pserver = t.get_pserver_program(ep)
        startup = t.get_startup_program(ep, pserver)
        return pserver, startup

    def _transpiler_instance(self):
        main = self.get_main_program()
        t = fluid.DistributeTranspiler()
        t.transpile(
            self.trainer_id,
            program=main,
            pservers=self.pserver_eps,
            trainers=self.trainers,
            align_var_to_block=False)
        return t


if __name__ == "__main__":
    unittest.main()

@@ -15,6 +15,7 @@
from __future__ import print_function
import math
import numpy as np
from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework
@@ -103,7 +104,7 @@ def split_dense_variable(var_list, service_count, min_block_size=8192):
We need to have a minimal block size so that the calculations on
the parameter server side can gain better performance. By default
the minimum block size is 8192 elements (maybe 16bit or 32bit or 64bit).
Args:
var_list (list): List of variables.
@@ -111,7 +112,7 @@ def split_dense_variable(var_list, service_count, min_block_size=8192):
or more listening ports.
min_block_size (int): Minimum split block size.
Returns:
blocks (list[(varname, block_id, current_block_size)]): A list
of VarBlocks. Each VarBlock specifies a shard of the var.
"""
blocks = []
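A minimal standalone sketch of the block-splitting arithmetic this docstring describes (a hypothetical helper, not the body of split_dense_variable itself): each variable is cut into at most service_count blocks of at least min_block_size elements, with the block size rounded up to a multiple of the variable's width (product of dim[1:]), producing the same (varname, block_id, block_size) tuples the rest of the transpile pass consumes.

def split_dense_variable_sketch(varname, shape, service_count,
                                min_block_size=8192):
    # total element count and row width of the variable
    elem_count = 1
    for d in shape:
        elem_count *= d
    width = 1
    for d in shape[1:]:
        width *= d
    # number of blocks is bounded by the pserver count and the minimum size
    split_count = min(service_count, max(1, elem_count // min_block_size))
    block_size = (elem_count + split_count - 1) // split_count
    # align the block size to the row width so no row is cut in half
    if block_size % width != 0:
        block_size += width - (block_size % width)
    blocks, remaining, block_id = [], elem_count, 0
    while remaining > 0:
        blocks.append((varname, block_id, min(block_size, remaining)))
        remaining -= block_size
        block_id += 1
    return blocks

# e.g. a (1000, 1000) fc_w across 2 pservers -> two 500000-element blocks;
# with service_count=1 (the align_var_to_block=False path) it stays whole.
print(split_dense_variable_sketch("fc_w", (1000, 1000), 2))
print(split_dense_variable_sketch("fc_w", (1000, 1000), 1))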
@@ -171,6 +172,7 @@ class DistributeTranspiler:
program=None,
pservers="127.0.0.1:6174",
trainers=1,
align_var_to_block=True,
split_method=RoundRobin,
sync_mode=True):
"""
@@ -183,7 +185,8 @@
parameter servers.
Steps to transpile trainer:
1. split variable to multiple blocks, aligned by product(dim[1:]) (width),
   if align_var_to_block is True.
2. rename split grad variables to add the trainer_id suffix ".trainer_%d".
3. modify the trainer program: add split_op to each grad variable.
4. append send_op to send split variables to the server and fetch
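The steps above, end to end, as a hedged usage sketch (the toy network and endpoints are made up; the call pattern mirrors the new unit test earlier in this commit):

import paddle.fluid as fluid

main = fluid.Program()
with fluid.program_guard(main):
    x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1000, act=None)
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    fluid.optimizer.SGD(learning_rate=0.1).minimize(fluid.layers.mean(cost))

t = fluid.DistributeTranspiler()
t.transpile(
    0,                                  # trainer_id
    program=main,
    pservers="127.0.0.1:6174,127.0.0.1:6175",
    trainers=2,
    align_var_to_block=False)           # keep each param/grad as one block

trainer_prog = t.get_trainer_program()  # grads renamed, send/recv ops appended
pserver_prog = t.get_pserver_program("127.0.0.1:6174")
startup_prog = t.get_startup_program("127.0.0.1:6174", pserver_prog)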
@@ -293,9 +296,18 @@
for index in range(len(self.pserver_endpoints))
]
if align_var_to_block:
    grad_blocks = split_dense_variable(grad_list,
                                       len(pserver_endpoints))
    param_blocks = split_dense_variable(param_list,
                                        len(pserver_endpoints))
else:
    # when we do NOT align vars to blocks, params and grads are always
    # split into exactly one block each
    grad_blocks = split_dense_variable(grad_list, 1)
    param_blocks = split_dense_variable(param_list, 1)
assert (len(grad_blocks) == len(param_blocks))
# step2: Create new vars for the parameters and gradients blocks and
# add ops to do the split.
param_var_mapping = self._create_vars_from_blocklist(program,
@@ -325,8 +337,22 @@
# step 3.1: insert send op to send gradient vars to parameter servers
ps_dispatcher.reset()
send_vars = []
# In general the number of pservers is a multiple of 2, and dispatching
# a fixed weight/bias ordering leads to an uneven distribution between
# weights and biases:
#   fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
#   fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
# Shuffling the map avoids this uneven distribution.
grad_var_mapping_items = grad_var_mapping.items()
if not align_var_to_block:
    np.random.shuffle(grad_var_mapping_items)

for orig_varname, splited_vars in grad_var_mapping_items:
    eplist = ps_dispatcher.dispatch(splited_vars)

    if not align_var_to_block:
        assert (len(splited_vars) == 1)

    if len(splited_vars) == 1:
        orig_varname = splited_vars[0].name
        index = find_op_by_output_arg(program.global_block(),
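Why the shuffle above matters, as a self-contained toy (plain Python, not transpiler code): with an even number of pservers and gradients listed in a fixed weight/bias order, round-robin dispatch sends every weight gradient to one server and every bias gradient to the other; shuffling the mapping first breaks that pattern.

import random

def round_robin(names, n_pservers):
    # toy stand-in for the RoundRobin dispatcher: i-th var -> pserver i % n
    placement = {p: [] for p in range(n_pservers)}
    for i, name in enumerate(names):
        placement[i % n_pservers].append(name)
    return placement

grads = ["fc_w@GRAD.trainer_0", "fc_b@GRAD.trainer_0",
         "fc_w@GRAD.trainer_1", "fc_b@GRAD.trainer_1"]
print(round_robin(grads, 2))   # all fc_w grads on pserver 0, all fc_b on pserver 1

random.seed(1)
random.shuffle(grads)          # same idea as np.random.shuffle above
print(round_robin(grads, 2))   # randomized order -> no systematic weight/bias split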
@@ -374,7 +400,7 @@
for i, ep in enumerate(eplist):
    self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
    self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])

# step4: Concat the parameters splits together after recv.
for varname, splited_var in param_var_mapping.iteritems():
    eps = []
    for var in splited_var:
@@ -399,6 +425,7 @@
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
# step4: Concat the parameters splits together after recv.
for varname, splited_var in param_var_mapping.iteritems():
    if len(splited_var) <= 1:
        continue
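For the align_var_to_block=False case introduced in this commit, every parameter maps to a single block, so the guard above skips the concat step entirely. A tiny standalone illustration (made-up names, not transpiler internals):

param_var_mapping = {
    "fc_w": ["fc_w"],                        # one block: concat skipped
    "emb": ["emb.block0", "emb.block1"],     # split var: concat appended
}
for varname, splited_var in param_var_mapping.items():
    if len(splited_var) <= 1:
        continue
    print("would append a concat op for", varname, "from", splited_var)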
@@ -849,8 +876,8 @@ class DistributeTranspiler:
program (ProgramDesc): ProgramDesc which the gradients belong to.
block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
Returns:
    var_mapping (dict(varname->[new_varname_variable])): A dict mapping
    from original var name to each var split.
"""
