add fleet_desc config feature & multi_sparse table, test=develop (#18827)

add fleet_desc config feature & multi_sparse table, padding_in_crf
zhang wenhui, 6 years ago (committed by GitHub)
parent 1799c257ad
commit 4a3c4b8fa4
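This change lets a network use more than one distributed embedding: every lookup_table parameter marked is_distributed=True now gets its own sparse table, instead of everything being mapped onto sparse table 0. A minimal sketch of such a network follows, assuming the same fluid 1.x layers API used by the test added in this commit; the slot names and sizes are illustrative, not taken from the commit.

import paddle.fluid as fluid

# Two distributed embeddings -> two sparse tables after this change.
slot_a = fluid.layers.data(name='slot_a', shape=[1], dtype='int64')
slot_b = fluid.layers.data(name='slot_b', shape=[1], dtype='int64')
emb_a = fluid.layers.embedding(input=slot_a, size=[100, 8], is_distributed=True)
emb_b = fluid.layers.embedding(input=slot_b, size=[100, 8], is_distributed=True)
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=[emb_a, emb_b], size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=label)
avg_cost = fluid.layers.mean(cost)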

@@ -146,27 +146,29 @@ class DownpourSGD(DeviceWorker):
dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.table_id = \
i.table_id
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = \
self._fleet_desc.trainer_param.sparse_table[0].table_id
sparse_table.sparse_key_name.extend(
self._fleet_desc.trainer_param.sparse_table[0].slot_key)
sparse_table.sparse_value_name.extend(
self._fleet_desc.trainer_param.sparse_table[0].slot_value)
sparse_table.sparse_grad_name.extend(
self._fleet_desc.trainer_param.sparse_table[0].slot_gradient)
if opt_info["use_cvm"]:
sparse_table.emb_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
0].accessor.fea_dim
sparse_table.fea_dim = sparse_table.emb_dim
else:
sparse_table.emb_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
0].accessor.fea_dim - 2
sparse_table.fea_dim = sparse_table.emb_dim + 2
# TODO(guru4elephant): hard code here, need to improve
sparse_table.label_var_name = "click"
sparse_len = len(self._fleet_desc.trainer_param.sparse_table)
for i in range(sparse_len):
sparse_table = downpour.sparse_table.add()
sparse_table.table_id = \
self._fleet_desc.trainer_param.sparse_table[i].table_id
sparse_table.sparse_key_name.extend(
self._fleet_desc.trainer_param.sparse_table[i].slot_key)
sparse_table.sparse_value_name.extend(
self._fleet_desc.trainer_param.sparse_table[i].slot_value)
sparse_table.sparse_grad_name.extend(
self._fleet_desc.trainer_param.sparse_table[i].slot_gradient)
if opt_info["use_cvm"]:
sparse_table.emb_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim
sparse_table.fea_dim = sparse_table.emb_dim
else:
sparse_table.emb_dim = \
self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
i].accessor.fea_dim - 2
sparse_table.fea_dim = sparse_table.emb_dim + 2
# TODO(guru4elephant): hard code here, need to improve
sparse_table.label_var_name = "click"
for i in self._fleet_desc.trainer_param.dense_table:
if i.table_id in dense_table_set:
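With the loop above, each sparse table now derives emb_dim and fea_dim from its own accessor (index i) instead of always reading table 0. A standalone sketch of that dimension rule, not part of the commit, is:

def sparse_table_dims(accessor_fea_dim, use_cvm):
    # With use_cvm the embedding spans the full accessor feature dim;
    # without it, two feature columns stay out of the embedding but are
    # still counted in fea_dim.
    if use_cvm:
        emb_dim = accessor_fea_dim
        fea_dim = emb_dim
    else:
        emb_dim = accessor_fea_dim - 2
        fea_dim = emb_dim + 2
    return emb_dim, fea_dim

# Example with an accessor fea_dim of 11:
assert sparse_table_dims(11, use_cvm=True) == (11, 11)
assert sparse_table_dims(11, use_cvm=False) == (9, 11)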

@@ -13,7 +13,7 @@
import os
import sys
from optimizer_factory import *
from .optimizer_factory import *
from google.protobuf import text_format
import paddle.fluid as fluid
from paddle.fluid.framework import Program

@@ -13,13 +13,13 @@
# limitations under the License.
__all__ = ["DistributedAdam"]
import ps_pb2 as pslib
import paddle.fluid as fluid
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_inputs
from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table_outputs
from google.protobuf import text_format
from .node import DownpourWorker, DownpourServer
from . import ps_pb2 as pslib
class DistributedOptimizerImplBase(object):
@@ -48,6 +48,63 @@ class DistributedAdam(DistributedOptimizerImplBase):
".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD"
]
def _find_distributed_lookup_table_inputs(self, program, table_names):
"""
Find the input variables of distributed lookup tables in the program.
Multiple distributed tables are supported now.
Args:
program(Program): the program in which to locate distributed lookup tables
table_names(list): table names found beforehand
Returns:
inputs_dict: dict mapping each table name to its input (Ids) variables
"""
local_vars = program.current_block().vars
inputs_dict = dict()
for table_name in table_names:
inputs_dict[table_name] = []
for op in program.global_block().ops:
if op.type == "lookup_table":
if op.input("W")[0] in table_names:
inputs_dict[op.input("W")[0]].extend(
[local_vars[name] for name in op.input("Ids")])
return inputs_dict
def _find_distributed_lookup_table_outputs(self, program, table_names):
"""
Find the output variables of distributed lookup tables in the program.
Multiple distributed tables are supported now.
Args:
program(Program): the program in which to locate distributed lookup tables
table_names(list): table names found beforehand
Returns:
outputs_dict: dict mapping each table name to its output variables
"""
local_vars = program.current_block().vars
outputs_dict = dict()
for table_name in table_names:
outputs_dict[table_name] = []
for op in program.global_block().ops:
if op.type == "lookup_table":
if op.input("W")[0] in table_names:
outputs_dict[op.input("W")[0]].extend(
[local_vars[name] for name in op.output("Out")])
return outputs_dict
def _find_multi_distributed_lookup_table(self, losses):
"""
Find all distributed lookup tables (multi sparse tables) used by the given losses.
"""
table_names = set()
for loss in losses:
for op in loss.block.program.global_block().ops:
if op.type == "lookup_table":
if op.attr('is_distributed') is True:
table_name = op.input("W")[0]
table_names.add(table_name)
return list(table_names)
def _minimize(self,
losses,
startup_program=None,
@@ -69,10 +126,15 @@ class DistributedAdam(DistributedOptimizerImplBase):
[optimize_ops, grads_and_weights]
"""
table_name = find_distributed_lookup_table(losses[0].block.program)
table_name = self._find_multi_distributed_lookup_table(losses)
prefetch_slots = find_distributed_lookup_table_inputs(
losses[0].block.program, table_name[0])
inputs_dict = self._find_distributed_lookup_table_inputs(
losses[0].block.program, table_name)
prefetch_slots_emb = find_distributed_lookup_table_outputs(
losses[0].block.program, table_name[0])
outputs_dict = self._find_distributed_lookup_table_outputs(
losses[0].block.program, table_name)
ps_param = pslib.PSParameter()
@@ -87,20 +149,29 @@ class DistributedAdam(DistributedOptimizerImplBase):
text_format.Merge(f.read(), ps_param)
server.get_desc().CopyFrom(ps_param.server_param)
worker.get_desc().CopyFrom(ps_param.trainer_param)
sparse_table_index = 0
server.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb)
worker.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb)
dense_table_index = 1
for tn in table_name:
if strategy.get(tn) is not None:
server.add_sparse_table(sparse_table_index, strategy[tn])
else:
server.add_sparse_table(sparse_table_index, None)
worker.add_sparse_table(sparse_table_index, inputs_dict[tn],
outputs_dict[tn])
sparse_table_index += 1
dense_start_table_id = sparse_table_index
dense_table_index = sparse_table_index
program_configs = {}
param_grads_list = []
for loss_index in range(len(losses)):
program_id = str(id(losses[loss_index].block.program))
program_configs[program_id] = {
"pull_sparse": [sparse_table_index],
"push_sparse": [sparse_table_index]
"pull_sparse":
[t_index for t_index in range(sparse_table_index)],
"push_sparse":
[t_index for t_index in range(sparse_table_index)]
}
params_grads = sorted(
@@ -128,19 +199,30 @@ class DistributedAdam(DistributedOptimizerImplBase):
data_norm_grads.append(i[1])
if not is_data_norm_data:
grads.append(i[1])
server.add_dense_table(dense_table_index, self._learning_rate,
params, grads)
if strategy.get('dense_table') is not None:
server.add_dense_table(dense_table_index, params, grads,
strategy['dense_table'])
else:
server.add_dense_table(dense_table_index, params, grads, None)
worker.add_dense_table(dense_table_index, self._learning_rate,
params, grads)
params, grads, dense_start_table_id)
program_configs[program_id]["pull_dense"] = [dense_table_index]
program_configs[program_id]["push_dense"] = [dense_table_index]
if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
dense_table_index += 1
server.add_data_norm_table(dense_table_index,
self._learning_rate,
data_norm_params, data_norm_grads)
if strategy.get('datanorm_table') is not None:
server.add_data_norm_table(
dense_table_index, self._learning_rate,
data_norm_params, data_norm_grads,
strategy['datanorm_table'])
else:
server.add_data_norm_table(
dense_table_index, self._learning_rate,
data_norm_params, data_norm_grads, None)
worker.add_dense_table(dense_table_index, self._learning_rate,
data_norm_params, data_norm_grads)
data_norm_params, data_norm_grads,
dense_start_table_id)
program_configs[program_id]["pull_dense"].extend(
[dense_table_index])
program_configs[program_id]["push_dense"].extend(

@@ -0,0 +1,148 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import os
import signal
import subprocess
import time
import unittest
import sys
from op_test import OpTest
from paddle.fluid.trainer_desc import DistMultiTrainer
from paddle.fluid.device_worker import DownpourSGD
from google.protobuf import text_format
import paddle.fluid.incubate.fleet.parameter_server.pslib.ps_pb2 as pslib
class TestListenAndServOp(OpTest):
def setUp(self):
pass
def test_device_work_use_cvm(self):
if sys.platform == 'win32':
pass
else:
print(sys.platform)
cmd = "wget --no-check-certificate https://pslib.bj.bcebos.com/fleet_desc.prototxt"
os.system(cmd)
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
x_emb = fluid.layers.embedding(
input=x, size=[1, 2], is_distributed=True)
y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
ps_param = pslib.PSParameter()
with open("fleet_desc.prototxt") as f:
text_format.Merge(f.read(), ps_param)
fleet_desc = ps_param
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
opt_info = {}
main_program = fluid.default_main_program()
program_id = str(id(avg_cost.block.program))
program_configs = {}
program_configs[program_id] = {
"pull_sparse": [0],
"push_sparse": [0]
}
program_configs[program_id]["pull_dense"] = [1]
program_configs[program_id]["push_dense"] = [1]
worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
opt_info["program_configs"] = program_configs
opt_info["trainer"] = "DistMultiTrainer"
opt_info["device_worker"] = "DownpourSGD"
opt_info["optimizer"] = "DownpourSGD"
opt_info["fleet_desc"] = ps_param
opt_info["worker_skipped_ops"] = worker_skipped_ops
opt_info["use_cvm"] = True
opt_info["scale_datanorm"] = -1
opt_info["dump_slot"] = False
main_program._fleet_opt = opt_info
trainer = DistMultiTrainer()
trainer._set_program(main_program)
device_worker = DownpourSGD()
device_worker._set_fleet_desc(fleet_desc)
trainer._set_device_worker(device_worker)
trainer._set_fleet_desc(fleet_desc)
trainer._gen_trainer_desc()
cmd = "rm fleet_desc.prototxt*"
os.system(cmd)
def test_device_work(self):
if sys.platform == 'win32':
pass
else:
print(sys.platform)
cmd = "wget --no-check-certificate https://pslib.bj.bcebos.com/fleet_desc.prototxt"
os.system(cmd)
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
x_emb = fluid.layers.embedding(
input=x, size=[1, 2], is_distributed=True)
y_predict = fluid.layers.fc(input=x_emb, size=1, act=None)
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
ps_param = pslib.PSParameter()
with open("fleet_desc.prototxt") as f:
text_format.Merge(f.read(), ps_param)
fleet_desc = ps_param
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
opt_info = {}
main_program = fluid.default_main_program()
program_id = str(id(avg_cost.block.program))
program_configs = {}
program_configs[program_id] = {
"pull_sparse": [0],
"push_sparse": [0]
}
program_configs[program_id]["pull_dense"] = [1]
program_configs[program_id]["push_dense"] = [1]
worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
opt_info["program_configs"] = program_configs
opt_info["trainer"] = "DistMultiTrainer"
opt_info["device_worker"] = "DownpourSGD"
opt_info["optimizer"] = "DownpourSGD"
opt_info["fleet_desc"] = ps_param
opt_info["worker_skipped_ops"] = worker_skipped_ops
opt_info["use_cvm"] = False
opt_info["scale_datanorm"] = -1
opt_info["dump_slot"] = False
main_program._fleet_opt = opt_info
trainer = DistMultiTrainer()
trainer._set_program(main_program)
device_worker = DownpourSGD()
device_worker._set_fleet_desc(fleet_desc)
trainer._set_device_worker(device_worker)
trainer._set_fleet_desc(fleet_desc)
trainer._gen_trainer_desc()
cmd = "rm fleet_desc.prototxt*"
os.system(cmd)
if __name__ == "__main__":
unittest.main()