Paddle fleet distributed strategy (#25379)

* add paddle.fleet.DistributedStrategy for 2.0
fix_copy_if_different
Dong Daxiang 5 years ago committed by GitHub
parent 0954e907f6
commit d5e40d1ba9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -155,22 +155,31 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if(WITH_PYTHON)
  # Generate the Python protobuf modules. distributed_strategy_py_proto backs
  # the paddle.fleet package; the other two are copied into paddle.fluid.
  py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
  py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
  py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)

  # Generate an empty __init__.py to make framework_py_proto a valid Python
  # module.
  add_custom_target(framework_py_proto_init ALL
    COMMAND ${CMAKE_COMMAND} -E touch __init__.py)

  # framework_py_proto is the aggregation point: its POST_BUILD step below
  # copies every generated *.py file, so all generators must have run first.
  add_dependencies(framework_py_proto
    framework_py_proto_init
    trainer_py_proto
    distributed_strategy_py_proto)

  if(NOT WIN32)
    # The copy steps rely on shell glob expansion (*.py), so they stay as
    # plain `cp` invocations.
    add_custom_command(TARGET framework_py_proto POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
      COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
      COMMENT "Copy generated python proto into directories paddle/fluid/proto and paddle/fleet/proto."
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  else()
    # The cmd.exe `copy` builtin needs backslash-separated destination paths.
    string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
    add_custom_command(TARGET framework_py_proto POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
      COMMAND copy /Y *.py ${proto_dstpath}
      COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
      COMMENT "Copy generated python proto into directories paddle/fluid/proto and paddle/fleet/proto."
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
endif()

@ -0,0 +1,87 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package paddle.fleet;
// Training mode stored in DistributedStrategy.mode, whose declared default
// is COLLECTIVE. Values start at 1; there is no zero value.
// NOTE(review): PS presumably stands for "parameter server" (compare the
// "parameter server training" options in DistributedStrategy) - confirm.
enum Mode {
COLLECTIVE = 1;
PS = 2;
PIPELINE = 3;
HETER = 4; // support XPU and GPU computing server
}
// Serializable set of distributed-training options.
//
// Fields are grouped by field-number range:
//   1        mode
//   2-25     collective training
//   101-102  pipeline training
//   201-211  parameter server training
//   301      elastic training
//   401      auto parallel
// Every field is optional or repeated. All optional fields declare an explicit
// default except pipeline_micro_batch.
message DistributedStrategy {
optional Mode mode = 1 [ default = COLLECTIVE ]; // just for serialization
// collective training strategy
optional bool amp = 2 [ default = false ];
optional int32 amp_loss_scaling = 3 [ default = 32768 ];
optional bool recompute = 4 [ default = false ];
repeated string recompute_checkpoints = 5;
optional bool localsgd = 6 [ default = false ];
optional int32 localsgd_k_step = 7 [ default = 4 ];
optional bool dgc = 8 [ default = false ];
// NOTE(review): "hierachical" (sic) is the spelling used in the field name;
// correcting it would rename the generated accessors, so it is left as-is.
optional bool hierachical_allreduce = 9 [ default = false ];
optional int32 nccl_comm_num = 10 [ default = 1 ];
optional bool gradient_merge = 11 [ default = false ];
optional int32 gradient_merge_k_step = 12 [ default = 1 ];
optional bool sequential_execution = 13 [ default = false ];
optional bool enable_backward_optimizer_op_deps = 14 [ default = true ];
optional bool lars = 15 [ default = false ];
optional bool lamb = 16 [ default = false ];
optional bool fuse_elewise_add_act_ops = 17 [ default = false ];
optional bool fuse_bn_act_ops = 18 [ default = false ];
optional bool enable_auto_fusion = 19 [ default = false ];
optional bool fuse_relu_depthwise_conv = 20 [ default = false ];
optional bool enable_inplace = 21 [ default = false ];
optional bool fuse_all_reduce_ops = 22 [ default = false ];
optional int32 num_iteration_per_drop_scope = 23 [ default = 1 ];
optional bool sync_batch_norm = 24 [ default = false ];
optional bool fuse_all_optimizer_ops = 25 [ default = false ];
// pipeline training
optional bool pipeline = 101 [ default = false ];
// No explicit default is declared for this field.
optional int32 pipeline_micro_batch = 102;
// parameter server training
// sync and async are independent booleans; by default sync is false and
// async is true.
optional bool sync = 201 [ default = false ];
// NOTE(review): "async" is a reserved word in Python 3.7+, so plain attribute
// access on the generated Python message may not be possible - confirm how
// callers are expected to read/write this field.
optional bool async = 202 [ default = true ];
optional int32 async_k_step = 203 [ default = -1 ];
optional int32 max_merge_var_num = 204 [ default = 1 ];
optional int32 send_queue_size = 205 [ default = 16 ];
optional bool independent_recv_thread = 206 [ default = false ];
optional int32 min_send_grad_num_before_recv = 207 [ default = 1 ];
optional int32 thread_pool_size = 208 [ default = 1 ];
optional int32 send_wait_times = 209 [ default = 1 ];
optional bool runtime_split_send_recv = 210 [ default = false ];
optional bool use_thread_barrier = 211 [ default = false ];
// elastic deep learning strategies
optional bool elastic = 301 [ default = false ];
// auto parallel
// NOTE(review): "auto" is a C++ keyword; the generated C++ accessor name
// presumably differs from the field name - confirm before using it from C++.
optional bool auto = 401 [ default = false ];
}
// Description of one distributed job: worker/server counts and addresses,
// three program strings, the optimizer name, and the DistributedStrategy
// attached to the job. No field declares a default.
// NOTE(review): origin_startup, origin_main and distributed_main are plain
// strings here; presumably they carry serialized programs - confirm the
// encoding with the producer of this message.
message DistributedJobInfo {
optional int32 worker_num = 1;
optional int32 server_num = 2;
repeated string worker_ips = 3;
repeated string server_endpoints = 4;
optional string origin_startup = 5;
optional string origin_main = 6; // without backpropagation and optimization
optional string distributed_main = 7; // with backpropagation and optimization
optional string optimizer_name = 8; // optimizer name
// Field number jumps to 101, away from the job fields above.
optional DistributedStrategy strategy = 101;
}

@ -36,6 +36,7 @@ import paddle.distributed
import paddle.sysconfig
import paddle.tensor
import paddle.nn
import paddle.fleet
import paddle.framework
import paddle.imperative
import paddle.optimizer

@ -13,16 +13,11 @@
# limitations under the License.
# TODO: define distributed api under this directory,
# __all__ = ['metric',
# 'optimizer',
# 'RoleMaker',
# 'dataset',
# ' DatasetFactory',
# ' InMemoryDataset',
# ' QueueDataset',
# 'transpiler',
# ' DistributeTranspiler',
# ' DistributeTranspilerConfig',
# ' HashName',
# ' RoundRobin',
# 'collective']
from .base.distributed_strategy import DistributedStrategy
#from .base.role_maker import PaddleCloudRoleMaker, UserDefinedRoleMaker
#from .base.fleet_base import Fleet
#__all__ = [
# "DistributedStrategy", "PaddleCloudRoleMaker", "UserDefinedRoleMaker"
#]
__all__ = ['DistributedStrategy']

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,19 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fleet import RoleMakerBase
from . import obj_creator
# __all__ = ['Fleet']

@ -0,0 +1,23 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from util_base import UtilBase
def _create_fleet_obj_from_role_maker(role_maker):
pass
def _create_fleet_util_from_role_maker(role_maker):
pass

@ -0,0 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Definition of Role Makers."""
# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']

@ -0,0 +1,64 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fleet Utils."""
"""distributed operations"""
"""basic collective operations in python"""
"""remote file system"""
# __all__ = ['UtilBase']
'''
class UtilBase(object):
def __init__(self, role_maker, fleet_obj):
self.role_maker = role_maker
self.fleet_obj = fleet_obj
def set_file_system(self, fs_client):
self.fs_client = fs_client
def broadcast(self):
pass
def all_gather(self):
pass
def all_reduce(self):
pass
def reduce_scatter(self):
pass
def reduce(self):
pass
def get_file_shard(self, files):
pass
def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist):
pass
def save_program(program, output_dir):
pass
def load_program(input_dir):
pass
def load_var():
pass
def save_var():
pass
def print_on_rank(self):
pass
'''

@ -0,0 +1,12 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,12 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -143,6 +143,13 @@ packages=['paddle',
'paddle.incubate',
'paddle.incubate.complex',
'paddle.incubate.complex.tensor',
'paddle.fleet',
'paddle.fleet.base',
'paddle.fleet.collective',
'paddle.fleet.dataset',
'paddle.fleet.metrics',
'paddle.fleet.parameter_server',
'paddle.fleet.proto',
'paddle.framework',
'paddle.fluid',
'paddle.fluid.dygraph',

Loading…
Cancel
Save