Paddle fleet distributed strategy (#25379)

* add paddle.fleet.DistributedStrategy for 2.0
5 years ago · d5e40d1ba9
parent 0954e907f6
commit d5e40d1ba9
17 changed files with 1186 additions and 14 deletions
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -155,22 +155,31 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 if(WITH_PYTHON)
  py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
  py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
+  # Python bindings for distributed_strategy.proto (package paddle.fleet).
+  py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
 #Generate an empty \
    #__init__.py to make framework_py_proto as a valid python module.
  add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-  add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto)
+  # framework_py_proto must build after every generated *_pb2.py it copies below.
+  add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto)
+  # POST_BUILD step: stage the generated modules into the python package tree.
+  # Everything goes to paddle/fluid/proto; distributed_strategy_*.py is also
+  # copied to paddle/fleet/proto, which gets its own __init__.py.
+  # add_custom_command honours a single COMMENT (a repeated keyword overrides
+  # the earlier one), so each branch carries exactly one message.
   if (NOT WIN32)
     add_custom_command(TARGET framework_py_proto POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
       COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
+      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
+      COMMENT "Copy generated python proto into directories paddle/fluid/proto and paddle/fleet/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else(NOT WIN32)
+    # cmd.exe `copy` needs backslash-separated destination paths.
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
     add_custom_command(TARGET framework_py_proto POST_BUILD
           COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+          COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+          COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
           COMMAND copy /Y *.py ${proto_dstpath}
+          COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
-          COMMENT "Copy generated python proto into directory paddle/fluid/proto."
+          COMMENT "Copy generated python proto into directories paddle/fluid/proto and paddle/fleet/proto."
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@ -0,0 +1,87 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+package paddle.fleet;
+
+// Training mode tag; used as the type of DistributedStrategy.mode.
+// Numbering starts at 1 -- no zero value is declared.
+enum Mode {
+  COLLECTIVE = 1;
+  PS = 2; // presumably "parameter server" -- TODO confirm
+  PIPELINE = 3;
+  HETER = 4; // support XPU and GPU computing server
+}
+
+// Flat bag of optional switches and tuning values for distributed training.
+// Field numbers are grouped by area: 1-25 collective, 101+ pipeline,
+// 201+ parameter server, 301+ elastic, 401+ auto parallel.
+// Field names and numbers are the public interface of the generated code;
+// do not rename or renumber without migrating callers.
+message DistributedStrategy {
+  optional Mode mode = 1 [ default = COLLECTIVE ]; // just for serialization
+  // collective training strategy
+  optional bool amp = 2 [ default = false ];
+  optional int32 amp_loss_scaling = 3 [ default = 32768 ];
+  optional bool recompute = 4 [ default = false ];
+  repeated string recompute_checkpoints = 5;
+  optional bool localsgd = 6 [ default = false ];
+  optional int32 localsgd_k_step = 7 [ default = 4 ];
+  optional bool dgc = 8 [ default = false ];
+  // NOTE(review): "hierachical" is a misspelling of "hierarchical"; the name
+  // is part of the generated API, so fixing it needs a coordinated rename.
+  optional bool hierachical_allreduce = 9 [ default = false ];
+  optional int32 nccl_comm_num = 10 [ default = 1 ];
+  optional bool gradient_merge = 11 [ default = false ];
+  optional int32 gradient_merge_k_step = 12 [ default = 1 ];
+  optional bool sequential_execution = 13 [ default = false ];
+  // The only boolean in the collective group that defaults to true.
+  optional bool enable_backward_optimizer_op_deps = 14 [ default = true ];
+  optional bool lars = 15 [ default = false ];
+  optional bool lamb = 16 [ default = false ];
+  optional bool fuse_elewise_add_act_ops = 17 [ default = false ];
+  optional bool fuse_bn_act_ops = 18 [ default = false ];
+  optional bool enable_auto_fusion = 19 [ default = false ];
+  optional bool fuse_relu_depthwise_conv = 20 [ default = false ];
+  optional bool enable_inplace = 21 [ default = false ];
+  optional bool fuse_all_reduce_ops = 22 [ default = false ];
+  optional int32 num_iteration_per_drop_scope = 23 [ default = 1 ];
+  optional bool sync_batch_norm = 24 [ default = false ];
+  optional bool fuse_all_optimizer_ops = 25 [ default = false ];
+
+  // pipeline training
+  optional bool pipeline = 101 [ default = false ];
+  // No explicit default: reads as the proto2 type default (0) when unset.
+  optional int32 pipeline_micro_batch = 102;
+
+  // parameter server training
+  optional bool sync = 201 [ default = false ];
+  // NOTE(review): `async` is a reserved keyword in Python 3.7+, so the
+  // generated Python attribute cannot be written as `msg.async`; callers
+  // need getattr/setattr. Verify against the Python wrapper.
+  optional bool async = 202 [ default = true ];
+  optional int32 async_k_step = 203 [ default = -1 ];
+  optional int32 max_merge_var_num = 204 [ default = 1 ];
+  optional int32 send_queue_size = 205 [ default = 16 ];
+  optional bool independent_recv_thread = 206 [ default = false ];
+  optional int32 min_send_grad_num_before_recv = 207 [ default = 1 ];
+  optional int32 thread_pool_size = 208 [ default = 1 ];
+  optional int32 send_wait_times = 209 [ default = 1 ];
+  optional bool runtime_split_send_recv = 210 [ default = false ];
+  optional bool use_thread_barrier = 211 [ default = false ];
+
+  // elastic deep learning strategies
+  optional bool elastic = 301 [ default = false ];
+
+  // auto parallel
+  // NOTE(review): `auto` is a C++ keyword; presumably protoc adjusts the
+  // generated C++ accessor name -- confirm if C++ code ever reads this field.
+  optional bool auto = 401 [ default = false ];
+}
+
+// Description of one distributed job: worker/server counts and addresses,
+// program strings, the optimizer name, and the DistributedStrategy in use.
+// None of the scalar fields declare an explicit default.
+message DistributedJobInfo {
+  optional int32 worker_num = 1;
+  optional int32 server_num = 2;
+  repeated string worker_ips = 3;
+  repeated string server_endpoints = 4;
+  // presumably serialized program text -- TODO confirm the encoding
+  optional string origin_startup = 5;
+  optional string origin_main = 6; // without backpropagation and optimization
+  optional string distributed_main = 7; // with backpropagation and optimization
+  optional string optimizer_name = 8;   // optimizer name
+  // Numbered 101, leaving 9-100 unused after the scalar fields above.
+  optional DistributedStrategy strategy = 101;
+}
--- a/python/paddle/init.py
+++ b/python/paddle/init.py
@ -36,6 +36,7 @@ import paddle.distributed
 import paddle.sysconfig
 import paddle.tensor
 import paddle.nn
+import paddle.fleet
 import paddle.framework
 import paddle.imperative
 import paddle.optimizer
--- a/python/paddle/fleet/init.py
+++ b/python/paddle/fleet/init.py
@ -13,16 +13,11 @@
 # limitations under the License.

 # TODO: define the distributed API under this directory.
-# __all__ = ['metric',
-#            'optimizer',
-#            'RoleMaker',
-#            'dataset',
-#            '	DatasetFactory',
-#            '	InMemoryDataset',
-#            '	QueueDataset',
-#            'transpiler',
-#            '	DistributeTranspiler',
-#            '	DistributeTranspilerConfig',
-#            '	HashName',
-#            '	RoundRobin',
-#            'collective']
+from .base.distributed_strategy import DistributedStrategy
+#from .base.role_maker import PaddleCloudRoleMaker, UserDefinedRoleMaker
+#from .base.fleet_base import Fleet
+
+#__all__ = [
+#    "DistributedStrategy", "PaddleCloudRoleMaker", "UserDefinedRoleMaker"
+#]
+__all__ = ['DistributedStrategy']
--- a/python/paddle/fleet/base/init.py
+++ b/python/paddle/fleet/base/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fleet/base/distributed_strategy.py
+++ b/python/paddle/fleet/base/distributed_strategy.py
--- a/python/paddle/fleet/base/fleet_base.py
+++ b/python/paddle/fleet/base/fleet_base.py
@ -0,0 +1,19 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+# NOTE(review): in this change paddle/fleet/__init__.py exports only
+# DistributedStrategy and role_maker.py defines no classes, so the import
+# below presumably raises ImportError if this module is ever imported.
+# Nothing imports fleet_base yet (the import in paddle/fleet/__init__.py is
+# commented out) -- confirm and fix before enabling it.
+from paddle.fleet import RoleMakerBase
+from . import obj_creator
+
+# Public names are not exported yet; the Fleet class is still to be added.
+# __all__ = ['Fleet']
--- a/python/paddle/fleet/base/obj_creator.py
+++ b/python/paddle/fleet/base/obj_creator.py
@ -0,0 +1,23 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE(review): this is an implicit relative import (it resolves only if
+# util_base is importable as a top-level module), and util_base.py currently
+# keeps the UtilBase class inside a string literal, so the name is not
+# defined there. Presumably this should become `from .util_base import
+# UtilBase` once the class is enabled -- confirm.
+from util_base import UtilBase
+
+
+def _create_fleet_obj_from_role_maker(role_maker):
+    """Placeholder: ignores ``role_maker``, does nothing and returns None."""
+    pass
+
+
+def _create_fleet_util_from_role_maker(role_maker):
+    """Placeholder: ignores ``role_maker``, does nothing and returns None."""
+    pass
--- a/python/paddle/fleet/base/role_maker.py
+++ b/python/paddle/fleet/base/role_maker.py
@ -0,0 +1,16 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Definition of Role Makers."""
+
+# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
--- a/python/paddle/fleet/base/util_base.py
+++ b/python/paddle/fleet/base/util_base.py
@ -0,0 +1,64 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fleet Utils."""
+"""distributed operations"""
+"""basic collective operations in python"""
+"""remote file system"""
+
+# __all__ = ['UtilBase']
+# The UtilBase skeleton below is intentionally disabled: it lives inside a
+# string literal and defines nothing at import time. It is kept in a state
+# that will work once the quotes are removed.
+'''
+class UtilBase(object):
+    """Utility skeleton bound to a role maker and a fleet object.
+
+    Every operation except the constructor and set_file_system is an
+    unimplemented placeholder that returns None.
+    """
+
+    def __init__(self, role_maker, fleet_obj):
+        # Fixed: previously read the undefined name `roke_maker`.
+        self.role_maker = role_maker
+        self.fleet_obj = fleet_obj
+
+    def set_file_system(self, fs_client):
+        self.fs_client = fs_client
+
+    def broadcast(self):
+        pass
+
+    def all_gather(self):
+        pass
+
+    def all_reduce(self):
+        pass
+
+    def reduce_scatter(self):
+        pass
+
+    def reduce(self):
+        pass
+
+    def get_file_shard(self, files):
+        pass
+
+    def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist):
+        pass
+
+    # The four methods below previously lacked `self`, so calling them on an
+    # instance would have bound the instance to the first argument (or
+    # raised TypeError for the zero-argument ones).
+    def save_program(self, program, output_dir):
+        pass
+
+    def load_program(self, input_dir):
+        pass
+
+    def load_var(self):
+        pass
+
+    def save_var(self):
+        pass
+
+    def print_on_rank(self):
+        pass
+'''
--- a/python/paddle/fleet/collective/init.py
+++ b/python/paddle/fleet/collective/init.py
@ -0,0 +1,12 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fleet/dataset/init.py
+++ b/python/paddle/fleet/dataset/init.py
@ -0,0 +1,12 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fleet/metrics/init.py
+++ b/python/paddle/fleet/metrics/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fleet/metrics/metric.py
+++ b/python/paddle/fleet/metrics/metric.py
@ -0,0 +1,13 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fleet/parameter_server/init.py
+++ b/python/paddle/fleet/parameter_server/init.py
@ -0,0 +1,13 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
--- a/python/setup.py.in
+++ b/python/setup.py.in
@ -143,6 +143,13 @@ packages=['paddle',
          'paddle.incubate',
          'paddle.incubate.complex',
          'paddle.incubate.complex.tensor',
+          'paddle.fleet',
+          'paddle.fleet.base',
+          'paddle.fleet.collective',
+          'paddle.fleet.dataset',
+          'paddle.fleet.metrics',
+          'paddle.fleet.parameter_server',
+          'paddle.fleet.proto',
          'paddle.framework',
          'paddle.fluid',
          'paddle.fluid.dygraph',