@@ -31,6 +31,9 @@ import paddle.fluid.dygraph as dygraph
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid.dygraph.parallel import DataParallel
+from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+
 RUN_STEP = 5
 DEFAULT_BATCH_SIZE = 2
@@ -44,6 +47,10 @@ def my_print(class_name, log_str):
         sys.stderr.buffer.write(pickle.dumps(print_str))
 
 
+def eprint(*args, **kwargs):
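+    # stdout is reserved for the pickled results, so diagnostics go to stderr.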
+    print(*args, file=sys.stderr, **kwargs)
+
+
 class TestDistRunnerBase(object):
     def get_model(self,
                   batch_size=DEFAULT_BATCH_SIZE,
@@ -96,6 +103,72 @@ class TestDistRunnerBase(object):
         exe.run(pserver_prog)
         my_print(type(self).__name__, "run pserver main program done.")
 
+    def run_gpu_fleet_api_trainer(self, args):
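+        # Train through the collective fleet API and emit per-step losses
+        # for the parent test process to compare.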
+        assert args.update_method == "nccl2"
+
+        self.lr = args.lr
+
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_threads = 1
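+
+        # Use tiny fusion thresholds (1 MB, one layer per fused op) so the
+        # fused all-reduce path is exercised even by a small test model.
+        # NOTE: 'fuse_laryer_size' is kept as-is to match the attribute
+        # spelling in DistributedStrategy.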
+        dist_strategy = DistributedStrategy()
+        dist_strategy.exec_strategy = exec_strategy
+        dist_strategy.fuse_memory_size = 1  # MB
+        dist_strategy.fuse_laryer_size = 1
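+
+        # PaddleCloudRoleMaker derives rank and peer endpoints from the
+        # PADDLE_* environment variables exported by the test harness.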
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+        my_print("gpu_fleet", "fleet.node_num:")
+        #"fleet.node_id:", fleet.node_id(),
+        #"fleet.trainer_num:", fleet.worker_num())
+
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
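+
+        # fleet.main_program is the transformed program that is actually run;
+        # fleet._origin_program keeps the original data variables for feeding.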
+        trainer_prog = fleet._origin_program
+        dist_prog = fleet.main_program
+
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        place = fluid.CUDAPlace(device_id)
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        eprint(type(self).__name__, "run worker startup program done.")
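+
+        # Feed every data-layer variable of the original program.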
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.values()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
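+
+        # With reader allocation enabled, trainer k keeps only the samples at
+        # offsets congruent to k mod 2, so the two trainers together consume
+        # exactly one local batch.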
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.update_method != "local" and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        my_print(type(self).__name__, "begin to train on trainer")
+        out_losses = []
+        for i in six.moves.xrange(RUN_STEP):
+            loss, = exe.run(dist_prog,
+                            fetch_list=[avg_cost.name],
+                            feed=feeder.feed(get_data()))
+            out_losses.append(loss[0])
+            my_print(type(self).__name__, "run step %d finished" % i)
+        my_print(type(self).__name__, "trainer run finished")
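+
+        # Emit the losses as pickled bytes on stdout; the parent process
+        # (_run_cluster_nccl2) unpickles and compares them.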
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
+
     def run_trainer(self, args):
         self.lr = args.lr
         if args.nccl2_reduce_layer_local_run:
@@ -318,6 +391,7 @@ def runtime_main(test_class):
     parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
     parser.add_argument('--enable_backward_deps', action='store_true')
     parser.add_argument('--use_hallreduce', action='store_true')
+    parser.add_argument('--gpu_fleet_api', action='store_true')
     parser.add_argument(
         '--hallreduce_inter_nranks', type=int, required=False, default=2)
     parser.add_argument(
@@ -344,6 +418,8 @@ def runtime_main(test_class):
     model = test_class()
     if args.role == "pserver" and args.update_method == "pserver":
         model.run_pserver(args)
+    elif args.gpu_fleet_api:
+        model.run_gpu_fleet_api_trainer(args)
     else:
         model.run_trainer(args)
 
@@ -397,6 +473,7 @@ class TestDistBase(unittest.TestCase):
         self._dygraph = False
         self._nccl_comm_num = 1
         self._enable_backward_deps = False
+        self._gpu_fleet_api = False
         self._use_hallreduce = False
         self._setup_config()
         self._after_setup_config()
@@ -600,7 +677,9 @@ class TestDistBase(unittest.TestCase):
             env.update({
                 "CUDA_VISIBLE_DEVICES": "{}".format(trainer_id),
                 "PADDLE_TRAINERS_NUM": "{}".format(trainer_num),
-                "PADDLE_TRAINER_ID": "{}".format(trainer_id)
+                "PADDLE_TRAINER_ID": "{}".format(trainer_id),
+                "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints,
+                "PADDLE_CURRENT_ENDPOINT": ep,
             })
         else:
             env.update({'CPU_NUM': '1'})
@@ -620,6 +699,9 @@ class TestDistBase(unittest.TestCase):
         if self._enable_backward_deps:
             tr_cmd += " --enable_backward_deps"
 
+        if self._gpu_fleet_api:
+            tr_cmd += " --gpu_fleet_api"
+
         return tr_cmd, env
 
     def _run_cluster_nccl2(self, model, envs, nccl2_reduce_layer,
@@ -669,6 +751,9 @@ class TestDistBase(unittest.TestCase):
             pipes[i].close()
             sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
 
+        if check_error_log:
+            print("outs[0]:", outs[0])
+            print("outs[1]:", outs[1])
         return pickle.loads(outs[0]), pickle.loads(outs[1])
 
     def check_with_place(self,