@@ -87,7 +87,7 @@ def _parse_args():
 
 see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 ''')
-    #Optional arguments for the launch helper
+    # Optional arguments for the launch helper
     parser.add_argument(
         "--ips",
         type=str,
@@ -115,7 +115,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default="log",
         help="The path for each process's log. If it's not set, the log will be printed to the default pipe."
     )
-    #positional
+    # positional
     parser.add_argument(
         "training_script",
         type=str,
@@ -124,7 +124,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         "followed by all the arguments for the "
         "training script")
 
-    #rest from the training program
+    # rest from the training program
     parser.add_argument('training_script_args', nargs=REMAINDER)
     return parser.parse_args()
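
Note: `nargs=REMAINDER` is what lets the launcher forward unparsed flags to the user's script — argparse stops option parsing at the positional `training_script` and hands everything after it through verbatim. A minimal standalone sketch of that behavior (the flag values are illustrative, not Paddle's actual parser):

    import argparse
    from argparse import REMAINDER

    parser = argparse.ArgumentParser()
    parser.add_argument("--ips", type=str, default="127.0.0.1")
    parser.add_argument("training_script", type=str)
    parser.add_argument("training_script_args", nargs=REMAINDER)

    args = parser.parse_args(
        ["--ips", "10.0.0.1,10.0.0.2", "train.py", "--lr", "0.01"])
    assert args.training_script == "train.py"
    assert args.training_script_args == ["--lr", "0.01"]
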
@@ -138,7 +138,7 @@ def get_cluster_from_args(args, gpus):
 
     # node_ip = args.node_ip
     assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
         % (node_ip, node_ips)
     node_rank = node_ips.index(node_ip)
 
     logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
@@ -280,7 +280,7 @@ def launch_ps(args):
     _, current_node_ip = get_host_name_ip()
 
     assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
         % (current_node_ip, node_ips)
     node_rank = node_ips.index(current_node_ip)
     logger.debug(
         "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
@@ -323,10 +323,12 @@ def launch_ps(args):
     for idx, cur_server in enumerate(pod.servers):
         proc_env = {
             "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0]
+            "POD_IP": cur_server.endpoint.split(":")[0],
+            "PADDLE_WITH_GLOO": "1"
         }
         current_env.update(proc_env)
 
@@ -365,7 +367,8 @@ def launch_ps(args):
             "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank)
+            "PADDLE_TRAINER_ID": str(cur_worker.rank),
+            "PADDLE_WITH_GLOO": "1"
         }
         current_env.update(proc_env)
 
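
Both hunks above export the same new variable, PADDLE_WITH_GLOO="1", into the environment of every spawned server and trainer via current_env.update(proc_env). A hedged sketch of how a child process could observe it; the helper name below is hypothetical, not Paddle API:

    import os

    def gloo_enabled():
        # Hypothetical helper: the launcher puts PADDLE_WITH_GLOO="1" into
        # each child's environment, so a plain os.environ lookup suffices.
        return os.environ.get("PADDLE_WITH_GLOO", "0") == "1"
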
@@ -430,7 +433,11 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
-    cuda_device_num = fluid.core.get_cuda_device_count()
+    if fluid.core.is_compiled_with_cuda():
+        cuda_device_num = fluid.core.get_cuda_device_count()
+    else:
+        cuda_device_num = 0
+
     if len(has_ps_args) > 0 or cuda_device_num == 0:
         logger.info(
             "Run parameter-server cpu mode. pserver arguments:{}, cuda count:{}".
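
The final hunk makes the CPU fallback explicit: on a build without CUDA, fluid.core.get_cuda_device_count() is never called and cuda_device_num stays 0, which routes execution to the parameter-server path. The same selection logic in isolation (the placeholder has_ps_args value is illustrative):

    import paddle.fluid as fluid

    # Guard the device query so CPU-only builds never touch the CUDA API.
    if fluid.core.is_compiled_with_cuda():
        cuda_device_num = fluid.core.get_cuda_device_count()
    else:
        cuda_device_num = 0

    has_ps_args = []  # illustrative: parsed PS flags would land here
    if len(has_ps_args) > 0 or cuda_device_num == 0:
        print("parameter-server CPU mode")
    else:
        print("collective GPU mode")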