【Fleet2.0 Util】 add documents (#26698)

* test=develop, util documents
revert-27520-disable_pr
123malin 4 years ago committed by GitHub
parent e9a0fbfff2
commit f36b9a7f79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -637,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
return "lo"
def __start_kv_server(self, http_server_d, size_d):
from paddle.distributed.fleet.utils import KVServer
from paddle.distributed.fleet.utils.http_server import KVServer
http_server = KVServer(int(self._http_ip_port[1]), size_d)
http_server.start()
wait_seconds = 5
@ -651,6 +651,7 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
def __init__(self, is_collective=False, init_gloo=False, **kwargs):
super(UserDefinedRoleMaker, self).__init__(
is_collective=is_collective, init_gloo=init_gloo, **kwargs)
self._init_gloo = init_gloo
def _user_defined_ps_env(self):
self._server_endpoints = self._kwargs.get("server_endpoints")

File diff suppressed because it is too large Load Diff

@ -87,7 +87,7 @@ def _parse_args():
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
''')
#Optional arguments for the launch helper
# Optional arguments for the launch helper
parser.add_argument(
"--ips",
type=str,
@ -115,7 +115,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
default="log",
help="The path for each process's log.If it's not set, the log will printed to default pipe."
)
#positional
# positional
parser.add_argument(
"training_script",
type=str,
@ -124,7 +124,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
"followed by all the arguments for the "
"training script")
#rest from the training program
# rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
@ -138,7 +138,7 @@ def get_cluster_from_args(args, gpus):
# node_ip = args.node_ip
assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
% (node_ip, node_ips)
% (node_ip, node_ips)
node_rank = node_ips.index(node_ip)
logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
@ -280,7 +280,7 @@ def launch_ps(args):
_, current_node_ip = get_host_name_ip()
assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
% (current_node_ip, node_ips)
% (current_node_ip, node_ips)
node_rank = node_ips.index(current_node_ip)
logger.debug(
"parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
@ -323,10 +323,12 @@ def launch_ps(args):
for idx, cur_server in enumerate(pod.servers):
proc_env = {
"PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
"PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
"PADDLE_PORT": cur_server.endpoint.split(":")[1],
"TRAINING_ROLE": "PSERVER",
"PADDLE_TRAINERS_NUM": str(worker_num),
"POD_IP": cur_server.endpoint.split(":")[0]
"POD_IP": cur_server.endpoint.split(":")[0],
"PADDLE_WITH_GLOO": "1"
}
current_env.update(proc_env)
@ -365,7 +367,8 @@ def launch_ps(args):
"PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
"PADDLE_TRAINERS_NUM": str(worker_num),
"TRAINING_ROLE": "TRAINER",
"PADDLE_TRAINER_ID": str(cur_worker.rank)
"PADDLE_TRAINER_ID": str(cur_worker.rank),
"PADDLE_WITH_GLOO": "1"
}
current_env.update(proc_env)
@ -430,7 +433,11 @@ def launch():
co_arg for co_arg in collective_args
if co_arg in " ".join(sys.argv[1:-1])
]
cuda_device_num = fluid.core.get_cuda_device_count()
if fluid.core.is_compiled_with_cuda():
cuda_device_num = fluid.core.get_cuda_device_count()
else:
cuda_device_num = 0
if len(has_ps_args) > 0 or cuda_device_num == 0:
logger.info(
"Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".

@ -11,8 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fs import *
from .http_server import KVHandler, KVHTTPServer, KVServer
#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__

File diff suppressed because it is too large Load Diff

@ -98,7 +98,7 @@ class AutoCheckpointChecker(object):
self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache")
self._save_checkpoint_inter = int(
os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900")) #s
os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900")) # s
if not self._ce_test:
assert len(self._hdfs_home) > 3 and \
@ -132,7 +132,7 @@ class AutoCheckpointChecker(object):
if in_dygraph_mode():
return False
return self._run_env is not None and \
return self._run_env is not None and \
self._platform is not None and \
self._job_id is not None and \
self._hdfs_home is not None and \

@ -19,7 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys
from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]

@ -67,13 +67,13 @@ class AutoCheckpointTestDist(AutoCheckPointACLBase):
save_dir = "./run_save_0"
fs.delete(save_dir)
#basic
# basic
exe, main_prog, startup_prog = self._generate()
compiled, data_loader, optimizer, loss, image, label = \
self._init_env(exe, main_prog, startup_prog, minimize=False)
#fleet
# fleet
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
from paddle.fluid.incubate.fleet.base.role_maker import \
GeneralRoleMaker
from paddle.distributed.fleet.utils import KVHandler
from paddle.distributed.fleet.utils import KVServer
from paddle.distributed.fleet.utils import KVHTTPServer
from paddle.distributed.fleet.utils.http_server import KVHandler
from paddle.distributed.fleet.utils.http_server import KVServer
from paddle.distributed.fleet.utils.http_server import KVHTTPServer
except:
print("warning: no fleet, skip test_pslib_4")
return

@ -81,12 +81,12 @@ class TestFleetUtil(unittest.TestCase):
self.assertEqual(user_id, 10)
def test_fs(self):
from paddle.distributed.fleet.utils import LocalFS
from paddle.distributed.fleet.utils.fs import LocalFS
fs = LocalFS()
dirs, files = fs.ls_dir("test_tmp")
dirs, files = fs.ls_dir("./")
self.assertFalse(fs.need_upload_download())
fleet_util.set_file_system(fs)
fleet_util._set_file_system(fs)
def test_barrier(self):
try:

@ -20,7 +20,7 @@ import os
import sys
import inspect
from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.distributed.fleet.utils.fs import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
class FSTest(unittest.TestCase):

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys
from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]
from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
class FSTest1(FSTestBase):
def test_timeout(self):

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys
from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]
from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
class FSTest2(FSTestBase):
def test_hdfs(self):

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys
from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]
from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
class FSTest3(FSTestBase):
def test_hdfs(self):

Loading…
Cancel
Save