Trainer auto wait pserver ports (#13341)
* trainer auto wait pserver port ready * add file * fix docstring * add option to not wait * update api spec * clean * fix test hang fix-develop-build.sh
parent
7622234205
commit
3ab3a7f392
@ -0,0 +1,50 @@
|
||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import time
|
||||
import socket
|
||||
from contextlib import closing
|
||||
|
||||
|
||||
def _is_endpoint_ready(endpoint):
    """Return True if a TCP connect to ``endpoint`` ("ip:port") succeeds."""
    ip_port = endpoint.split(":")
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        # Bound each probe so one unreachable host cannot stall the loop.
        sock.settimeout(2)
        # connect_ex reports a failed connect() as a non-zero error code
        # instead of raising, which is what makes polling cheap here.
        return sock.connect_ex((ip_port[0], int(ip_port[1]))) == 0


def wait_server_ready(endpoints):
    """
    Wait until parameter servers are ready, use connect_ex to detect
    port readiness.

    Blocks until a TCP connection can be opened to every endpoint. Each
    polling round stops at the first endpoint that is not reachable, writes
    a notice to stderr, sleeps 3 seconds and then starts over.

    Args:
        endpoints (list): endpoints string list, like:
            ["127.0.0.1:8080", "127.0.0.1:8081"]. A single "ip:port"
            string is accepted as well and treated as a one-element list.

    Raises:
        IndexError: if an endpoint contains no ":" separator.
        ValueError: if the port part of an endpoint is not an integer.

    Examples:
        .. code-block:: python

            wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
    """
    if isinstance(endpoints, str):
        # Iterating a bare string would yield single characters.
        endpoints = [endpoints]
    else:
        # Materialize once: a one-shot iterator would be empty on the
        # second polling round and look "ready" by mistake.
        endpoints = list(endpoints)

    while not all(_is_endpoint_ready(ep) for ep in endpoints):
        sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
        sys.stderr.flush()
        time.sleep(3)
|
Loading…
Reference in new issue