@@ -1,7 +0,0 @@
FROM paddledev/paddle:cpu-latest

MAINTAINER zjsxzong89@gmail.com

COPY start.sh /root/
COPY start_paddle.py /root/
CMD ["bash", "-c", "/root/start.sh"]
(Four binary images deleted: 116 KiB, 236 KiB, 225 KiB, 501 KiB.)
@@ -1,7 +0,0 @@
FROM alpine

# coreutils provides GNU split, which get_data.sh needs for --number=l/N
RUN apk update && apk upgrade && apk add coreutils
ADD quick_start /quick_start
ADD get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
@@ -1,6 +0,0 @@
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:

```
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
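The data preparation image is configured entirely through environment variables: get_data.sh (below) reads `OUT_DIR` and `SPLIT_COUNT`. A minimal sketch of running the built image, assuming the shared volume is mounted at /efs (the mount point and values here are illustrative, not from the tutorial):

```
docker run --rm \
  -e OUT_DIR=/efs/paddle-cluster-job \
  -e SPLIT_COUNT=3 \
  -v /efs:/efs \
  prepare-data-image-name
```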
@@ -1,26 +0,0 @@
#!/bin/sh

# OUT_DIR: destination directory for the prepared data
# SPLIT_COUNT: number of training nodes to shard train.txt across
out_dir=$OUT_DIR
split_count=$SPLIT_COUNT

set -e

mkdir -p $out_dir
cp -r /quick_start $out_dir/

# download and unpack the preprocessed quick_start data into node 0's directory
mkdir -p $out_dir/0/data
cd $out_dir/0/data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz

# shard train.txt into $split_count pieces without splitting lines;
# node 0 keeps the first shard
split -d --number=l/$split_count -a 5 train.txt train.
mv train.00000 train.txt

# replicate node 0's data directory for nodes 1..N-1, each taking its own shard
cd $out_dir
end=$(expr $split_count - 1)
for i in $(seq 1 $end); do
    mkdir -p $i/data
    cp -r 0/data/* $i/data
    mv $i/data/train.`printf %05d $i` $i/data/train.txt
done
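The sharding step relies on GNU split from coreutils (hence `apk add coreutils` in the Dockerfile above): `--number=l/N` divides a file into N chunks without breaking lines, and `-d -a 5` produces five-digit numeric suffixes. A quick sketch of the behavior with a split count of 3:

```
$ seq 1 9 > train.txt
$ split -d --number=l/3 -a 5 train.txt train.
$ ls train.0*
train.00000  train.00001  train.00002
$ cat train.00001
4
5
6
```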
@@ -1,6 +0,0 @@
FROM paddledev/paddle:cpu-latest

COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
CMD ["bash", "-c", "/root/start.sh"]
@@ -1,5 +0,0 @@
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:

```
docker build . -t train-image-name
```
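Before the Kubernetes job can pull this image, it has to be pushed to a registry the cluster can reach; a sketch, assuming a hypothetical Docker Hub account named your-hub-user:

```
docker tag train-image-name your-hub-user/train-image-name
docker push your-hub-user/train-image-name
```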
@@ -1,19 +0,0 @@
#!/bin/sh

set -eu

# copy the job's training config (e.g. trainer_config.lr.py) from the shared volume
jobconfig="${JOB_PATH}/${JOB_NAME}/${TRAIN_CONFIG_DIR}"
cd /root
cp -rf $jobconfig/* .

python /root/start_paddle.py \
    --dot_period=10 \
    --ports_num=$CONF_PADDLE_PORTS_NUM \
    --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
    --log_period=50 \
    --num_passes=10 \
    --trainer_count=$TRAINER_COUNT \
    --saving_period=1 \
    --local=0 \
    --config=trainer_config.lr.py \
    --use_gpu=0
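start.sh above and start_paddle.py (below) are driven entirely by environment variables, which the tutorial's Kubernetes job spec injects into each pod. For reference, these are the variables the two scripts read; every value shown is illustrative only, not taken from the source:

```
JOB_PATH=/home/jobpath              # shared volume holding job config and data
JOB_NAME=paddle-cluster-job         # Kubernetes job name, used as the label selector
JOB_NAMESPACE=default
TRAIN_CONFIG_DIR=quick_start
TRAINER_COUNT=4
CONF_PADDLE_NIC=eth0
CONF_PADDLE_PORT=7164
CONF_PADDLE_PORTS_NUM=2
CONF_PADDLE_PORTS_NUM_SPARSE=2
CONF_PADDLE_GRADIENT_NUM=3          # read as PADDLE_SERVER_NUM in start_paddle.py
```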
@@ -1,170 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import time
import socket
import os
import argparse

# configuration for cluster
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")

tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'


def refine_unknown_args(cmd_args):
    '''
    Refine unknown parameters: rewrite "--key=value" and "--key value"
    flags into a flat [key, value, ...] list.
    '''
    new_args = []
    for arg in cmd_args:
        if arg.startswith("--") and arg.find("=") != -1:
            equal_pos = arg.find("=")  # find first = pos
            arglist = list(arg)
            arglist[equal_pos] = " "
            arg = "".join(arglist)
            arg = arg.lstrip("-")
            new_args += arg.split(" ")
        elif arg.startswith("--") and arg.find("=") == -1:
            arg = arg.lstrip("-")
            new_args.append(arg)
        else:
            new_args.append(arg)
    return new_args


def isPodAllRunning(podlist):
    '''
    Check whether all pods of the job are running.
    '''
    require = len(podlist["items"])
    running = 0
    for pod in podlist["items"]:
        if pod["status"]["phase"] == "Running":
            running += 1
    print "waiting for pods running, require:", require, "running:", running
    if require == running:
        return True
    return False


def getPodList():
    '''
    Get the status of all pods of the job from the Kubernetes API server.
    '''
    apiserver = "https://" + \
        os.getenv("KUBERNETES_SERVICE_HOST") + ":" + \
        os.getenv("KUBERNETES_SERVICE_PORT_HTTPS")

    pod = API + NAMESPACE + "/pods?"
    job = JOBNAME
    if os.path.isfile(tokenpath):
        tokenfile = open(tokenpath, mode='r')
        token = tokenfile.read()
        Bearer = "Bearer " + token
        headers = {"Authorization": Bearer}
        return requests.get(apiserver + pod + JOBSELECTOR + job,
                            headers=headers,
                            verify=False).json()
    else:
        return requests.get(apiserver + pod + JOBSELECTOR + job,
                            verify=False).json()


def getIdMap(podlist):
    '''
    Generate a trainer_id for each pod by sorting the pod IPs.
    '''
    ips = []
    for pod in podlist["items"]:
        ips.append(pod["status"]["podIP"])
    ips.sort()
    idMap = {}
    for i in range(len(ips)):
        idMap[ips[i]] = i
    return idMap


def startPaddle(idMap={}, train_args_dict=None):
    '''
    Start the paddle pserver and trainer on this node.
    '''
    program = 'paddle train'
    args = " --nics=" + PADDLE_NIC
    args += " --port=" + str(PADDLE_PORT)
    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
    args += " --comment=" + "paddle_process_by_paddle"
    ip_string = ""
    for ip in idMap.keys():
        ip_string += (ip + ",")
    ip_string = ip_string.rstrip(",")
    args += " --pservers=" + ip_string
    args_ext = ""
    for key, value in train_args_dict.items():
        args_ext += (' --' + key + '=' + value)
    localIP = socket.gethostbyname(socket.gethostname())
    trainerId = idMap[localIP]
    args += " " + args_ext + " --trainer_id=" + \
        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
    logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
    if not os.path.exists(JOB_PATH_OUTPUT):
        os.makedirs(JOB_PATH_OUTPUT)
    if not os.path.exists(logDir):
        os.mkdir(logDir)
    copyCommand = 'cp -rf ' + JOB_PATH + \
        "/" + str(trainerId) + "/data/*" + " ./data/"
    os.system(copyCommand)
    startPserver = 'nohup paddle pserver' + \
        " --port=" + str(PADDLE_PORT) + \
        " --ports_num=" + str(PADDLE_PORTS_NUM) + \
        " --ports_num_for_sparse=" + str(PADDLE_PORTS_NUM_SPARSE) + \
        " --nics=" + PADDLE_NIC + \
        " --comment=" + "paddle_process_by_paddle" + \
        " --num_gradient_servers=" + str(PADDLE_SERVER_NUM) + \
        " > " + logDir + "/server.log 2>&1 &"
    print startPserver
    os.system(startPserver)
    # wait until pservers completely start
    time.sleep(20)
    startTrainer = program + args + " 2>&1 | tee " + \
        logDir + "/train.log"
    print startTrainer
    os.system(startTrainer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog="start_paddle.py", description='simple tool for k8s')
    args, train_args_list = parser.parse_known_args()
    train_args = refine_unknown_args(train_args_list)
    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
    podlist = getPodList()
    # need to wait until all pods are running
    while not isPodAllRunning(podlist):
        time.sleep(20)
        podlist = getPodList()
    idMap = getIdMap(podlist)
    startPaddle(idMap, train_args_dict)
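To trace the argument flow in start_paddle.py: the argparse parser defines no options of its own, so parse_known_args leaves every flag from start.sh in train_args_list; refine_unknown_args flattens them, dict(zip(...)) pairs them up, and startPaddle re-expands them onto the `paddle train` command line. A sketch of one flag's journey:

```
# start.sh invokes:          start_paddle.py --num_passes=10 ...
# refine_unknown_args gives: ['num_passes', '10', ...]
# train_args_dict becomes:   {'num_passes': '10', ...}
# paddle train receives:     --num_passes=10 ... --trainer_id=<id> --save_dir=<output>
```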
(Five binary images deleted: 242 KiB, 70 KiB, 35 KiB, 51 KiB, 87 KiB.)