parent bd64719a2f
commit f9db562987
Dockerfile
@@ -1,15 +1,13 @@
-#FROM paddlepaddle/paddlecloud-job
-#RUN mkdir -p /workspace
-#ADD reader.py /workspace/
-#RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD paddle_k8s /usr/bin
-ADD k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev && \
+    chmod +x /usr/bin/paddle_k8s
 # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
 # so we must build one with distribute support to install in this image.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
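For context: the NOTE in the hunk above means the wheel picked up by `ADD *.whl /` must be a PaddlePaddle build with WITH_DISTRIBUTE=ON, not the default CI wheel. A minimal build sketch, assuming a hypothetical wheel path and a placeholder image tag:

    # Copy a distribute-enabled wheel next to the Dockerfile (path is an assumption).
    cp /path/to/build/python/dist/paddlepaddle-*.whl .
    # Build the benchmark image under a placeholder tag.
    docker build -t example/paddle-vgg16:latest .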
k8s_tools.py
@@ -1,94 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/bin/env python
-import os
-import sys
-import time
-import socket
-from kubernetes import client, config
-PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
-NAMESPACE = os.getenv("NAMESPACE")
-PORT = os.getenv("PSERVER_PORT")
-if os.getenv("KUBERNETES_SERVICE_HOST", None):
-    config.load_incluster_config()
-else:
-    config.load_kube_config()
-v1 = client.CoreV1Api()
-
-
-def fetch_pods_info(label_selector):
-    api_response = v1.list_namespaced_pod(
-        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
-    pod_list = []
-    for item in api_response.items:
-        pod_list.append((item.status.phase, item.status.pod_ip))
-    return pod_list
-
-
-def wait_pods_running(label_selector, desired):
-    print "label selector: %s, desired: %s" % (label_selector, desired)
-    while True:
-        count = count_pods_by_phase(label_selector, 'Running')
-        # NOTE: pods may be scaled.
-        if count >= int(desired):
-            break
-        print 'current cnt: %d sleep for 5 seconds...' % count
-        time.sleep(5)
-
-
-def count_pods_by_phase(label_selector, phase):
-    pod_list = fetch_pods_info(label_selector)
-    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
-    return len(filtered_pod_list)
-
-
-def fetch_pserver_ips():
-    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    pserver_ips = [item[1] for item in pod_list]
-    return ",".join(pserver_ips)
-
-
-def fetch_master_ip():
-    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    master_ips = [item[1] for item in pod_list]
-    return master_ips[0]
-
-
-def fetch_trainer_id():
-    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    trainer_ips = [item[1] for item in pod_list]
-    trainer_ips.sort()
-    local_ip = socket.gethostbyname(socket.gethostname())
-    for i in xrange(len(trainer_ips)):
-        if trainer_ips[i] == local_ip:
-            return i
-    return None
-
-
-if __name__ == "__main__":
-    command = sys.argv[1]
-    if command == "fetch_pserver_ips":
-        print fetch_pserver_ips()
-    elif command == "fetch_trainer_id":
-        print fetch_trainer_id()
-    elif command == "fetch_master_ip":
-        print fetch_master_ip()
-    elif command == "count_pods_by_phase":
-        print count_pods_by_phase(sys.argv[2], sys.argv[3])
-    elif command == "wait_pods_running":
-        wait_pods_running(sys.argv[2], sys.argv[3])
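The deleted k8s_tools.py is a small CLI dispatched on sys.argv[1]; note that fetch_trainer_id derives a trainer's rank as the index of the local pod IP in the sorted list of trainer pod IPs, so ranks are only stable while the pod set is fixed. The paddle_k8s entrypoint below drives it roughly as follows (job name, namespace, and pod count are placeholder values):

    export PADDLE_JOB_NAME=vgg16job   # placeholder
    export NAMESPACE=default          # placeholder
    # Block until 2 pserver pods report phase Running.
    python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} 2
    # Comma-separated pserver pod IPs.
    export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
    # This pod's trainer rank (index of the local IP in the sorted trainer IP list).
    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)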
paddle_k8s
@@ -1,199 +0,0 @@
-#!/bin/bash
-start_pserver() {
-    stdbuf -oL paddle pserver \
-        --use_gpu=0 \
-        --port=$PADDLE_INIT_PORT \
-        --ports_num=$PADDLE_INIT_PORTS_NUM \
-        --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-        --nics=$PADDLE_INIT_NICS \
-        --comment=paddle_process_k8s \
-        --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
-}
-
-start_new_pserver() {
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
-    export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
-    stdbuf -oL /usr/bin/pserver \
-        -port=$PADDLE_INIT_PORT \
-        -num-pservers=$PSERVERS \
-        -log-level=debug \
-        -etcd-endpoint=http://$MASTER_IP:2379
-}
-
-start_master() {
-    stdbuf -oL /usr/bin/master \
-        -port=8080 \
-        -chunk-per-task=1 \
-        -task-timout-dur=16s \
-        -endpoints=http://127.0.0.1:2379
-}
-
-check_failed_cnt() {
-    max_failed=$1
-    failed_count=$(python /root/k8s_tools.py count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed)
-    if [ $failed_count -gt $max_failed ]; then
-        stdbuf -oL echo "Failed trainer count beyond the threshold: "$max_failed
-        echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log
-        exit 0
-    fi
-}
-
-check_trainer_ret() {
-    ret=$1
-    stdbuf -oL echo "job returned $ret...setting pod return message..."
-    stdbuf -oL echo "==============================="
-
-    if [ $ret -eq 136 ] ; then
-        echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
-    elif [ $ret -eq 139 ] ; then
-        echo "Segmentation Fault" > /dev/termination-log
-    elif [ $ret -eq 1 ] ; then
-        echo "General Error" > /dev/termination-log
-    elif [ $ret -eq 134 ] ; then
-        echo "Program Abort" > /dev/termination-log
-    fi
-    stdbuf -oL echo "termination log written..."
-    exit $ret
-}
-
-start_fluid_process() {
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-    if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
-        check_failed_cnt ${TRAINERS}
-        sleep 5
-        export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
-    fi
-    export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
-    stdbuf -oL sh -c "${ENTRY}"
-    check_trainer_ret $?
-}
-
-start_new_trainer() {
-    # FIXME(Yancey1989): use a command-line interface to configure the max failed count
-    check_failed_cnt ${TRAINERS}
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-    sleep 5
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-master=${PADDLE_JOB_NAME} 1
-    export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
-    export ETCD_IP="$MASTER_IP"
-
-    # NOTE: $TRAINER_PACKAGE may be large, do not copy
-    export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
-    cd $TRAINER_PACKAGE
-
-    stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
-        $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1
-
-    stdbuf -oL sh -c "${ENTRY}"
-    check_trainer_ret $?
-}
-
-start_trainer() {
-    # PaddlePaddle v1 and v2 distributed training does not allow any trainer to fail.
-    check_failed_cnt 0
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}
-
-    export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
-    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
-    stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
-    # FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
-    stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count
-
-    # NOTE: $TRAINER_PACKAGE may be large, do not copy
-    export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
-    cd $TRAINER_PACKAGE
-
-    stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
-        $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
-        "version: " $1
-
-    # FIXME: If we use the new Golang PServer, add a Kubernetes healthz probe to
-    # wait for the PServer process to be ready. For now, just sleep 20 seconds.
-    sleep 20
-
-    case "$1" in
-        "v1")
-            FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
-            if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
-                echo "file count is less than the number of trainers"
-                check_trainer_ret 0
-            fi
-            let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
-            echo "splitting file into chunks of" $lines_per_node "lines"
-            cp $TRAIN_LIST /
-            cd /
-            split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
-            CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
-            # Always use /train.list for paddle v1 on each node.
-            echo "File for current node ${CURRENT_LIST}"
-            sleep 10
-            cp $CURRENT_LIST train.list
-
-            cd $TRAINER_PACKAGE
-
-            stdbuf -oL paddle train \
-                --port=$PADDLE_INIT_PORT \
-                --nics=$PADDLE_INIT_NICS \
-                --ports_num=$PADDLE_INIT_PORTS_NUM \
-                --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-                --num_passes=$PADDLE_INIT_NUM_PASSES \
-                --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
-                --saving_period=1 \
-                --log_period=20 \
-                --local=0 \
-                --rdma_tcp=tcp \
-                --config=$TOPOLOGY \
-                --use_gpu=$PADDLE_INIT_USE_GPU \
-                --trainer_id=$PADDLE_INIT_TRAINER_ID \
-                --save_dir=$OUTPUT \
-                --pservers=$PADDLE_INIT_PSERVERS \
-                --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
-            # The paddle v1 API does not allow any trainer to fail.
-            check_trainer_ret $?
-            ;;
-        "v2")
-            stdbuf -oL sh -c "${ENTRY}"
-            # The paddle v2 API does not allow any trainer to fail.
-            check_trainer_ret $?
-            ;;
-        *)
-            ;;
-    esac
-}
-
-usage() {
-    echo "usage: paddle_k8s [<args>]:"
-    echo "    start_trainer [v1|v2]    Start a trainer process with the v1 or v2 API"
-    echo "    start_pserver            Start a pserver process"
-    echo "    start_new_pserver        Start a new pserver process"
-    echo "    start_new_trainer        Start a new trainer process"
-}
-
-case "$1" in
-    start_pserver)
-        start_pserver
-        ;;
-    start_trainer)
-        start_trainer $2
-        ;;
-    start_new_trainer)
-        start_new_trainer
-        ;;
-    start_new_pserver)
-        start_new_pserver
-        ;;
-    start_master)
-        start_master
-        ;;
-    start_fluid)
-        start_fluid_process
-        ;;
-    --help)
-        usage
-        ;;
-    *)
-        usage
-        ;;
-esac
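The deleted paddle_k8s entrypoint takes all of its configuration from environment variables set on the pod. A minimal launch sketch for a fluid trainer, with placeholder values (the ENTRY command is an assumption based on the vgg16_fluid.py script added in the Dockerfile):

    export PADDLE_JOB_NAME=vgg16job                 # placeholder job name
    export TRAINING_ROLE=TRAINER                    # or PSERVER
    export PSERVERS=2                               # pserver pod count (placeholder)
    export TRAINERS=2                               # trainer pod count (placeholder)
    export ENTRY="python /workspace/vgg16_fluid.py" # assumed entry command
    paddle_k8s start_fluid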
@ -1,16 +0,0 @@
|
||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle.v2 as paddle
|
||||
paddle.dataset.cifar.train10()
|