parent
							
								
									1ac31d3d08
								
							
						
					
					
						commit
						ef35c4ed1a
					
				| @ -1,18 +1,35 @@ | ||||
| #FROM python:2.7.14 | ||||
| FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04 | ||||
| RUN apt-get update && apt-get install -y python | ||||
| RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev | ||||
| # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, | ||||
| #       so we must build one with distribute support to install in this image. | ||||
| 
 | ||||
| # you can get mirror list here: | ||||
| # https://launchpad.net/ubuntu/+archivemirrors | ||||
| ARG UBUNTU_MIRROR | ||||
| # Point apt at the mirror given by --build-arg UBUNTU_MIRROR (no-op when unset). | ||||
| # The sed expression is double-quoted so it stays inside the single-quoted | ||||
| # bash -c script instead of terminating it and being expanded by the outer shell. | ||||
| RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i "s#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g" /etc/apt/sources.list; fi' | ||||
| 
 | ||||
| RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev | ||||
| RUN pip install -U kubernetes opencv-python | ||||
| 
 | ||||
| RUN pip install paddlepaddle | ||||
| # If the network is slow, you may need to set a proxy here. | ||||
| # ENV https_proxy= | ||||
| RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python' | ||||
| RUN pip uninstall -y paddlepaddle | ||||
| # Unset the proxy if it was set above. | ||||
| # ENV https_proxy="" | ||||
| 
 | ||||
| # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF, | ||||
| #       so we must build one with distribute support to install in this image. | ||||
| ADD *.whl / | ||||
| RUN pip install /*.whl && rm -f /*.whl | ||||
| ENV LD_LIBRARY_PATH=/usr/local/lib | ||||
| 
 | ||||
| # tf k8s | ||||
| RUN pip install tensorflow==1.4.0 | ||||
| ADD tf_k8s /usr/bin | ||||
| RUN chmod +x /usr/bin/tf_k8s | ||||
| ADD vgg16_tf.py /workspace/ | ||||
| 
 | ||||
| # below lines may change a lot for debugging | ||||
| ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin | ||||
| ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root | ||||
| ADD *.whl / | ||||
| RUN pip install /*.whl && rm -f /*.whl && \ | ||||
| chmod +x /usr/bin/paddle_k8s | ||||
| ENV LD_LIBRARY_PATH=/usr/local/lib | ||||
| RUN chmod +x /usr/bin/paddle_k8s | ||||
| ADD vgg16_fluid.py vgg16_v2.py /workspace/ | ||||
|  | ||||
| @ -0,0 +1,82 @@ | ||||
| #!/bin/bash | ||||
| # Report why the trainer stopped: writes a short reason for the exit statuses | ||||
| # listed below to /dev/termination-log, then exits the script with that status. | ||||
| #   $1 - exit status returned by the trainer command | ||||
| check_trainer_ret() { | ||||
|   ret=$1 | ||||
|   stdbuf -oL echo "job returned $ret...setting pod return message..." | ||||
|   stdbuf -oL echo "===============================" | ||||
| 
 | ||||
|   # Match on the quoted value so an empty status cannot break the comparison. | ||||
|   case "$ret" in | ||||
|     136) echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log ;; | ||||
|     139) echo "Segmentation Fault" > /dev/termination-log ;; | ||||
|     1)   echo "General Error" > /dev/termination-log ;; | ||||
|     134) echo "Program Abort" > /dev/termination-log ;; | ||||
|   esac | ||||
|   stdbuf -oL echo "termination log wroted..." | ||||
|   exit $ret | ||||
| } | ||||
| 
 | ||||
| g_pservers="" | ||||
| g_trainers="" | ||||
| 
 | ||||
| # Run k8s_tools.py wait_pods_running for the pserver and trainer pods of this | ||||
| # job, then store the output of fetch_endpoints in g_pservers and g_trainers. | ||||
| # Reads JOB_NAME, PSERVERS_NUM, TRAINERS_NUM and PORT from the environment. | ||||
| wait_running_pods(){ | ||||
|   # Label selectors identifying this job's pods. | ||||
|   pserver_label="tf-job-pserver=${JOB_NAME}" | ||||
|   trainer_label="tf-job-trainer=${JOB_NAME}" | ||||
| 
 | ||||
|   # Pservers are waited for first, then trainers. | ||||
|   stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM} | ||||
|   stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM} | ||||
| 
 | ||||
|   # Results are kept in globals so the start_tf_* functions can build their command line. | ||||
|   g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT}) | ||||
|   g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT}) | ||||
| } | ||||
| 
 | ||||
| # Launch ${ENTRY} as a parameter server from ${TRAINER_PACKAGE}. | ||||
| # The task index comes from k8s_tools.py fetch_id for the pserver label. | ||||
| # Unlike start_tf_trainer, the exit status is not passed to check_trainer_ret. | ||||
| start_tf_pserver(){ | ||||
|   # Fills g_pservers and g_trainers. | ||||
|   wait_running_pods | ||||
| 
 | ||||
|   label="tf-job-pserver=${JOB_NAME}" | ||||
|   pserver_id=$(python /root/k8s_tools.py fetch_id ${label}) | ||||
| 
 | ||||
|   cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ | ||||
|   --job_name=${TF_JOB_NAME} --task_index=${pserver_id}" | ||||
| 
 | ||||
|   # Run from the package directory so ENTRY can use relative paths. | ||||
|   stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" | ||||
| } | ||||
| 
 | ||||
| # Launch ${ENTRY} as a trainer from ${TRAINER_PACKAGE}, adding --batch_size, | ||||
| # then hand its exit status to check_trainer_ret (which exits the script). | ||||
| # The task index comes from k8s_tools.py fetch_id for the trainer label. | ||||
| start_tf_trainer(){ | ||||
|   # Fills g_pservers and g_trainers. | ||||
|   wait_running_pods | ||||
| 
 | ||||
|   label="tf-job-trainer=${JOB_NAME}" | ||||
|   trainer_id=$(python /root/k8s_tools.py fetch_id ${label}) | ||||
| 
 | ||||
|   cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \ | ||||
|   --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}" | ||||
| 
 | ||||
|   # Run from the package directory so ENTRY can use relative paths. | ||||
|   stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}" | ||||
|   # $? must be read immediately after the trainer command. | ||||
|   check_trainer_ret $? | ||||
| } | ||||
| 
 | ||||
| # Pick the role from TF_JOB_NAME: "worker" runs a trainer, anything else a pserver. | ||||
| start_tf(){ | ||||
|     case "${TF_JOB_NAME}" in | ||||
|         worker) | ||||
|             start_tf_trainer | ||||
|             ;; | ||||
|         *) | ||||
|             start_tf_pserver | ||||
|             ;; | ||||
|     esac | ||||
| } | ||||
| 
 | ||||
| # Print the list of supported sub-commands to stdout. | ||||
| usage() { | ||||
|     printf '%s\n' \ | ||||
|         "usage: tf_k8s [<args>]:" \ | ||||
|         "  start_tf         Start tensorflow jobs" | ||||
| } | ||||
| 
 | ||||
| # Dispatch on the first argument. | ||||
| case "$1" in | ||||
|     start_tf) | ||||
|         start_tf | ||||
|         ;; | ||||
|     --help) | ||||
|         usage | ||||
|         ;; | ||||
|     *) | ||||
|         # Unknown or missing sub-command: show usage and fail so the pod | ||||
|         # does not report success for a mistyped command. | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
| esac | ||||
| @ -0,0 +1,56 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: ReplicaSet | ||||
| metadata: | ||||
|   name: vgg16job-tf-pserver | ||||
| spec: | ||||
|   replicas: 10 | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         tf-job-pserver: vgg16job-tf | ||||
|     spec: | ||||
|       hostNetwork: true | ||||
|       imagePullSecrets: | ||||
|       - name: job-registry-secret | ||||
|       containers: | ||||
|       - name: pserver | ||||
|         image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" | ||||
|         imagePullPolicy: Always | ||||
|         command: ["tf_k8s", "start_tf"] | ||||
|         # NOTE(review): the declared containerPort (30236) differs from the PORT | ||||
|         # value ("32036") passed to the container below; this looks like a | ||||
|         # transposed digit - confirm which port the job actually listens on. | ||||
|         ports: | ||||
|         - name: jobport-30236 | ||||
|           containerPort: 30236 | ||||
|         env: | ||||
|         - name: PORT | ||||
|           value: "32036" | ||||
|         - name: ENTRY | ||||
|           value: "python vgg16_tf.py" | ||||
|         - name: JOB_NAME | ||||
|           value: vgg16job-tf | ||||
|         - name: PSERVERS_NUM | ||||
|           value: "10" | ||||
|         - name: TF_JOB_NAME  | ||||
|           value: "ps" | ||||
|         - name: TRAINERS_NUM | ||||
|           value: "20" | ||||
|         - name: BATCH_SIZE | ||||
|           value: "128" | ||||
|         - name: TRAINER_PACKAGE | ||||
|           value: "/workspace" | ||||
|         - name: NUM_PASSES | ||||
|           value: "1" | ||||
|         - name: NAMESPACE | ||||
|           valueFrom: | ||||
|             fieldRef: | ||||
|               fieldPath: "metadata.namespace" | ||||
|         - name: POD_IP | ||||
|           valueFrom: | ||||
|             fieldRef: | ||||
|               fieldPath: "status.podIP" | ||||
|         resources: | ||||
|           requests: | ||||
|             memory: 10Gi | ||||
|             cpu: 4 | ||||
|           limits: | ||||
|             memory: 10Gi | ||||
|             cpu: 4 | ||||
| @ -0,0 +1,58 @@ | ||||
| apiVersion: batch/v1 | ||||
| kind: Job | ||||
| metadata: | ||||
|   name: vgg16job-tf-trainer | ||||
| spec: | ||||
|   parallelism: 20 | ||||
|   completions: 20 | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         tf-job-trainer: vgg16job-tf | ||||
|     spec: | ||||
|       imagePullSecrets: | ||||
|       - name: job-registry-secret | ||||
|       hostNetwork: true | ||||
|       containers: | ||||
|       - name: trainer | ||||
|         image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16" | ||||
|         imagePullPolicy: Always | ||||
|         command: ["tf_k8s", "start_tf"] | ||||
|         # NOTE(review): the declared containerPort (30236) differs from the PORT | ||||
|         # value ("32036") passed to the container below; this looks like a | ||||
|         # transposed digit - confirm which port the job actually listens on. | ||||
|         ports: | ||||
|         - name: jobport-30236 | ||||
|           containerPort: 30236 | ||||
|         env: | ||||
|         - name: PORT | ||||
|           value: "32036" | ||||
|         - name: JOB_NAME | ||||
|           value: vgg16job-tf | ||||
|         - name: TF_JOB_NAME  | ||||
|           value: "worker" | ||||
|         - name: ENTRY | ||||
|           value: "python vgg16_tf.py" | ||||
|         - name: PSERVERS_NUM | ||||
|           value: "10" | ||||
|         - name: BATCH_SIZE | ||||
|           value: "128" | ||||
|         - name: TRAINERS_NUM | ||||
|           value: "20" | ||||
|         - name: TRAINER_PACKAGE | ||||
|           value: "/workspace" | ||||
|         - name: NUM_PASSES | ||||
|           value: "1" | ||||
|         - name: NAMESPACE | ||||
|           valueFrom: | ||||
|             fieldRef: | ||||
|               fieldPath: "metadata.namespace" | ||||
|         - name: POD_IP | ||||
|           valueFrom: | ||||
|             fieldRef: | ||||
|               fieldPath: "status.podIP" | ||||
|         resources: | ||||
|           requests: | ||||
|             memory: 40Gi | ||||
|             cpu: 2 | ||||
|           limits: | ||||
|             memory: 40Gi | ||||
|             cpu: 2 | ||||
|       restartPolicy: Never | ||||
											
												
													File diff suppressed because it is too large
													Load Diff
												
											
										
									
								
					Loading…
					
					
				
		Reference in new issue