Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into remove_evaluator
commit
101378c878
@ -1,18 +1,35 @@
|
||||
#FROM python:2.7.14
|
||||
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
|
||||
RUN apt-get update && apt-get install -y python
|
||||
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
|
||||
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
|
||||
# so we must build one with distribute support to install in this image.
|
||||
|
||||
# you can get mirror list here:
|
||||
# https://launchpad.net/ubuntu/+archivemirrors
|
||||
ARG UBUNTU_MIRROR
|
||||
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
|
||||
|
||||
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
|
||||
RUN pip install -U kubernetes opencv-python
|
||||
|
||||
RUN pip install paddlepaddle
|
||||
# If the network is slow, you may need to add a proxy here.
|
||||
# ENV https_proxy=
|
||||
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
|
||||
RUN pip uninstall -y paddlepaddle
|
||||
# Unset the proxy if it was set.
|
||||
# ENV https_proxy=""
|
||||
|
||||
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
|
||||
# so we must build one with distribute support to install in this image.
|
||||
ADD *.whl /
|
||||
RUN pip install /*.whl && rm -f /*.whl
|
||||
ENV LD_LIBRARY_PATH=/usr/local/lib
|
||||
|
||||
# tf k8s
|
||||
RUN pip install tensorflow==1.4.0
|
||||
ADD tf_k8s /usr/bin
|
||||
RUN chmod +x /usr/bin/tf_k8s
|
||||
ADD vgg16_tf.py /workspace/
|
||||
|
||||
# below lines may change a lot for debugging
|
||||
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
|
||||
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
|
||||
ADD *.whl /
|
||||
RUN pip install /*.whl && rm -f /*.whl && \
|
||||
chmod +x /usr/bin/paddle_k8s
|
||||
ENV LD_LIBRARY_PATH=/usr/local/lib
|
||||
RUN chmod +x /usr/bin/paddle_k8s
|
||||
ADD vgg16_fluid.py vgg16_v2.py /workspace/
|
||||
|
@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
check_trainer_ret() {
|
||||
ret=$1
|
||||
stdbuf -oL echo "job returned $ret...setting pod return message..."
|
||||
stdbuf -oL echo "==============================="
|
||||
|
||||
if [ $ret -eq 136 ] ; then
|
||||
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
|
||||
elif [ $ret -eq 139 ] ; then
|
||||
echo "Segmentation Fault" > /dev/termination-log
|
||||
elif [ $ret -eq 1 ] ; then
|
||||
echo "General Error" > /dev/termination-log
|
||||
elif [ $ret -eq 134 ] ; then
|
||||
echo "Program Abort" > /dev/termination-log
|
||||
fi
|
||||
stdbuf -oL echo "termination log wroted..."
|
||||
exit $ret
|
||||
}
|
||||
|
||||
g_pservers=""
|
||||
g_trainers=""
|
||||
|
||||
wait_running_pods(){
|
||||
pserver_label="tf-job-pserver=${JOB_NAME}"
|
||||
trainer_label="tf-job-trainer=${JOB_NAME}"
|
||||
|
||||
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
|
||||
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
|
||||
|
||||
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
|
||||
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
|
||||
}
|
||||
|
||||
start_tf_pserver(){
|
||||
wait_running_pods
|
||||
|
||||
label="tf-job-pserver=${JOB_NAME}"
|
||||
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
|
||||
|
||||
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
|
||||
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
|
||||
|
||||
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
|
||||
}
|
||||
|
||||
start_tf_trainer(){
|
||||
wait_running_pods
|
||||
|
||||
label="tf-job-trainer=${JOB_NAME}"
|
||||
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
|
||||
|
||||
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
|
||||
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
|
||||
|
||||
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
|
||||
check_trainer_ret $?
|
||||
}
|
||||
|
||||
start_tf(){
|
||||
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
|
||||
start_tf_trainer
|
||||
else
|
||||
start_tf_pserver
|
||||
fi
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "usage: tf_k8s [<args>]:"
|
||||
echo " start_tf Start tensorflow jobs"
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
start_tf)
|
||||
start_tf
|
||||
;;
|
||||
--help)
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
;;
|
||||
esac
|
@ -0,0 +1,56 @@
|
||||
apiVersion: extensions/v1beta1
|
||||
kind: ReplicaSet
|
||||
metadata:
|
||||
name: vgg16job-tf-pserver
|
||||
spec:
|
||||
replicas: 10
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
tf-job-pserver: vgg16job-tf
|
||||
spec:
|
||||
hostNetwork: true
|
||||
imagePullSecrets:
|
||||
- name: job-registry-secret
|
||||
containers:
|
||||
- name: pserver
|
||||
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
|
||||
imagePullPolicy: Always
|
||||
command: ["tf_k8s", "start_tf"]
|
||||
ports:
|
||||
- name: jobport-30236
|
||||
containerPort: 30236
|
||||
env:
|
||||
- name: PORT
|
||||
value: "32036"
|
||||
- name: ENTRY
|
||||
value: "python vgg16_tf.py"
|
||||
- name: JOB_NAME
|
||||
value: vgg16job-tf
|
||||
- name: PSERVERS_NUM
|
||||
value: "10"
|
||||
- name: TF_JOB_NAME
|
||||
value: "ps"
|
||||
- name: TRAINERS_NUM
|
||||
value: "20"
|
||||
- name: BATCH_SIZE
|
||||
value: "128"
|
||||
- name: TRAINER_PACKAGE
|
||||
value: "/workspace"
|
||||
- name: NUM_PASSES
|
||||
value: "1"
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "metadata.namespace"
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "status.podIP"
|
||||
resources:
|
||||
requests:
|
||||
memory: 10Gi
|
||||
cpu: 4
|
||||
limits:
|
||||
memory: 10Gi
|
||||
cpu: 4
|
@ -0,0 +1,58 @@
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: vgg16job-tf-trainer
|
||||
spec:
|
||||
parallelism: 20
|
||||
completions: 20
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
tf-job-trainer: vgg16job-tf
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: job-registry-secret
|
||||
hostNetwork: true
|
||||
containers:
|
||||
- name: trainer
|
||||
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
|
||||
imagePullPolicy: Always
|
||||
command: ["tf_k8s", "start_tf"]
|
||||
ports:
|
||||
- name: jobport-30236
|
||||
containerPort: 30236
|
||||
env:
|
||||
- name: PORT
|
||||
value: "32036"
|
||||
- name: JOB_NAME
|
||||
value: vgg16job-tf
|
||||
- name: TF_JOB_NAME
|
||||
value: "worker"
|
||||
- name: ENTRY
|
||||
value: "python vgg16_tf.py"
|
||||
- name: PSERVERS_NUM
|
||||
value: "10"
|
||||
- name: BATCH_SIZE
|
||||
value: "128"
|
||||
- name: TRAINERS_NUM
|
||||
value: "20"
|
||||
- name: TRAINER_PACKAGE
|
||||
value: "/workspace"
|
||||
- name: NUM_PASSES
|
||||
value: "1"
|
||||
- name: NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "metadata.namespace"
|
||||
- name: POD_IP
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: "status.podIP"
|
||||
resources:
|
||||
requests:
|
||||
memory: 40Gi
|
||||
cpu: 2
|
||||
limits:
|
||||
memory: 40Gi
|
||||
cpu: 2
|
||||
restartPolicy: Never
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,41 @@
|
||||
if(NOT WITH_GPU)
|
||||
return()
|
||||
endif()
|
||||
|
||||
|
||||
set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
|
||||
find_path(CUPTI_INCLUDE_DIR cupti.h
|
||||
PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
|
||||
$ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
|
||||
NO_DEFAULT_PATH
|
||||
)
|
||||
|
||||
# Directory containing the CUDA runtime library, used below as an extra search
# location for libcupti. The expansion is quoted so that an empty
# CUDA_CUDART_LIBRARY yields an empty result instead of an argument-count error.
get_filename_component(__libpath_hist "${CUDA_CUDART_LIBRARY}" PATH)
|
||||
|
||||
set(TARGET_ARCH "x86_64")
|
||||
# Override the x86_64 default only when CMake actually reports a processor.
# The bare variable name lets if() dereference it once; the previous
# `NOT ${...}` form was inverted and clobbered the default with an empty value
# when the processor was unknown.
if(CMAKE_SYSTEM_PROCESSOR)
|
||||
set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
|
||||
endif()
|
||||
|
||||
list(APPEND CUPTI_CHECK_LIBRARY_DIRS
|
||||
${CUPTI_ROOT}
|
||||
${CUPTI_ROOT}/lib64
|
||||
${CUPTI_ROOT}/lib
|
||||
${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
|
||||
$ENV{CUPTI_ROOT}
|
||||
$ENV{CUPTI_ROOT}/lib64
|
||||
$ENV{CUPTI_ROOT}/lib
|
||||
/usr/lib
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
|
||||
find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
|
||||
PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
|
||||
NO_DEFAULT_PATH
|
||||
DOC "Path to cuPTI library.")
|
||||
|
||||
get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
|
||||
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
|
||||
set(CUPTI_FOUND ON)
|
||||
else()
|
||||
set(CUPTI_FOUND OFF)
|
||||
endif()
|
@ -0,0 +1,18 @@
|
||||
======================
|
||||
Fluid
|
||||
======================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
layers.rst
|
||||
data_feeder.rst
|
||||
executor.rst
|
||||
initializer.rst
|
||||
evaluator.rst
|
||||
nets.rst
|
||||
optimizer.rst
|
||||
param_attr.rst
|
||||
profiler.rst
|
||||
regularizer.rst
|
||||
io.rst
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue