Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_send_op

release/0.13.0
Yancey1989 7 years ago
commit ceefbf3259

@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributions" OFF)
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto) add_subdirectory(proto)
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine, # "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer. # because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer) add_subdirectory(paddle/optimizer)
@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED) find_python_module(recommonmark REQUIRED)
add_subdirectory(doc) add_subdirectory(doc)
endif() endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()

@ -101,6 +101,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22 EXPOSE 22
# the development image does the build work by default
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]

@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]

@ -1,196 +0,0 @@
# Cluster Training Benchmark
## Setup
- Platform
- Kubernetes: v1.6.2
- Linux Kernel: v3.10.0
- Resource
- CPU: 10 Cores per Pod
- Memory: 5GB per Pod
- Docker Image
We use different base Docker images to run the benchmark on Kubernetes:
- PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- TensorFlow: tensorflow/tensorflow:1.5.0-rc0
- Model
vgg16 is used in this benchmark.
## Cases
- Variable
- Batch Size of training data.
- PServer count of the training job.
- The number of trainers.
- Invariant
- The resources of each trainer/pserver Pod.
### Measure the Performance for Different Batch Size
- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
- PServer Count: 20
- Batch Size: 64
- Metrics:
$S = \frac{T_1}{T_N}$
where the speedup $S$ is the ratio of $T_1$ to $T_N$, the training times with 1 and $N$ trainers.
The parallel efficiency is:
$E = \frac{S}{N}$
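For example (illustrative numbers only, not measured results): if a single trainer takes $T_1 = 1000$ minutes per pass and 10 trainers take $T_{10} = 125$ minutes, then $S = 1000 / 125 = 8$ and $E = 8 / 10 = 0.8$, i.e. 80% parallel efficiency.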
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the Benchmark
TODO

@ -1,35 +0,0 @@
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# you can get mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# if the network is slow, you may need to add a proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# unset the proxy if it was set.
# ENV https_proxy=""
# NOTE: By default, CI-built wheel packages are built with WITH_DISTRIBUTE=OFF,
# so we must build one with distributed support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# TensorFlow and Kubernetes tooling
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# the lines below may change frequently during debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/

@ -1,195 +0,0 @@
# Performance for Distributed vgg16
## Test Result
### Hardware Information
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### BLAS Settings
The environment variable `MKL_NUM_THREADS=1` is set.
### Single Node Single Thread
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Acceleration Rate
- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec (parallel efficiency in parentheses; see the note after the table)
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
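The percentages in parentheses are not defined in the text; they appear to be the parallel efficiency relative to the single-thread throughput at the same batch size (taken from the "Single Node Single Thread" table above). A minimal sketch of that interpretation, assuming this is how the table was produced:

```python
# Assumed formula: efficiency = distributed throughput / (trainer count * single-thread throughput).
# 16.74 samples/sec is the Fluid single-thread throughput at batch size 128 (see the first table).
single_thread_fluid = 16.74

for trainers, throughput in [(20, 263.29), (40, 518.80), (80, 836.26), (100, 1019.29)]:
    efficiency = throughput / (trainers * single_thread_fluid)
    print("%3d trainers: %.2f%%" % (trainers, efficiency * 100))

# Prints 78.64%, 77.48%, 62.44%, 60.89% -- matching the parenthesized values
# in the Fluid row up to rounding.
```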
### Different PServer Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid (to be fixed in a follow-up PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fluid and v2 comes from network interference.*
## Steps to Run the Performance Test
1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can pull it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbose Logs
Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
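For example, under each container's `env:` list in those files one would add entries of the form `- name: GLOG_v` with `value: "3"` and `- name: GLOG_logtostderr` with `value: "1"`, alongside the environment variables already defined there.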

@ -1,72 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,69 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,21 +0,0 @@
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &

@ -1,82 +0,0 @@
#!/bin/bash
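# Map the trainer's exit code to a human-readable message in /dev/termination-log and exit with that code.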
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
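# Wait until the expected number of pserver and trainer pods are Running, then record their endpoints.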
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac

@ -1,56 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,58 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,64 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,65 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,154 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the input feature has 3 channels
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: v2 distributed training needs averaged updates, hence the learning rate is divided by the node count.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()

@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize', '--memory_optimize',
action='store_true', action='store_true',
help='If set, optimize runtime memory before start.') help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set, omit the actual data reading operators.')
parser.add_argument( parser.add_argument(
'--update_method', '--update_method',
type=str, type=str,
@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog) exe.run(train_prog)
return return
if args.use_fake_data:
raise Exception(
"fake data is not supported in single GPU test for now.")
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_prog) exe.run(startup_prog)
@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var, batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id): num_trainers, trainer_id):
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
# generate fake data: replace each feed variable with a persistable constant-filled tensor so no real data is read
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
# FIXME(wuyi): wait for other trainers to start listening
time.sleep(30)
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog) startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy, exec_strategy=strategy,
num_trainers=num_trainers, num_trainers=num_trainers,
trainer_id=trainer_id) trainer_id=trainer_id)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place) feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
num_samples = 0 num_samples = 0
@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0 num_samples = 0
if iters == args.iterations: if iters == args.iterations:
break break
loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.use_fake_data:
loss, = exe.run([avg_loss.name])
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver": if args.update_method == "pserver":
exe.bcast_params() exe.bcast_params()
num_samples += len(data) num_samples += len(data)

@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# these settings before you run on your own cluster. # these settings before you run on your own cluster.
envs.append({ envs.append({

@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP" "fieldPath": "status.podIP"
} }
} }
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
} }
] ]

@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS) add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS) endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER) if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER) endif(NOT WITH_PROFILER)

@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS} ${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}

@ -1003,9 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss .. autofunction:: paddle.fluid.layers.dice_loss
:noindex: :noindex:
bilinear_interp upsampling_bilinear2d
____ ____
.. autofunction:: paddle.fluid.layers.bilinear_interp .. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex: :noindex:

@ -35,13 +35,11 @@ PaddlePaddle requires a Docker environment for compilation, which avoids separately…
# 2. Optional: build the Docker image for compiling PaddlePaddle from source # 2. Optional: build the Docker image for compiling PaddlePaddle from source
docker build -t paddle:dev . docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries # 3. Run the following command to build CPU-only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or use the image built in the optional step above (step 2 must be run first) # 4. Or use the image built in the optional step above (step 2 must be run first)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
Note: the above commands map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container. If you use the image Note: the above commands map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
you built yourself (step 4 above), it will run the default entry point :code:`build.sh` described by the :code:`Dockerfile`, so you can omit the final
script command of step 3.
After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine. After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine.
@ -72,15 +70,15 @@ PaddlePaddle requires a Docker environment for compilation, which avoids separately…
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you want to run only one of the unit tests (e.g. :code:`test_sum_op`) If you want to run only one of the unit tests (e.g. :code:`test_sum_op`)
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh ./paddle/scripts/paddle_build.sh build
cd /paddle/build cd build
ctest -R test_sum_op -V ctest -R test_sum_op -V
.. _faq_docker: .. _faq_docker:

@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source # 2. Optional: build development docker image from source
docker build -t paddle:dev . docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries # 3. Run the following command to build CPU-only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above commands mount the current working directory (the root directory of the source code) NOTE: The above commands mount the current working directory (the root directory of the source code)
into the :code:`/paddle` directory inside the Docker container. If you are using your own image into the :code:`/paddle` directory inside the Docker container.
(step 4), it will run the default entry point :code:`build.sh`, so you can omit the last
command in step 3.
When the compilation finishes, you can find the output whl package under When the compilation finishes, you can find the output whl package under
build/python/dist, then you can choose to install the whl on local build/python/dist, then you can choose to install the whl on local
@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` to also run tests on GPU.
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`: If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh ./paddle/scripts/paddle_build.sh build
cd /paddle/build cd build
ctest -R test_sum_op -V ctest -R test_sum_op -V
.. _faq_docker: .. _faq_docker:

Some files were not shown because too many files have changed in this diff
