Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into paddle_fix

wangkuiyi-patch-1
guochaorong 7 years ago
commit 04b8d3d03c

@ -19,7 +19,7 @@ BasedOnStyle: Google
IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -2 # The private/protected/public has no indent in class
AccessModifierOffset: -1 # The private/protected/public has no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false

@ -9,7 +9,7 @@ import subprocess
import platform
COPYRIGHT = '''
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@ -34,6 +34,14 @@ repos:
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
- repo: local
hooks:
- id: pylint-doc-string
name: pylint
description: Check python docstring style using docstring_checker.
entry: bash ./tools/codestyle/pylint_pre_commit.hook
language: system
files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:

@ -18,6 +18,8 @@ env:
addons:
ssh_known_hosts: 13.229.163.131
before_install:
# For pylint dockstring checker
- sudo pip install pylint pytest astroid isort
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:

@ -4,6 +4,7 @@
| backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi |
| ChengduoZH | Cheng-Duo Zhao|
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |

@ -25,7 +25,6 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
endif(NOT CMAKE_CROSSCOMPILING)
@ -42,7 +41,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
@ -59,8 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@ -101,6 +101,9 @@ endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
"A path setting fluid shared and static libraries")
if (WITH_C_API AND WITH_PYTHON)
message(WARNING "It is suggest not embedded a python interpreter in Paddle "
"when using C-API. It will give an unpredictable behavior when using a "
@ -118,13 +121,14 @@ else()
endif()
set(WITH_MKLML ${WITH_MKL})
if (WITH_MKL AND AVX2_FOUND)
set(WITH_MKLDNN ON)
else()
message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
set(WITH_MKLDNN OFF)
if (NOT DEFINED WITH_MKLDNN)
if (WITH_MKL AND AVX2_FOUND)
set(WITH_MKLDNN ON)
else()
message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
set(WITH_MKLDNN OFF)
endif()
endif()
########################################################################################
include(external/mklml) # download mklml package
@ -153,7 +157,6 @@ include(cupti)
include(configure) # add paddle env configuration
include(generic) # simplify cmake module
include(package) # set paddle packages
include(cpplint) # set paddle c++ style
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(rdma) # set rdma libraries
@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
if(NOT MOBILE_INFERENCE)
if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
@ -226,5 +229,11 @@ if(WITH_PYTHON)
endif()
if(WITH_DOC)
find_package(Sphinx REQUIRED)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()

@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
create mode 100644 233
```
NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
1. Build and test
Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).

@ -24,12 +24,12 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y --allow-downgrades \
git python-pip python-dev openssh-server bison \
git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \
automake locales clang-format swig cmake \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool ccache && \
@ -70,14 +70,16 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
RUN pip install --upgrade pip==9.0.3 && \
RUN easy_install -U pip && \
pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
#For docstring checker
RUN pip install pylint pytest astroid isort
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
@ -101,6 +103,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
# development image default do build work
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]

@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]

@ -62,9 +62,9 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
## Installation
It is recommended to check out the
[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
before looking into the
[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
## Documentation

@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
fluid/models/*.pyc
fluid/logs
fluid/nohup.out

@ -1,196 +0,0 @@
# Cluster Training Benchmark
## Setup
- Platform
- Kubernetes: v1.6.2
- Linux Kernel: v3.10.0
- Resource
- CPU: 10 Cores per Pod
- Memory: 5GB per Pod
- Docker Image
We use different base Docker Image to run the benchmark on Kubernetes:
- PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- TensorFlow: tensorflow/tensorflow:1.5.0-rc0
- Model
vgg16 is used in this benchmark.
## Cases
- Variable
- Batch Size of training data.
- PServer count of the training job.
- The number of trainers.
- Invariant
- The resource of trainer/pserver Pod.
### Measure the Performance for Different Batch Size
- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
- PServer Count: 20
- Batch Size: 64
- Metrics:
$S = \div(T1, TN)$
which S is the ratio of T1 over TN, training time of 1 and N trainers.
The parallel efficiency is:
$E = \div(S, N)$
<table>
<thead>
<tr>
<th>Trainer Counter </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the benchmark
TODO

@ -1,35 +0,0 @@
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# you can get mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# if network is slowly, you may need to add proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# unset proxy if it is setted.
# ENV https_proxy=""
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# tf k8s
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/

@ -1,195 +0,0 @@
# Performance for Distributed vgg16
## Test Result
### Hardware Infomation
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### Blas settings
Setting environment variable: `MKL_NUM_THREADS=1`.
### Single Node Single Thread
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Accelerate Rate
- Pserver Count: 20
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (need more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Different Pserver Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples/ sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid(should fix in next PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (need more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fuild and v2 comes from the network interference.*
## Steps to Run the Performance Test
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbos Logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.

@ -1,72 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,69 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,21 +0,0 @@
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &

@ -1,82 +0,0 @@
#!/bin/bash
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac

@ -1,56 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,58 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,64 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,65 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,154 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the channel of input feature is 3
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: for v2 distributed training need averaging updates.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save