Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_send_op

release/0.13.0
Yancey1989 7 years ago
commit ceefbf3259

@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
option(WITH_CONTRIB "Compile the third-party contributions" OFF)
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto) add_subdirectory(proto)
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine, # "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer. # because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer) add_subdirectory(paddle/optimizer)
@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED) find_python_module(recommonmark REQUIRED)
add_subdirectory(doc) add_subdirectory(doc)
endif() endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()

@ -101,6 +101,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22 EXPOSE 22
# the development image does the build work by default
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]

@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp rm -rf /opt/android-ndk-tmp
CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]

@ -1,196 +0,0 @@
# Cluster Training Benchmark
## Setup
- Platform
- Kubernetes: v1.6.2
- Linux Kernel: v3.10.0
- Resource
- CPU: 10 Cores per Pod
- Memory: 5GB per Pod
- Docker Image
We use different base Docker images to run the benchmark on Kubernetes:
- PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- TensorFlow: tensorflow/tensorflow:1.5.0-rc0
- Model
vgg16 is used in this benchmark.
## Cases
- Variable
- Batch Size of training data.
- PServer count of the training job.
- The number of trainers.
- Invariant
- The resources of each trainer/pserver Pod.
### Measure the Performance for Different Batch Size
- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
- PServer Count: 20
- Batch Size: 64
- Metrics:
$S = \frac{T_1}{T_N}$
where the speedup $S$ is the ratio of $T_1$ to $T_N$, the training times with 1 and $N$ trainers.
The parallel efficiency is:
$E = \frac{S}{N}$
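For example (illustrative numbers only, not measured results): if a single trainer takes $T_1 = 1000$ minutes per pass and 10 trainers take $T_{10} = 125$ minutes, then $S = 1000 / 125 = 8$ and $E = 8 / 10 = 0.8$, i.e. 80% parallel efficiency.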
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the Benchmark
TODO

@ -1,35 +0,0 @@
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
# you can get mirror list here:
# https://launchpad.net/ubuntu/+archivemirrors
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
RUN pip install -U kubernetes opencv-python
RUN pip install paddlepaddle
# if the network is slow, you may need to add a proxy here.
# ENV https_proxy=
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle
# unset the proxy if it was set.
# ENV https_proxy=""
# NOTE: By default, CI-built wheel packages are built with WITH_DISTRIBUTE=OFF,
# so we must build one with distributed support to install in this image.
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
# TensorFlow and Kubernetes tooling
RUN pip install tensorflow==1.4.0
ADD tf_k8s /usr/bin
RUN chmod +x /usr/bin/tf_k8s
ADD vgg16_tf.py /workspace/
# the lines below may change frequently during debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD vgg16_fluid.py vgg16_v2.py /workspace/

@ -1,195 +0,0 @@
# Performance for Distributed vgg16
## Test Result
### Hardware Information
- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz : 2101.000
- cache size : 20480 KB
### BLAS Settings
The environment variable `MKL_NUM_THREADS=1` is set.
### Single Node Single Thread
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Acceleration Rate
- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec (parallel efficiency in parentheses; see the note after the table)
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
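The percentages in parentheses are not defined in the text; they appear to be the parallel efficiency relative to the single-thread throughput at the same batch size (taken from the "Single Node Single Thread" table above). A minimal sketch of that interpretation, assuming this is how the table was produced:

```python
# Assumed formula: efficiency = distributed throughput / (trainer count * single-thread throughput).
# 16.74 samples/sec is the Fluid single-thread throughput at batch size 128 (see the first table).
single_thread_fluid = 16.74

for trainers, throughput in [(20, 263.29), (40, 518.80), (80, 836.26), (100, 1019.29)]:
    efficiency = throughput / (trainers * single_thread_fluid)
    print("%3d trainers: %.2f%%" % (trainers, efficiency * 100))

# Prints 78.64%, 77.48%, 62.44%, 60.89% -- matching the parenthesized values
# in the Fluid row up to rounding.
```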
### Different PServer Count
- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid (to be fixed in a follow-up PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (needs more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fluid and v2 comes from network interference.*
## Steps to Run the Performance Test
1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE=ON` to build it with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can pull it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.
Check the logs for the distributed training progress and analyze the performance.
## Enable Verbose Logs
Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
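For example, under each container's `env:` list in those files one would add entries of the form `- name: GLOG_v` with `value: "3"` and `- name: GLOG_logtostderr` with `value: "1"`, alongside the environment variables already defined there.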

@ -1,72 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: MKL_NUM_THREADS
value: "1"
- name: TRAINING_ROLE
value: "PSERVER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
command: ["paddle_k8s", "start_fluid"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,69 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
- name: PADDLE_JOB_NAME
value: vgg16job
- name: TRAINING_ROLE
value: "TRAINER"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,21 +0,0 @@
#!/bin/bash
# Update to point to the source file.
VGG_SRC="vgg16_fluid.py"
export TRAINING_ROLE=PSERVER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
# Need to wait for the ps to start first.
sleep 10
echo "done start ps"
export TRAINING_ROLE=TRAINER
export TRAINERS=2
export POD_IP=127.0.0.1
export PADDLE_INIT_PORT=6174
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &

@ -1,82 +0,0 @@
#!/bin/bash
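# Map the trainer's exit code to a human-readable message in /dev/termination-log and exit with that code.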
check_trainer_ret() {
ret=$1
stdbuf -oL echo "job returned $ret...setting pod return message..."
stdbuf -oL echo "==============================="
if [ $ret -eq 136 ] ; then
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
elif [ $ret -eq 139 ] ; then
echo "Segmentation Fault" > /dev/termination-log
elif [ $ret -eq 1 ] ; then
echo "General Error" > /dev/termination-log
elif [ $ret -eq 134 ] ; then
echo "Program Abort" > /dev/termination-log
fi
stdbuf -oL echo "termination log wroted..."
exit $ret
}
g_pservers=""
g_trainers=""
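# Wait until the expected number of pserver and trainer pods are Running, then record their endpoints.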
wait_running_pods(){
pserver_label="tf-job-pserver=${JOB_NAME}"
trainer_label="tf-job-trainer=${JOB_NAME}"
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
}
start_tf_pserver(){
wait_running_pods
label="tf-job-pserver=${JOB_NAME}"
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
}
start_tf_trainer(){
wait_running_pods
label="tf-job-trainer=${JOB_NAME}"
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
check_trainer_ret $?
}
start_tf(){
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
start_tf_trainer
else
start_tf_pserver
fi
}
usage() {
echo "usage: tf_k8s [<args>]:"
echo " start_tf Start tensorflow jobs"
}
case "$1" in
start_tf)
start_tf
;;
--help)
usage
;;
*)
usage
;;
esac

@ -1,56 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16job-tf-pserver
spec:
replicas: 10
template:
metadata:
labels:
tf-job-pserver: vgg16job-tf
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: ENTRY
value: "python vgg16_tf.py"
- name: JOB_NAME
value: vgg16job-tf
- name: PSERVERS_NUM
value: "10"
- name: TF_JOB_NAME
value: "ps"
- name: TRAINERS_NUM
value: "20"
- name: BATCH_SIZE
value: "128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,58 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16job-tf-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
tf-job-trainer: vgg16job-tf
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
imagePullPolicy: Always
command: ["tf_k8s", "start_tf"]
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PORT
value: "32036"
- name: JOB_NAME
value: vgg16job-tf
- name: TF_JOB_NAME
value: "worker"
- name: ENTRY
value: "python vgg16_tf.py"
- name: PSERVERS_NUM
value: "10"
- name: BATCH_SIZE
value: "128"
- name: TRAINERS_NUM
value: "20"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: NUM_PASSES
value: "1"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: "status.podIP"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

@ -1,64 +0,0 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
name: vgg16v2job-pserver
spec:
replicas: 10
template:
metadata:
labels:
paddle-job-pserver: vgg16v2job
spec:
hostNetwork: true
imagePullSecrets:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
containerPort: 30236
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "python train.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
command: ["paddle_k8s", "start_pserver"]
resources:
requests:
memory: 10Gi
cpu: 4
limits:
memory: 10Gi
cpu: 4

@ -1,65 +0,0 @@
apiVersion: batch/v1
kind: Job
metadata:
name: vgg16v2job-trainer
spec:
parallelism: 20
completions: 20
template:
metadata:
labels:
paddle-job: vgg16v2job
spec:
imagePullSecrets:
- name: job-registry-secret
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "256"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: "metadata.namespace"
resources:
requests:
memory: 40Gi
cpu: 2
limits:
memory: 40Gi
cpu: 2
restartPolicy: Never

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,154 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import gzip
import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os
DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the input feature has 3 channels
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 512
fc1 = paddle.layer.fc(input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(input=fc2,
size=class_dim,
act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
def main():
global ts
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
# NOTE: v2 distributed training needs averaged updates, hence the learning rate is divided by the node count.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
cifar.train10(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
cifar.test10(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers,
is_local=False)
# End batch and end pass event handler
def event_handler(event):
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()

@ -94,6 +94,10 @@ def parse_args():
'--memory_optimize', '--memory_optimize',
action='store_true', action='store_true',
help='If set, optimize runtime memory before start.') help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set, omit the actual data reading operators.')
parser.add_argument( parser.add_argument(
'--update_method', '--update_method',
type=str, type=str,
@ -198,6 +202,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog) exe.run(train_prog)
return return
if args.use_fake_data:
raise Exception(
"fake data is not supported in single GPU test for now.")
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_prog) exe.run(startup_prog)
@ -244,7 +252,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var, batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id): num_trainers, trainer_id):
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
# generate fake data: replace each feed variable with a persistable constant-filled tensor so no real data is read
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
if nccl_id_var and trainer_id == 0:
# FIXME(wuyi): wait for other trainers to start listening
time.sleep(30)
startup_exe = fluid.Executor(place) startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog) startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
@ -256,10 +288,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy, exec_strategy=strategy,
num_trainers=num_trainers, num_trainers=num_trainers,
trainer_id=trainer_id) trainer_id=trainer_id)
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place) feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
num_samples = 0 num_samples = 0
@ -271,7 +300,10 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
num_samples = 0 num_samples = 0
if iters == args.iterations: if iters == args.iterations:
break break
loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.use_fake_data:
loss, = exe.run([avg_loss.name])
else:
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver": if args.update_method == "pserver":
exe.bcast_params() exe.bcast_params()
num_samples += len(data) num_samples += len(data)

@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)}) envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry}) envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify # NOTE: these directories below are cluster specific, please modify
# these settings before you run on your own cluster. # these settings before you run on your own cluster.
envs.append({ envs.append({

@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP" "fieldPath": "status.podIP"
} }
} }
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
} }
] ]

@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS) add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS) endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER) if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER) endif(NOT WITH_PROFILER)

@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS} ${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}

@ -1003,9 +1003,9 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss .. autofunction:: paddle.fluid.layers.dice_loss
:noindex: :noindex:
bilinear_interp upsampling_bilinear2d
____ ____
.. autofunction:: paddle.fluid.layers.bilinear_interp .. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex: :noindex:

@ -35,13 +35,11 @@ PaddlePaddle requires a Docker environment for compilation, which avoids separately…
# 2. Optional: build the Docker image for compiling PaddlePaddle from source # 2. Optional: build the Docker image for compiling PaddlePaddle from source
docker build -t paddle:dev . docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries # 3. Run the following command to build CPU-only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or use the image built in the optional step above (step 2 must be run first) # 4. Or use the image built in the optional step above (step 2 must be run first)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
Note: the above commands map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container. If you use the image Note: the above commands map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
you built yourself (step 4 above), it will run the default entry point :code:`build.sh` described by the :code:`Dockerfile`, so you can omit the final
script command of step 3.
After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine. After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine.
@ -72,15 +70,15 @@ PaddlePaddle requires a Docker environment for compilation, which avoids separately…
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you want to run only one of the unit tests (e.g. :code:`test_sum_op`) If you want to run only one of the unit tests (e.g. :code:`test_sum_op`)
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh ./paddle/scripts/paddle_build.sh build
cd /paddle/build cd build
ctest -R test_sum_op -V ctest -R test_sum_op -V
.. _faq_docker: .. _faq_docker:

@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source # 2. Optional: build development docker image from source
docker build -t paddle:dev . docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries # 3. Run the following command to build CPU-only binaries
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above commands mount the current working directory (the root directory of the source code) NOTE: The above commands mount the current working directory (the root directory of the source code)
into the :code:`/paddle` directory inside the Docker container. If you are using your own image into the :code:`/paddle` directory inside the Docker container.
(step 4), it will run the default entry point :code:`build.sh`, so you can omit the last
command in step 3.
When the compilation finishes, you can find the output whl package under When the compilation finishes, you can find the output whl package under
build/python/dist, then you can choose to install the whl on local build/python/dist, then you can choose to install the whl on local
@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` to also run tests on GPU.
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`: If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash .. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
bash /paddle/paddle/scripts/docker/build.sh ./paddle/scripts/paddle_build.sh build
cd /paddle/build cd build
ctest -R test_sum_op -V ctest -R test_sum_op -V
.. _faq_docker: .. _faq_docker:

Some files were not shown because too many files have changed in this diff
