update results

8 years ago · f9db562987
parent bd64719a2f
commit f9db562987
8 changed files with 11 additions and 322 deletions
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@ -1,15 +1,13 @@
 #FROM paddlepaddle/paddlecloud-job
 #RUN mkdir -p /workspace
 #ADD reader.py /workspace/
 #RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD k8s_tools.py /root
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev 
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev && \
 chmod +x /usr/bin/paddle_k8s
 # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
 #       so we must build one with distribute support to install in this image.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD reader.py /workspace/
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 RUN python /workspace/reader.py
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@ -43,7 +43,7 @@
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
+| PaddlePaddle v2 (need more tests) | 356.28 | 785.39 | 853.30 | 1041.99 |
 | TensorFlow | - | - | - | - |
 ### different pserver number
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@ -30,7 +30,7 @@ spec:
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
--- a/benchmark/cluster/vgg16/k8s_tools.py
+++ b/benchmark/cluster/vgg16/k8s_tools.py
@ -1,94 +0,0 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # 
 #     http://www.apache.org/licenses/LICENSE-2.0
 # 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #!/bin/env python
 import os
 import sys
 import time
 import socket
 from kubernetes import client, config
 PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
 NAMESPACE = os.getenv("NAMESPACE")
 PORT = os.getenv("PSERVER_PORT")
 if os.getenv("KUBERNETES_SERVICE_HOST", None):
    config.load_incluster_config()
 else:
    config.load_kube_config()
 v1 = client.CoreV1Api()
 def fetch_pods_info(label_selector):
    api_response = v1.list_namespaced_pod(
        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
    pod_list = []
    for item in api_response.items:
        pod_list.append((item.status.phase, item.status.pod_ip))
    return pod_list
 def wait_pods_running(label_selector, desired):
    print "label selector: %s, desired: %s" % (label_selector, desired)
    while True:
        count = count_pods_by_phase(label_selector, 'Running')
        # NOTE: pods may be scaled.
        if count >= int(desired):
            break
        print 'current cnt: %d sleep for 5 seconds...' % count
        time.sleep(5)
 def count_pods_by_phase(label_selector, phase):
    pod_list = fetch_pods_info(label_selector)
    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
    return len(filtered_pod_list)
 def fetch_pserver_ips():
    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    pserver_ips = [item[1] for item in pod_list]
    return ",".join(pserver_ips)
 def fetch_master_ip():
    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    master_ips = [item[1] for item in pod_list]
    return master_ips[0]
 def fetch_trainer_id():
    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
    pod_list = fetch_pods_info(label_selector)
    trainer_ips = [item[1] for item in pod_list]
    trainer_ips.sort()
    local_ip = socket.gethostbyname(socket.gethostname())
    for i in xrange(len(trainer_ips)):
        if trainer_ips[i] == local_ip:
            return i
    return None
 if __name__ == "__main__":
    command = sys.argv[1]
    if command == "fetch_pserver_ips":
        print fetch_pserver_ips()
    elif command == "fetch_trainer_id":
        print fetch_trainer_id()
    elif command == "fetch_master_ip":
        print fetch_master_ip()
    elif command == "count_pods_by_phase":
        print count_pods_by_phase(sys.argv[2], sys.argv[3])
    elif command == "wait_pods_running":
        wait_pods_running(sys.argv[2], sys.argv[3])
--- a/benchmark/cluster/vgg16/paddle_k8s
+++ b/benchmark/cluster/vgg16/paddle_k8s
@ -1,199 +0,0 @@
 #!/bin/bash
 start_pserver() {
    stdbuf -oL paddle pserver \
      --use_gpu=0 \
      --port=$PADDLE_INIT_PORT \
      --ports_num=$PADDLE_INIT_PORTS_NUM \
      --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
      --nics=$PADDLE_INIT_NICS \
      --comment=paddle_process_k8s \
      --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
 }
 start_new_pserver() {
  stdbuf -oL python /root/k8s_tools.py wait_pods_running  paddle-job-master=${PADDLE_JOB_NAME} 1
  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
  stdbuf -oL /usr/bin/pserver \
    -port=$PADDLE_INIT_PORT \
    -num-pservers=$PSERVERS \
    -log-level=debug \
    -etcd-endpoint=http://$MASTER_IP:2379
 }
 start_master() {
  stdbuf -oL /usr/bin/master \
  -port=8080 \
  -chunk-per-task=1\
  -task-timout-dur=16s\
  -endpoints=http://127.0.0.1:2379
 }
 check_failed_cnt() {
  max_failed=$1
  failed_count=$(python /root/k8s_tools.py count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed) 
  if [ $failed_count -gt $max_failed ]; then
    stdbuf -oL echo "Failed trainer count beyond the threadhold: "$max_failed
    echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log 
    exit 0
  fi
 }
 check_trainer_ret() {
  ret=$1
  stdbuf -oL echo "job returned $ret...setting pod return message..."
  stdbuf -oL echo "==============================="
  if [ $ret -eq 136 ] ; then
    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
  elif [ $ret -eq 139 ] ; then
    echo "Segmentation Fault" > /dev/termination-log
  elif [ $ret -eq 1 ] ; then
    echo "General Error" > /dev/termination-log
  elif [ $ret -eq 134 ] ; then
    echo "Program Abort" > /dev/termination-log
  fi
  stdbuf -oL echo "termination log wroted..."
  exit $ret
 }
 start_fluid_process() {
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
    check_failed_cnt ${TRAINERS}
    sleep 5
    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
  fi
  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
  stdbuf -oL sh -c "${ENTRY}"
  check_trainer_ret $?
 }
 start_new_trainer() {
  # FIXME(Yancey1989): use command-line interface to configure the max failed count
  check_failed_cnt ${TRAINERS}
  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
  sleep 5
  stdbuf -oL python /root/k8s_tools.py wait_pods_running  paddle-job-master=${PADDLE_JOB_NAME} 1
  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
  export ETCD_IP="$MASTER_IP"
  # NOTE: $TRAINER_PACKAGE may be large, do not copy
  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
  cd $TRAINER_PACKAGE
  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
  $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1 
  stdbuf -oL sh -c "${ENTRY}"
  check_trainer_ret $?
 }
 start_trainer() {
    # paddle v1 and V2 distributed training does not allow any trainer failed. 
    check_failed_cnt 0
    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}
    export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
    stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
    # FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
    stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count
    # NOTE: $TRAINER_PACKAGE may be large, do not copy
    export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
    cd $TRAINER_PACKAGE
    stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
    "version: " $1
    # FIXME: If we use the new PServer by Golang, add Kubernetes healthz
    # to wait PServer process get ready.Now only sleep 20 seconds.
    sleep 20
    case "$1" in
      "v1")
        FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
        if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
          echo "file count less than trainers"
          check_trainer_ret 0
        fi
        let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
        echo "spliting file to" $lines_per_node
        cp $TRAIN_LIST /
        cd /
        split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
        CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
        # always use /train.list for paddle v1 for each node.
        echo "File for current node ${CURRENT_LIST}"
        sleep 10
        cp $CURRENT_LIST train.list
        cd $TRAINER_PACKAGE
        stdbuf -oL  paddle train \
          --port=$PADDLE_INIT_PORT \
          --nics=$PADDLE_INIT_NICS \
          --ports_num=$PADDLE_INIT_PORTS_NUM \
          --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
          --num_passes=$PADDLE_INIT_NUM_PASSES \
          --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
          --saving_period=1 \
          --log_period=20 \
          --local=0 \
          --rdma_tcp=tcp \
          --config=$TOPOLOGY \
          --use_gpu=$PADDLE_INIT_USE_GPU \
          --trainer_id=$PADDLE_INIT_TRAINER_ID \
          --save_dir=$OUTPUT \
          --pservers=$PADDLE_INIT_PSERVERS \
          --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
        # paddle v1 API does not allow any trainer failed.
        check_trainer_ret $? 
        ;;
      "v2")
        stdbuf -oL sh -c "${ENTRY}"
        # paddle v2 API does not allow any trainer failed.
        check_trainer_ret $? 
        ;;
      *)
        ;;
    esac
 }
 usage() {
    echo "usage: paddle_k8s [<args>]:"
    echo "  start_trainer  [v1|v2]    Start a trainer process with v1 or v2 API"
    echo "  start_pserver             Start a pserver process"
    echo "  start_new_pserver         Start a new pserver process"
    echo "  start_new_trainer         Start a new triner process"
 }
 case "$1" in
    start_pserver)
        start_pserver
        ;;
    start_trainer)
        start_trainer $2
        ;;
    start_new_trainer)
        start_new_trainer
        ;;
    start_new_pserver)
        start_new_pserver
        ;;
    start_master)
        start_master
        ;;
    start_fluid)
        start_fluid_process
        ;;
    --help)
        usage
        ;;
    *)
        usage
        ;;
 esac
--- a/benchmark/cluster/vgg16/reader.py
+++ b/benchmark/cluster/vgg16/reader.py
@ -1,16 +0,0 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 # 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # 
 #     http://www.apache.org/licenses/LICENSE-2.0
 # 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle.v2 as paddle
 paddle.dataset.cifar.train10()
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@ -38,7 +38,7 @@ spec:
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "2"
+          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@ -51,7 +51,7 @@ def vgg(input, nums, class_dim):
    conv4 = conv_block(conv3, 512, nums[3])
    conv5 = conv_block(conv4, 512, nums[4])
-    fc_dim = 4096
+    fc_dim = 512
    fc1 = paddle.layer.fc(input=conv5,
                          size=fc_dim,
                          act=paddle.activation.Relu(),