parent
da3b14bc67
commit
70142ae65e
@ -0,0 +1,58 @@
|
||||
# Performance for distributed vgg16
|
||||
|
||||
## Test Result
|
||||
|
||||
### Single node single thread
|
||||
|
||||
| Batch Size | 32 | 64 | 128 | 256 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | - | 16.74 | - |
|
||||
| PaddlePaddle v2 | - | - | 17.60 | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### different batch size
|
||||
|
||||
- PServer Count: 10
|
||||
- Trainer Count: 20
|
||||
- Metrics: samples / sec
|
||||
|
||||
| Batch Size | 32 | 64 | 128 | 256 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | 247.40 | - | - |
|
||||
| PaddlePaddle v2 | - | - | 256.14 | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### different pserver number
|
||||
|
||||
- Trainer Count: 100
|
||||
- Batch Size: 64
|
||||
- Metrics: mini-batch / sec
|
||||
|
||||
| PServer Count | 10 | 20 | 40 | 60 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | - | - | - |
|
||||
| PaddlePaddle v2 | - | - | - | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### Accelerate rate
|
||||
|
||||
| Trainer Counter | 20 | 40 | 80 | 100 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | - | - | - |
|
||||
| PaddlePaddle v2 | - | - | - | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
|
||||
## Steps to run the performance test
|
||||
|
||||
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so kubernetes can find it.
|
||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||
|
||||
Check the logs for the distributed training progress and analyze the performance.
|
||||
|
||||
## Enable verbose logs
|
||||
|
||||
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
|
@ -1,15 +0,0 @@
|
||||
# Fluid distributed training perf test
|
||||
|
||||
## Steps to get started
|
||||
|
||||
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so kubernetes can find it.
|
||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||
|
||||
Check the logs for the distributed training progress and analyze the performance.
|
||||
|
||||
## Enable verbose logs
|
||||
|
||||
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
|
@ -1,7 +0,0 @@
|
||||
FROM paddlepaddle/paddlecloud-job
# Workspace for the benchmark scripts.
RUN mkdir -p /workspace
# Copy the data reader first and run it once so the dataset download is
# baked into an image layer (cached across rebuilds of later layers).
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/
# Trailing slash added for consistency with the other ADD lines: it makes
# the "copy INTO directory" intent explicit instead of relying on
# /workspace already existing as a directory.
ADD vgg16_fluid.py /workspace/
@ -1,70 +0,0 @@
|
||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import random
|
||||
from paddle.v2.image import load_and_transform
|
||||
import paddle.v2 as paddle
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
|
||||
def train_mapper(sample):
    """Convert one (image_path, label) pair into model input for training.

    Loads the image from disk, resizes to 256 and crops to 224 via
    ``paddle.image.simple_transform`` (the ``True`` flag presumably enables
    training-time random augmentation — confirm against the paddle docs),
    and returns a flattened float32 pixel array together with the label.
    """
    image_path, label = sample
    image = paddle.image.load_image(image_path)
    transformed = paddle.image.simple_transform(image, 256, 224, True)
    return transformed.flatten().astype('float32'), label
|
||||
|
||||
|
||||
def test_mapper(sample):
    """Convert one (image_path, label) pair into model input for the test set.

    Loads the image from disk, resizes to 256 and center-crops to 224 via
    ``paddle.image.simple_transform``, and returns a flattened float32 pixel
    array together with the label.
    """
    img, label = sample
    img = paddle.image.load_image(img)
    # Fix: pass is_train=False for the test set. The original passed True,
    # which applies random crop/flip augmentation — evaluation must use the
    # deterministic center-crop path instead.
    img = paddle.image.simple_transform(img, 256, 224, False)
    return img.flatten().astype('float32'), label
|
||||
|
||||
|
||||
def train_reader(train_list, buffered_size=1024):
    """Build a buffered, multi-process training-set reader.

    ``train_list`` is a text file with one ``<image_path>\\t<label>`` entry
    per line. The inner generator yields ``(image_path, int(label))`` pairs;
    ``xmap_readers`` then maps ``train_mapper`` over them in parallel using
    one worker per CPU, buffering up to ``buffered_size`` samples.
    """
    def reader():
        with open(train_list, 'r') as list_file:
            for entry in list_file:
                image_path, label = entry.strip().split('\t')
                yield image_path, int(label)

    return paddle.reader.xmap_readers(train_mapper, reader,
                                      cpu_count(), buffered_size)
|
||||
|
||||
|
||||
def test_reader(test_list, buffered_size=1024):
    """Build a buffered, multi-process test-set reader.

    ``test_list`` is a text file with one ``<image_path>\\t<label>`` entry
    per line. The inner generator yields ``(image_path, int(label))`` pairs;
    ``xmap_readers`` then maps ``test_mapper`` over them in parallel using
    one worker per CPU, buffering up to ``buffered_size`` samples.
    """
    def reader():
        with open(test_list, 'r') as list_file:
            for entry in list_file:
                image_path, label = entry.strip().split('\t')
                yield image_path, int(label)

    return paddle.reader.xmap_readers(test_mapper, reader,
                                      cpu_count(), buffered_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Touch the CIFAR-10 training dataset so it is fetched at image build
    # time (the Dockerfile runs this script once to warm the layer cache).
    # NOTE(review): presumably calling train10() triggers the download —
    # confirm against paddle.dataset.cifar.
    paddle.dataset.cifar.train10()
|
Loading…
Reference in new issue