parent
da3b14bc67
commit
70142ae65e
@ -0,0 +1,58 @@
|
|||||||
|
# Performance for distributed vgg16
|
||||||
|
|
||||||
|
## Test Result
|
||||||
|
|
||||||
|
### Single node single thread
|
||||||
|
|
||||||
|
| Batch Size | 32 | 64 | 128 | 256 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | - | 16.74 | - |
|
||||||
|
| PaddlePaddle v2 | - | - | 17.60 | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
### Different batch sizes
|
||||||
|
|
||||||
|
- PServer Count: 10
|
||||||
|
- Trainer Count: 20
|
||||||
|
- Metrics: samples / sec
|
||||||
|
|
||||||
|
| Batch Size | 32 | 64 | 128 | 256 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | 247.40 | - | - |
|
||||||
|
| PaddlePaddle v2 | - | - | 256.14 | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
### Different pserver counts
|
||||||
|
|
||||||
|
- Trainer Count: 100
|
||||||
|
- Batch Size: 64
|
||||||
|
- Metrics: mini-batch / sec
|
||||||
|
|
||||||
|
| PServer Count | 10 | 20 | 40 | 60 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | - | - | - |
|
||||||
|
| PaddlePaddle v2 | - | - | - | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
### Acceleration rate
|
||||||
|
|
||||||
|
| Trainer Count | 20 | 40 | 80 | 100 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | - | - | - |
|
||||||
|
| PaddlePaddle v2 | - | - | - | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
|
||||||
|
## Steps to run the performance test
|
||||||
|
|
||||||
|
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
||||||
|
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
||||||
|
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
|
||||||
|
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||||
|
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||||
|
|
||||||
|
Check the logs for the distributed training progress and analyze the performance.
|
||||||
|
|
||||||
|
## Enable verbose logs
|
||||||
|
|
||||||
|
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happens in detail.
|
@ -1,15 +0,0 @@
|
|||||||
# Fluid distributed training perf test
|
|
||||||
|
|
||||||
## Steps to get started
|
|
||||||
|
|
||||||
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
|
||||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
|
||||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
|
|
||||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
|
||||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
|
||||||
|
|
||||||
Check the logs for the distributed training progress and analyze the performance.
|
|
||||||
|
|
||||||
## Enable verbose logs
|
|
||||||
|
|
||||||
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happens in detail.
|
|
@ -1,7 +0,0 @@
|
|||||||
FROM paddlepaddle/paddlecloud-job
|
|
||||||
RUN mkdir -p /workspace
|
|
||||||
ADD reader.py /workspace/
|
|
||||||
RUN python /workspace/reader.py
|
|
||||||
ADD vgg16.py /workspace/
|
|
||||||
|
|
||||||
ADD vgg16_fluid.py /workspace
|
|
@ -1,70 +0,0 @@
|
|||||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
|
|
||||||
#
|
|
||||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
#you may not use this file except in compliance with the License.
|
|
||||||
#You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
#Unless required by applicable law or agreed to in writing, software
|
|
||||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
#See the License for the specific language governing permissions and
|
|
||||||
#limitations under the License.
|
|
||||||
|
|
||||||
import random
|
|
||||||
from paddle.v2.image import load_and_transform
|
|
||||||
import paddle.v2 as paddle
|
|
||||||
from multiprocessing import cpu_count
|
|
||||||
|
|
||||||
|
|
||||||
def train_mapper(sample):
    """Map one (image path, label) pair to a model-input sample (training set).

    Loads the image from disk, applies paddle's simple_transform with
    resize size 256, crop size 224 in training mode, and flattens the
    result to a float32 vector.
    """
    path, label = sample
    image = paddle.image.load_image(path)
    image = paddle.image.simple_transform(image, 256, 224, True)
    return image.flatten().astype('float32'), label
|
|
||||||
|
|
||||||
|
|
||||||
def test_mapper(sample):
    """Map one (image path, label) pair to a model-input sample (test set).

    Loads the image from disk, applies paddle's simple_transform with
    resize size 256, crop size 224, and flattens the result to a
    float32 vector.
    """
    img, label = sample
    img = paddle.image.load_image(img)
    # FIX: the original passed True (training mode, which applies random
    # crop/flip augmentation). Test-set preprocessing should be
    # deterministic, so is_train must be False here.
    img = paddle.image.simple_transform(img, 256, 224, False)
    return img.flatten().astype('float32'), label
|
|
||||||
|
|
||||||
|
|
||||||
def train_reader(train_list, buffered_size=1024):
    """Create a parallel reader over the training list file.

    Args:
        train_list: path to a text file with one "<image path>\t<label>"
            entry per line.
        buffered_size: buffer size passed to paddle.reader.xmap_readers.

    Returns:
        A reader that yields (flattened float32 image, int label) samples,
        with image decoding parallelized across cpu_count() workers via
        train_mapper.
    """

    def reader():
        # Stream the file instead of materializing every line in memory;
        # the original also called .strip() twice per line.
        with open(train_list, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                img_path, lab = line.split('\t')
                yield img_path, int(lab)

    return paddle.reader.xmap_readers(train_mapper, reader,
                                      cpu_count(), buffered_size)
|
|
||||||
|
|
||||||
|
|
||||||
def test_reader(test_list, buffered_size=1024):
    """Create a parallel reader over the test list file.

    Args:
        test_list: path to a text file with one "<image path>\t<label>"
            entry per line.
        buffered_size: buffer size passed to paddle.reader.xmap_readers.

    Returns:
        A reader that yields (flattened float32 image, int label) samples,
        with image decoding parallelized across cpu_count() workers via
        test_mapper.
    """

    def reader():
        # Stream the file instead of materializing every line in memory;
        # the original also called .strip() twice per line.
        with open(test_list, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                img_path, lab = line.split('\t')
                yield img_path, int(lab)

    return paddle.reader.xmap_readers(test_mapper, reader,
                                      cpu_count(), buffered_size)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Pre-download the CIFAR-10 training set. The accompanying Dockerfile
    # runs this script during `docker build`, so the dataset download is
    # cached inside the image rather than repeated at pod startup.
    # (Removed commented-out debug loops over train.list/test.list.)
    paddle.dataset.cifar.train10()
|
|
Loading…
Reference in new issue