parent
da3b14bc67
commit
70142ae65e
@ -0,0 +1,58 @@
|
||||
# Performance for distributed vgg16
|
||||
|
||||
## Test Result
|
||||
|
||||
### Single node single thread
|
||||
|
||||
| Batch Size | 32 | 64 | 128 | 256 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | - | 16.74 | - |
|
||||
| PaddlePaddle v2 | - | - | 17.60 | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### different batch size
|
||||
|
||||
- PServer Count: 10
|
||||
- Trainer Count: 20
|
||||
- Metrics: samples / sec
|
||||
|
||||
| Batch Size | 32 | 64 | 128 | 256 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | 247.40 | - | - |
|
||||
| PaddlePaddle v2 | - | - | 256.14 | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### different pserver number
|
||||
|
||||
- Trainer Count: 100
|
||||
- Batch Size: 64
|
||||
- Metrics: mini-batch / sec
|
||||
|
||||
| PServer Count | 10 | 20 | 40 | 60 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | - | - | - |
|
||||
| PaddlePaddle v2 | - | - | - | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
### Accelerate rate
|
||||
|
||||
| Trainer Counter | 20 | 40 | 80 | 100 |
|
||||
| -- | -- | -- | -- | -- |
|
||||
| PaddlePaddle Fluid | - | - | - | - |
|
||||
| PaddlePaddle v2 | - | - | - | - |
|
||||
| TensorFlow | - | - | - | - |
|
||||
|
||||
|
||||
## Steps to run the performance test
|
||||
|
||||
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so kubernetes can find it.
|
||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||
|
||||
Check the logs for the distributed training progress and analyze the performance.
|
||||
|
||||
## Enable verbose logs
|
||||
|
||||
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
|
@ -1,15 +0,0 @@
|
||||
# Fluid distributed training perf test
|
||||
|
||||
## Steps to get started
|
||||
|
||||
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so kubernetes can find it.
|
||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||
|
||||
Check the logs for the distributed training progress and analyze the performance.
|
||||
|
||||
## Enable verbose logs
|
||||
|
||||
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happened in detail.
|
@ -1,7 +0,0 @@
|
||||
FROM paddlepaddle/paddlecloud-job
# Workspace for the benchmark scripts.
RUN mkdir -p /workspace
# Copy the data reader first and run it once so the dataset download is
# baked into an image layer (cached across rebuilds of later layers).
ADD reader.py /workspace/
RUN python /workspace/reader.py
ADD vgg16.py /workspace/
# Trailing slash added for consistency with the other ADD lines: it makes
# the "copy INTO directory" intent explicit instead of relying on
# /workspace already existing as a directory.
ADD vgg16_fluid.py /workspace/
@ -1,70 +0,0 @@
|
||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
||||
#you may not use this file except in compliance with the License.
|
||||
#You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#Unless required by applicable law or agreed to in writing, software
|
||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
#See the License for the specific language governing permissions and
|
||||
#limitations under the License.
|
||||
|
||||
import random
|
||||
from paddle.v2.image import load_and_transform
|
||||
import paddle.v2 as paddle
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
|
||||
def train_mapper(sample):
    """Convert one (image_path, label) pair into model input for training.

    Loads the image from disk, resizes to 256 and crops to 224 via
    ``paddle.image.simple_transform`` (the ``True`` flag presumably enables
    training-time random augmentation — confirm against the paddle docs),
    and returns a flattened float32 pixel array together with the label.
    """
    image_path, label = sample
    image = paddle.image.load_image(image_path)
    transformed = paddle.image.simple_transform(image, 256, 224, True)
    return transformed.flatten().astype('float32'), label
|
||||
|
||||
|
||||
def test_mapper(sample):
    """Convert one (image_path, label) pair into model input for the test set.

    Loads the image from disk, resizes to 256 and center-crops to 224 via
    ``paddle.image.simple_transform``, and returns a flattened float32 pixel
    array together with the label.
    """
    img, label = sample
    img = paddle.image.load_image(img)
    # Fix: pass is_train=False for the test set. The original passed True,
    # which applies random crop/flip augmentation — evaluation must use the
    # deterministic center-crop path instead.
    img = paddle.image.simple_transform(img, 256, 224, False)
    return img.flatten().astype('float32'), label
|
||||
|
||||
|
||||
def train_reader(train_list, buffered_size=1024):
    """Build a buffered, multi-process training-set reader.

    ``train_list`` is a text file with one ``<image_path>\\t<label>`` entry
    per line. The inner generator yields ``(image_path, int(label))`` pairs;
    ``xmap_readers`` then maps ``train_mapper`` over them in parallel using
    one worker per CPU, buffering up to ``buffered_size`` samples.
    """
    def reader():
        with open(train_list, 'r') as list_file:
            for entry in list_file:
                image_path, label = entry.strip().split('\t')
                yield image_path, int(label)

    return paddle.reader.xmap_readers(train_mapper, reader,
                                      cpu_count(), buffered_size)
|
||||
|
||||
|
||||
def test_reader(test_list, buffered_size=1024):
    """Build a buffered, multi-process test-set reader.

    ``test_list`` is a text file with one ``<image_path>\\t<label>`` entry
    per line. The inner generator yields ``(image_path, int(label))`` pairs;
    ``xmap_readers`` then maps ``test_mapper`` over them in parallel using
    one worker per CPU, buffering up to ``buffered_size`` samples.
    """
    def reader():
        with open(test_list, 'r') as list_file:
            for entry in list_file:
                image_path, label = entry.strip().split('\t')
                yield image_path, int(label)

    return paddle.reader.xmap_readers(test_mapper, reader,
                                      cpu_count(), buffered_size)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Touch the CIFAR-10 training dataset so it is fetched at image build
    # time (the Dockerfile runs this script once to warm the layer cache).
    # NOTE(review): presumably calling train10() triggers the download —
    # confirm against paddle.dataset.cifar.
    paddle.dataset.cifar.train10()
|
Loading…
Reference in new issue