parent
da3b14bc67
commit
70142ae65e
@ -0,0 +1,58 @@
|
|||||||
|
# Performance for distributed vgg16
|
||||||
|
|
||||||
|
## Test Result
|
||||||
|
|
||||||
|
### Single node single thread
|
||||||
|
|
||||||
|
| Batch Size | 32 | 64 | 128 | 256 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | - | 16.74 | - |
|
||||||
|
| PaddlePaddle v2 | - | - | 17.60 | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
### Different batch sizes
|
||||||
|
|
||||||
|
- PServer Count: 10
|
||||||
|
- Trainer Count: 20
|
||||||
|
- Metrics: samples / sec
|
||||||
|
|
||||||
|
| Batch Size | 32 | 64 | 128 | 256 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | 247.40 | - | - |
|
||||||
|
| PaddlePaddle v2 | - | - | 256.14 | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
### Different pserver counts
|
||||||
|
|
||||||
|
- Trainer Count: 100
|
||||||
|
- Batch Size: 64
|
||||||
|
- Metrics: mini-batch / sec
|
||||||
|
|
||||||
|
| PServer Count | 10 | 20 | 40 | 60 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | - | - | - |
|
||||||
|
| PaddlePaddle v2 | - | - | - | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
### Acceleration rate
|
||||||
|
|
||||||
|
| Trainer Count | 20 | 40 | 80 | 100 |
|
||||||
|
| -- | -- | -- | -- | -- |
|
||||||
|
| PaddlePaddle Fluid | - | - | - | - |
|
||||||
|
| PaddlePaddle v2 | - | - | - | - |
|
||||||
|
| TensorFlow | - | - | - | - |
|
||||||
|
|
||||||
|
|
||||||
|
## Steps to run the performance test
|
||||||
|
|
||||||
|
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
||||||
|
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
||||||
|
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
|
||||||
|
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
||||||
|
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
||||||
|
|
||||||
|
Check the logs for the distributed training progress and analyze the performance.
|
||||||
|
|
||||||
|
## Enable verbose logs
|
||||||
|
|
||||||
|
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happens in detail.
|
@ -1,15 +0,0 @@
|
|||||||
# Fluid distributed training perf test
|
|
||||||
|
|
||||||
## Steps to get started
|
|
||||||
|
|
||||||
1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
|
|
||||||
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
|
|
||||||
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
|
|
||||||
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
|
|
||||||
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
|
|
||||||
|
|
||||||
Check the logs for the distributed training progress and analyze the performance.
|
|
||||||
|
|
||||||
## Enable verbose logs
|
|
||||||
|
|
||||||
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happens in detail.
|
|
@ -1,7 +0,0 @@
|
|||||||
FROM paddlepaddle/paddlecloud-job
|
|
||||||
RUN mkdir -p /workspace
|
|
||||||
ADD reader.py /workspace/
|
|
||||||
RUN python /workspace/reader.py
|
|
||||||
ADD vgg16.py /workspace/
|
|
||||||
|
|
||||||
ADD vgg16_fluid.py /workspace
|
|
@ -1,70 +0,0 @@
|
|||||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
|
|
||||||
#
|
|
||||||
#Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
#you may not use this file except in compliance with the License.
|
|
||||||
#You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
#Unless required by applicable law or agreed to in writing, software
|
|
||||||
#distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
#See the License for the specific language governing permissions and
|
|
||||||
#limitations under the License.
|
|
||||||
|
|
||||||
import random
|
|
||||||
from paddle.v2.image import load_and_transform
|
|
||||||
import paddle.v2 as paddle
|
|
||||||
from multiprocessing import cpu_count
|
|
||||||
|
|
||||||
|
|
||||||
def train_mapper(sample):
    """Map one (image path, label) pair to a model-input sample (training set).

    Loads the image from disk, applies paddle's simple_transform with
    resize size 256, crop size 224 in training mode, and flattens the
    result to a float32 vector.
    """
    path, label = sample
    image = paddle.image.load_image(path)
    image = paddle.image.simple_transform(image, 256, 224, True)
    return image.flatten().astype('float32'), label
|
|
||||||
|
|
||||||
|
|
||||||
def test_mapper(sample):
    """Map one (image path, label) pair to a model-input sample (test set).

    Loads the image from disk, applies paddle's simple_transform with
    resize size 256, crop size 224, and flattens the result to a
    float32 vector.
    """
    img, label = sample
    img = paddle.image.load_image(img)
    # FIX: the original passed True (training mode, which applies random
    # crop/flip augmentation). Test-set preprocessing should be
    # deterministic, so is_train must be False here.
    img = paddle.image.simple_transform(img, 256, 224, False)
    return img.flatten().astype('float32'), label
|
|
||||||
|
|
||||||
|
|
||||||
def train_reader(train_list, buffered_size=1024):
    """Create a parallel reader over the training list file.

    Args:
        train_list: path to a text file with one "<image path>\t<label>"
            entry per line.
        buffered_size: buffer size passed to paddle.reader.xmap_readers.

    Returns:
        A reader that yields (flattened float32 image, int label) samples,
        with image decoding parallelized across cpu_count() workers via
        train_mapper.
    """

    def reader():
        # Stream the file instead of materializing every line in memory;
        # the original also called .strip() twice per line.
        with open(train_list, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                img_path, lab = line.split('\t')
                yield img_path, int(lab)

    return paddle.reader.xmap_readers(train_mapper, reader,
                                      cpu_count(), buffered_size)
|
|
||||||
|
|
||||||
|
|
||||||
def test_reader(test_list, buffered_size=1024):
    """Create a parallel reader over the test list file.

    Args:
        test_list: path to a text file with one "<image path>\t<label>"
            entry per line.
        buffered_size: buffer size passed to paddle.reader.xmap_readers.

    Returns:
        A reader that yields (flattened float32 image, int label) samples,
        with image decoding parallelized across cpu_count() workers via
        test_mapper.
    """

    def reader():
        # Stream the file instead of materializing every line in memory;
        # the original also called .strip() twice per line.
        with open(test_list, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                img_path, lab = line.split('\t')
                yield img_path, int(lab)

    return paddle.reader.xmap_readers(test_mapper, reader,
                                      cpu_count(), buffered_size)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Pre-download the CIFAR-10 training set. The accompanying Dockerfile
    # runs this script during `docker build`, so the dataset download is
    # cached inside the image rather than repeated at pod startup.
    # (Removed commented-out debug loops over train.list/test.list.)
    paddle.dataset.cifar.train10()
|
|
Loading…
Reference in new issue