Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-7195
commit f05330b78b
@ -0,0 +1,78 @@
# Cluster Training Benchmark

## Setup

- Platform
  - Kubernetes: v1.6.2
  - Linux Kernel: v3.10.0

- Resource
  - CPU: 10 Cores per Pod
  - Memory: 5GB per Pod

- Docker Image

  We use different base Docker images to run the benchmark on Kubernetes:

  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0

- Model

  The vgg16 model is used in this benchmark.

## Cases

- Variable
  - Batch size of the training data.
  - PServer count of the training job.
  - The number of trainers.

- Invariant
  - The resources of the trainer/pserver Pods.

### Measure the Performance for Different Batch Sizes

- PServer Count: 40
- Trainer Count: 100
- Metrics: mini-batch / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |

### Measure the Performance for Different PServer Counts

- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec

| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |

### Measure Parallel Efficiency by Increasing the Trainer Count

- PServer Count: 20
- Batch Size: 64
- Metrics:

$S = \frac{T_1}{T_N}$

where $S$ is the speedup: the ratio of $T_1$ to $T_N$, the training times with 1 and $N$ trainers respectively.
The parallel efficiency is then:

$E = \frac{S}{N}$

| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |

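As a quick illustration of how these metrics are derived, the sketch below computes the speedup and parallel efficiency from a table of measured training times. The trainer counts and timings are hypothetical placeholders, not benchmark results.

```python
# Hypothetical timings: trainer count -> wall-clock training time in seconds.
times = {1: 3600.0, 10: 420.0, 20: 230.0}

t1 = times[1]
for n in sorted(times):
    speedup = t1 / times[n]    # S = T_1 / T_N
    efficiency = speedup / n   # E = S / N
    print("trainers=%3d  S=%6.2f  E=%5.2f" % (n, speedup, efficiency))
```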
## Reproduce the Benchmark

TODO
@ -0,0 +1,10 @@
===========
IO
===========

is_parameter
------------

.. autofunction:: paddle.v2.fluid.io.is_parameter
    :noindex:
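A minimal usage sketch (an illustration, not taken from the library documentation): it assumes ``is_parameter`` accepts a single variable and returns ``True`` only for parameter variables, and ``block`` stands for a hypothetical ``Block``-like object that exposes a ``vars`` mapping.

.. code-block:: python

    from paddle.v2.fluid.io import is_parameter

    # Keep only the variables that is_parameter reports as parameters.
    # ``block`` is a hypothetical Block-like object with a ``vars`` mapping.
    params = [var for var in block.vars.values() if is_parameter(var)]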
@ -0,0 +1,9 @@
PaddlePaddle C-API
==================

.. toctree::
    :maxdepth: 1

    compile_paddle_lib_cn.md
    organization_of_the_inputs_cn.md
    workflow_of_capi_cn.md
@ -0,0 +1,8 @@
from paddle.utils.merge_model import merge_v2_model

from mnist_v2 import network

# Build the inference-time topology and merge it with the parameters saved
# after the last training pass into a single model file.
net = network(is_infer=True)
param_file = "models/params_pass_4.tar"
output_file = "output.paddle.model"
merge_v2_model(net, param_file, output_file)
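# Note (an assumption based on the function's name and arguments, not on its
# documentation): merge_v2_model appears to bundle the topology described by
# ``net`` and the parameters in ``param_file`` into the single ``output_file``,
# so a C-API inference program only has to load one artifact.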
@ -0,0 +1,117 @@
import os
import sys
import gzip
import logging
import argparse
from PIL import Image
import numpy as np

import paddle.v2 as paddle
from paddle.utils.dump_v2_config import dump_v2_config

logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)


def multilayer_perceptron(img, layer_size, lbl_dim):
    # Stack fully connected ReLU layers of the given sizes on top of the input
    # image, then add a softmax output layer with one unit per label.
    for idx, size in enumerate(layer_size):
        hidden = paddle.layer.fc(input=(img if not idx else hidden),
                                 size=size,
                                 act=paddle.activation.Relu())
    return paddle.layer.fc(input=hidden,
                           size=lbl_dim,
                           act=paddle.activation.Softmax())


def network(input_dim=784, lbl_dim=10, is_infer=False):
    # Define the MNIST classifier. For inference only the prediction layer is
    # returned; for training, the classification cost over the labels is returned.
    images = paddle.layer.data(
        name='pixel', type=paddle.data_type.dense_vector(input_dim))

    predict = multilayer_perceptron(
        images, layer_size=[128, 64], lbl_dim=lbl_dim)

    if is_infer:
        return predict
    else:
        label = paddle.layer.data(
            name='label', type=paddle.data_type.integer_value(lbl_dim))
        return paddle.layer.classification_cost(input=predict, label=label)


def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"):
    if task == "train":
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)

        paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
        cost = network()
        parameters = paddle.parameters.create(cost)
        optimizer = paddle.optimizer.Momentum(
            learning_rate=0.1 / 128.0,
            momentum=0.9,
            regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))

        trainer = paddle.trainer.SGD(cost=cost,
                                     parameters=parameters,
                                     update_equation=optimizer)

        def event_handler(event):
            # Log the cost every 100 batches and save the parameters after
            # every pass.
            if isinstance(event, paddle.event.EndIteration):
                if event.batch_id % 100 == 0:
                    logger.info("Pass %d, Batch %d, Cost %f, %s" %
                                (event.pass_id, event.batch_id, event.cost,
                                 event.metrics))
            if isinstance(event, paddle.event.EndPass):
                with gzip.open(
                        os.path.join(save_dir, "params_pass_%d.tar" %
                                     event.pass_id), "w") as f:
                    trainer.save_parameter_to_tar(f)

        trainer.train(
            reader=paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.mnist.train(), buf_size=8192),
                batch_size=128),
            event_handler=event_handler,
            num_passes=5)
    elif task == "dump_config":
        predict = network(is_infer=True)
        dump_v2_config(predict, "trainer_config.bin", True)
    else:
        raise RuntimeError(("Invalid value for parameter task. "
                            "Available options are: train and dump_config."))


def parse_cmd():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle MNIST demo for the C-API.")
    parser.add_argument(
        "--task",
        type=str,
        required=False,
        help=("A string indicating the task type. "
              "Available options are: \"train\", \"dump_config\"."),
        default="train")
    parser.add_argument(
        "--use_gpu",
        type=bool,
        help="A bool flag indicating whether to use the GPU or not.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help=("This parameter is only used in the training task. It indicates "
              "how many computing threads are created during training."),
        default=1)
    parser.add_argument(
        "--save_dir",
        type=str,
        help=("This parameter is only used in the training task. It indicates "
              "the path of the directory in which the trained models are saved."),
        default="models")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_cmd()
    main(args.task, args.use_gpu, args.trainer_count, args.save_dir)
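# Example invocations (assuming this file is saved as mnist_v2.py, the module
# name that the companion merge script imports):
#
#   python mnist_v2.py --task train --trainer_count 4 --save_dir models
#   python mnist_v2.py --task dump_config
#
# The first command trains the MLP on MNIST for 5 passes and writes
# params_pass_<n>.tar archives under --save_dir; the second dumps the inference
# topology to trainer_config.bin for use with the C-API.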