Remove the benchmark folder: a separate benchmark repo already exists, and the distributed benchmark will be maintained in the fleet repo (#18537)
test=develop
@ -1,12 +0,0 @@
paddle/image/logs
paddle/image/*.pyc
paddle/image/train.list
paddle/rnn/logs
paddle/rnn/*.pyc
paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
fluid/models/*.pyc
fluid/logs
fluid/nohup.out
@ -1,30 +0,0 @@
set -e

# Rewrite the batch size in a Caffe prototxt, then time a 50-iteration
# run on one GPU, logging to logs/<prefix>-1gpu-batch<batch>.log.
function test() {
  cfg=$1
  batch=$2
  prefix=$3
  # Replace the input_dim line (the batch dimension) that follows each
  # input declaration with the requested batch size.
  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
  caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
}

if [ ! -d "logs" ]; then
  mkdir logs
fi

# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet

# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet

# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
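The two `sed` one-liners are the core of this harness: each matches an `input:` declaration and rewrites the `input_dim` line immediately after it (the batch dimension). A minimal sketch of the effect, using the `smallnet_mnist_cifar.prototxt` header shown later in this diff:

```bash
# Rewrite the batch dimension of the "data" input to 256, in place.
sed -i '/input: "data"/{n;s/^input_dim.*/input_dim: 256/g}' smallnet_mnist_cifar.prototxt
grep -A1 'input: "data"' smallnet_mnist_cifar.prototxt
# input: "data"
# input_dim: 256
```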
@ -1,24 +0,0 @@
#!/bin/bash
set -e

# Same idea as the single-GPU script, but split the global batch across
# 4 GPUs and run caffe train with the shared solver.prototxt.
function test() {
  cfg=$1
  batch=$2
  prefix=$3
  batch_per_gpu=`expr ${batch} / 4`
  sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
  sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
  # Point line 1 of solver.prototxt at the network under test.
  sed -i "1c\net : \"${cfg}\"" solver.prototxt
  caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
}

if [ ! -d "logs" ]; then
  mkdir logs
fi

# alexnet
test alexnet.prototxt 512 alexnet
test alexnet.prototxt 1024 alexnet

# googlenet
test googlenet.prototxt 512 googlenet
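The `1c\` command above replaces the first line of `solver.prototxt` in place, so the solver always points at the network being benchmarked. A small sketch of the effect:

```bash
sed -i '1c\net : "googlenet.prototxt"' solver.prototxt
head -1 solver.prototxt
# net : "googlenet.prototxt"
```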
@ -1,198 +0,0 @@
name: "mnist/cifar"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 32
input_dim: 32
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.0001
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "pool1"
  top: "pool1"
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2"
  top: "pool2"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3"
  top: "pool3"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "ip1"
  type: "InnerProduct"
  bottom: "pool3"
  top: "ip1"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  inner_product_param {
    num_output: 64
    weight_filler {
      type: "gaussian"
      std: 0.1
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "ip2"
  type: "InnerProduct"
  bottom: "ip1"
  top: "ip2"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  inner_product_param {
    num_output: 10
    weight_filler {
      type: "gaussian"
      std: 0.1
    }
    bias_filler {
      type: "constant"
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "ip2"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip2"
  bottom: "label"
  top: "loss"
}
@ -1,10 +0,0 @@
net: "alexnet.prototxt"
base_lr: 0.01
lr_policy: "fixed"
display: 20
max_iter: 200
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/caffe_alexnet_train"
solver_mode: GPU
[12 image files deleted from the benchmark folder; only their sizes (14–115 KiB) were recorded in the diff]
@ -1,30 +0,0 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04

# Using UBUNTU_MIRROR can speed up apt-get.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'

RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so

# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...

RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle

# Installing paddlepaddle pulls in its dependencies; uninstalling removes only
# the package itself, so the locally built whl added below takes its place.
RUN pip uninstall -y paddlepaddle && mkdir /workspace

ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s

ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl

ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py args.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
ADD models/ /workspace/models/
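A hypothetical build-and-run sequence for this image (the image name is a placeholder, and `--gpus all` assumes Docker 19.03+; older setups used `nvidia-docker run` instead):

```bash
# Place a PaddlePaddle whl in the build context first, then:
docker build -t fluid-benchmark:latest .
docker run --gpus all -it fluid-benchmark:latest \
    python /workspace/fluid_benchmark.py --model mnist --device GPU
```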
@ -1,99 +0,0 @@
# Fluid Benchmark

This directory contains several model configurations and tools used to run
Fluid benchmarks for local and distributed training.

## Run the Benchmark

To start, run the following command to get the full help message:

```bash
python fluid_benchmark.py --help
```

Currently supported values for the `--model` argument include:

* mnist
* resnet
  * you can choose a different dataset using `--data_set cifar10` or
    `--data_set flowers`.
* vgg
* stacked_dynamic_lstm
* machine_translation

* Run the following command to start a benchmark job locally:
  ```bash
  python fluid_benchmark.py --model mnist --device GPU
  ```
  You can choose GPU or CPU training. With GPU training, you can specify
  `--gpus <gpu_num>` to run multi-GPU training. The parameter server can
  also run in async mode: specify `--async_mode` to train the model
  asynchronously.
* Run distributed training with parameter servers (a combined launcher
  sketch follows this list):
  * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
  * start parameter servers:
    ```bash
    PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
    sleep 15
    ```
  * start trainers:
    ```bash
    PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
    ```
* Run distributed training using NCCL2:
  ```bash
  PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
  ```
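For a quick single-node smoke test, the pserver and trainer commands above can be combined into one launcher script. This is a minimal sketch using only the values from the examples; on a real cluster each role runs on its own node:

```bash
#!/bin/bash
# Shared settings, taken verbatim from the single-node example above.
export PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 \
       PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0

# Start the parameter server in the background and give it time to listen,
# then start the trainer in the foreground.
PADDLE_TRAINING_ROLE=PSERVER python fluid_benchmark.py \
    --model mnist --device GPU --update_method pserver &
sleep 15
PADDLE_TRAINING_ROLE=TRAINER python fluid_benchmark.py \
    --model mnist --device GPU --update_method pserver
```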

## Prepare the RecordIO File to Achieve Better Performance

Running the following command generates RecordIO files such as "mnist.recordio"
under the given path with the batch size you choose. You can use batch_size=1
so that a later reader can change the batch size at any time using `fluid.batch`.

```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```

## Run Distributed Benchmark on Kubernetes Cluster

You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you will
have to start all those processes manually on each node, which is not recommended.

To build the Docker image, you need to choose a PaddlePaddle "whl" package to run with. You may either
download it from
http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
build it yourself. Once you've got the "whl" package, put it under the current directory and run:

```bash
docker build -t [your docker image name]:[your docker image tag] .
```

Then push the image to a Docker registry that your Kubernetes cluster can reach.

We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:

```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```

The yaml files are then generated under the directory `myjob`, and you can run:

```bash
kubectl create -f myjob/
```

The job should then start.
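Once submitted, progress can be watched with standard kubectl commands (the pod name pattern below is an assumption based on `--jobname myjob`; check `kubectl get pods` for the actual names):

```bash
kubectl get pods | grep myjob              # watch pserver/trainer pods come up
kubectl logs -f <myjob-trainer-pod-name>   # follow a trainer's output
```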

## Notes for Running Fluid Distributed with NCCL2 and RDMA

Before running NCCL2 distributed jobs, please check whether your node has multiple network
interfaces; if so, set the environment variable `export NCCL_SOCKET_IFNAME=eth0` to select your actual
network device.

To run high-performance distributed training, you must prepare your hardware environment to
support RDMA-enabled network communication; please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
note for details.
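A hedged example of the environment setup described above (`NCCL_SOCKET_IFNAME` is from this note; `NCCL_DEBUG=INFO` is a standard NCCL variable that logs which interface and transport NCCL selects, useful for verifying RDMA is actually in use):

```bash
export NCCL_SOCKET_IFNAME=eth0   # select the real network device on multi-NIC nodes
export NCCL_DEBUG=INFO           # optional: log NCCL's interface/transport choices
python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```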
@ -1,151 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

__all__ = ['parse_args', ]

BENCHMARK_MODELS = [
    "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
    "stacked_dynamic_lstm", "resnet_with_preprocess"
]


def parse_args():
    parser = argparse.ArgumentParser('Fluid model benchmarks.')
    parser.add_argument(
        '--model',
        type=str,
        choices=BENCHMARK_MODELS,
        default='resnet',
        help='The model to run benchmark with.')
    parser.add_argument(
        '--batch_size', type=int, default=32, help='The minibatch size.')
    # args related to learning rate
    parser.add_argument(
        '--learning_rate', type=float, default=0.001, help='The learning rate.')
    # TODO(wuyi): add "--use_fake_data" option back.
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The number of initial minibatches to skip, for a more accurate performance test.'
    )
    parser.add_argument(
        '--iterations', type=int, default=80, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=100, help='The number of passes.')
    parser.add_argument(
        '--data_format',
        type=str,
        default='NCHW',
        choices=['NCHW', 'NHWC'],
        help='The data format; currently only NCHW is supported.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--gpus',
        type=int,
        default=1,
        help='If gpus > 1, use ParallelExecutor to run; otherwise use Executor.')
    # this option is available only for vgg and resnet.
    parser.add_argument(
        '--cpus',
        type=int,
        default=1,
        help='If cpus > 1, set ParallelExecutor to use multiple threads.')
    parser.add_argument(
        '--data_set',
        type=str,
        default='flowers',
        choices=['cifar10', 'flowers', 'imagenet'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    parser.add_argument(
        '--no_test',
        action='store_true',
        help='If set, do not evaluate on the test set during training.')
    parser.add_argument(
        '--memory_optimize',
        action='store_true',
        help='If set, optimize runtime memory before starting.')
    parser.add_argument(
        '--use_fake_data',
        action='store_true',
        help='If set, omit the actual data-reading operators.')
    parser.add_argument(
        '--profile', action='store_true', help='If set, profile a few steps.')
    parser.add_argument(
        '--update_method',
        type=str,
        default='local',
        choices=['local', 'pserver', 'nccl2'],
        help='Choose the parameter update method: local, pserver, or nccl2.')
    parser.add_argument(
        '--no_split_var',
        action='store_true',
        default=False,
        help='Whether to split variables into blocks when update_method is pserver.')
    parser.add_argument(
        '--async_mode',
        action='store_true',
        default=False,
        help='Whether to start the pserver in async mode to support ASGD.')
    parser.add_argument(
        '--use_reader_op',
        action='store_true',
        help='Whether to use the reader op; the data path must be specified if set.'
    )
    parser.add_argument(
        '--data_path',
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
    parser.add_argument(
        '--test_data_path',
        type=str,
        default="",
        help='Directory that contains all the test data (NOT recordio).')
    parser.add_argument(
        '--use_inference_transpiler',
        action='store_true',
        help='If set, use the inference transpiler to optimize the program.')
    parser.add_argument(
        '--no_random',
        action='store_true',
        help='If set, keep the random seed fixed and do not shuffle the data.')
    parser.add_argument(
        '--reduce_strategy',
        type=str,
        choices=['reduce', 'all_reduce'],
        default='all_reduce',
        help='Specify the reduce strategy: reduce or all_reduce.')
    parser.add_argument(
        '--fuse_broadcast_op',
        action='store_true',
        help='If set, fuse multiple broadcast operators into one fused_broadcast operator.'
    )
    args = parser.parse_args()
    return args
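For reference, a hypothetical invocation exercising several of the flags defined above:

```bash
python fluid_benchmark.py \
    --model resnet --data_set cifar10 \
    --batch_size 64 --pass_num 10 \
    --device GPU --gpus 2 \
    --update_method local --no_test
```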