remove benchmark folder, since there is a benchmark repo already, distributed benchmark will be maintained in fleet repo (#18537)

test=develop
6 years ago · 6f6ecbec4e
parent 1f1cc2221f
commit 6f6ecbec4e
59 changed files with 0 additions and 10654 deletions
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@ -1,12 +0,0 @@
 paddle/image/logs
 paddle/image/*.pyc
 paddle/image/train.list
 paddle/rnn/logs
 paddle/rnn/*.pyc
 paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
 fluid/models/*.pyc
 fluid/logs
 fluid/nohup.out
--- a/benchmark/caffe/image/alexnet.prototxt
+++ b/benchmark/caffe/image/alexnet.prototxt
--- a/benchmark/caffe/image/googlenet.prototxt
+++ b/benchmark/caffe/image/googlenet.prototxt
--- a/benchmark/caffe/image/run.sh
+++ b/benchmark/caffe/image/run.sh
@ -1,30 +0,0 @@
 set -e
 # Time one Caffe network on a single GPU at a given batch size.
 #   $1 (cfg)    network prototxt; it is edited in place by sed
 #   $2 (batch)  batch size to write into the prototxt
 #   $3 (prefix) name used for the log file created under logs/
 function test() {
   cfg=$1
   batch=$2
   prefix=$3
   # On the line that follows `input: "data"` / `input: "label"`, rewrite the
   # input_dim entry so it carries the requested batch size.
   sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" "$cfg"
   sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" "$cfg"
   # Quote the paths so names containing spaces are not word-split.
   # stdout and stderr of `caffe time` both go to the per-run log file.
   caffe time --model="$cfg" --iterations=50 --gpu 0 > "logs/${prefix}-1gpu-batch${batch}.log" 2>&1
 }
 # Make sure the log directory exists before any benchmark writes to it.
 [ -d "logs" ] || mkdir logs
 # alexnet
 for bs in 64 128 256 512; do
   test alexnet.prototxt "$bs" alexnet
 done
 # googlenet
 for bs in 64 128; do
   test googlenet.prototxt "$bs" googlenet
 done
 # small net
 for bs in 64 128 256 512; do
   test smallnet_mnist_cifar.prototxt "$bs" smallnet
 done
--- a/benchmark/caffe/image/run_multi.sh
+++ b/benchmark/caffe/image/run_multi.sh
@ -1,24 +0,0 @@
 #!/bin/bash
 set -e
 # Train one Caffe network on GPUs 0-3, splitting the global batch across them.
 #   $1 (cfg)    network prototxt; it is edited in place by sed
 #   $2 (batch)  global batch size; each GPU gets batch / 4
 #   $3 (prefix) name used for the log file created under logs/
 function test() {
   cfg=$1
   batch=$2
   prefix=$3
   # Use shell arithmetic rather than `expr`: expr exits with status 1 when the
   # result is 0, which would abort the whole script under `set -e`.
   batch_per_gpu=$((batch / 4))
   # On the line that follows `input: "data"` / `input: "label"`, rewrite the
   # input_dim entry so it carries the per-GPU batch size.
   sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" "$cfg"
   sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" "$cfg"
   # Replace the first line of the solver so it points at the network under test.
   sed -i "1c\net : \"${cfg}\"" solver.prototxt
   # stdout and stderr of `caffe train` both go to the per-run log file.
   caffe train --solver=solver.prototxt -gpu 0,1,2,3 > "logs/${prefix}-4gpu-batch${batch}.log" 2>&1
 }
 # Make sure the log directory exists before any benchmark writes to it.
 [ -d "logs" ] || mkdir logs
 # alexnet
 for bs in 512 1024; do
   test alexnet.prototxt "$bs" alexnet
 done
 # googlenet
 test googlenet.prototxt 512 googlenet
--- a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
+++ b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
@ -1,198 +0,0 @@
 name: "mnist/cifar"
 input: "data"
 input_dim: 128 
 input_dim: 3
 input_dim: 32 
 input_dim: 32 
 input: "label"
 input_dim: 128 
 input_dim: 1
 input_dim: 1
 input_dim: 1 
 layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.0001
    }
    bias_filler {
      type: "constant"
    }
  }
 }
 layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "relu1"
  type: "ReLU"
  bottom: "pool1"
  top: "pool1"
 }
 layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 32
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
 }
 layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
 }
 layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2"
  top: "pool2"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  convolution_param {
    num_output: 64
    pad: 2
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
    }
  }
 }
 layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
 }
 layer {
  name: "pool3"
  type: "Pooling"
  bottom: "conv3"
  top: "pool3"
  pooling_param {
    pool: AVE
    kernel_size: 3
    stride: 2
  }
 }
 layer {
  name: "ip1"
  type: "InnerProduct"
  bottom: "pool3"
  top: "ip1"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  inner_product_param {
    num_output: 64
    weight_filler {
      type: "gaussian"
      std: 0.1
    }
    bias_filler {
      type: "constant"
    }
  }
 }
 layer {
  name: "ip2"
  type: "InnerProduct"
  bottom: "ip1"
  top: "ip2"
  param {
    lr_mult: 1
  }
  param {
    lr_mult: 2
  }
  inner_product_param {
    num_output: 10
    weight_filler {
      type: "gaussian"
      std: 0.1
    }
    bias_filler {
      type: "constant"
    }
  }
 }
 layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "ip2"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
 }
 layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip2"
  bottom: "label"
  top: "loss"
 }
--- a/benchmark/caffe/image/solver.prototxt
+++ b/benchmark/caffe/image/solver.prototxt
@ -1,10 +0,0 @@
 net: "alexnet.prototxt"
 base_lr: 0.01
 lr_policy: "fixed"
 display: 20
 max_iter: 200
 momentum: 0.9
 weight_decay: 0.0005
 snapshot: 10000
 snapshot_prefix: "models/caffe_alexnet_train"
 solver_mode: GPU
--- a/benchmark/figs/alexnet-4gpu.png
+++ b/benchmark/figs/alexnet-4gpu.png
--- a/benchmark/figs/alexnet-cpu-infer.png
+++ b/benchmark/figs/alexnet-cpu-infer.png
--- a/benchmark/figs/alexnet-cpu-train.png
+++ b/benchmark/figs/alexnet-cpu-train.png
--- a/benchmark/figs/googlenet-4gpu.png
+++ b/benchmark/figs/googlenet-4gpu.png
--- a/benchmark/figs/googlenet-cpu-infer.png
+++ b/benchmark/figs/googlenet-cpu-infer.png
--- a/benchmark/figs/googlenet-cpu-train.png
+++ b/benchmark/figs/googlenet-cpu-train.png
--- a/benchmark/figs/resnet-cpu-infer.png
+++ b/benchmark/figs/resnet-cpu-infer.png
--- a/benchmark/figs/resnet-cpu-train.png
+++ b/benchmark/figs/resnet-cpu-train.png
--- a/benchmark/figs/rnn_lstm_4gpus.png
+++ b/benchmark/figs/rnn_lstm_4gpus.png
--- a/benchmark/figs/rnn_lstm_cls.png
+++ b/benchmark/figs/rnn_lstm_cls.png
--- a/benchmark/figs/vgg-cpu-infer.png
+++ b/benchmark/figs/vgg-cpu-infer.png
--- a/benchmark/figs/vgg-cpu-train.png
+++ b/benchmark/figs/vgg-cpu-train.png
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@ -1,30 +0,0 @@
 # Image that bundles the Fluid benchmark scripts together with a locally
 # supplied paddle wheel and the paddle_k8s helper scripts.
 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 # Setting UBUNTU_MIRROR can speed up apt-get.
 # ARG UBUNTU_MIRROR
 # RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
 # Expose cuDNN and NCCL under unversioned names in /usr/lib.
 RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
 # IMPORTANT:
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 RUN pip install -U pip
 # paddlepaddle is installed and then removed again; presumably this keeps its
 # dependencies in the image while the wheel added below provides paddle itself.
 RUN pip install -U kubernetes paddlepaddle
 RUN pip uninstall -y paddlepaddle && mkdir /workspace
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
 RUN chmod +x /usr/bin/paddle_k8s
 # Install whichever paddle wheel(s) were placed next to this Dockerfile.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
 # Each benchmark source is listed once (recordio_converter.py used to appear twice).
 ADD fluid_benchmark.py recordio_converter.py args.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
 ADD models/ /workspace/models/
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@ -1,99 +0,0 @@
 # Fluid Benchmark
 This directory contains several model configurations and tools that are used to run
 Fluid benchmarks for local and distributed training.
 ## Run the Benchmark
 To start, run the following command to get the full help message:
 ```bash
 python fluid_benchmark.py --help
 ```
 Currently supported values for the `--model` argument include:
 * mnist
 * resnet
    * you can choose a different dataset using `--data_set cifar10` or
      `--data_set flowers`.
 * vgg
 * stacked_dynamic_lstm
 * machine_translation
 * Run the following command to start a benchmark job locally:
    ```bash
      python fluid_benchmark.py --model mnist --device GPU
    ```
    You can choose to use GPU/CPU training. With GPU training, you can specify
    `--gpus <gpu_num>` to run multi GPU training.
    You can also run the parameter server in async mode: specify
    `--async_mode` to train the model asynchronously.
 * Run distributed training with parameter servers:
    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
    * start parameter servers:
        ```bash
        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
        sleep 15
        ```
    * start trainers:
        ```bash
        PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
        ```
 * Run distributed training using NCCL2
    ```bash
    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
    ```
 ## Prepare the RecordIO file to Achieve Better Performance
 Running the following command will generate RecordIO files like "mnist.recordio" under the path
 and with the batch_size you choose. You can use batch_size=1 so that a later reader can change the
 batch_size at any time using `fluid.batch`.
 ```bash
 python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
 ```
 ## Run Distributed Benchmark on Kubernetes Cluster
 You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
 have to start all those processes manually on each node, which is not recommended.
 To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
 download it from
 http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
 build it on your own. Once you've got the "whl" package, put it under the current directory and run:
 ```bash
 docker build -t [your docker image name]:[your docker image tag] .
 ```
 Then push the image to a Docker registry that your Kubernetes cluster can reach.
 We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
 distributed benchmark jobs to your cluster. To generate a job yaml, just run:
 ```bash
 python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
 ```
 The yaml files are then generated under the directory `myjob`, and you can run:
 ```bash
 kubectl create -f myjob/
 ```
 The job shall start.
 ## Notes for Run Fluid Distributed with NCCL2 and RDMA
 Before running NCCL2 distributed jobs, please check whether your node has multiple network
 interfaces. If it does, set the environment variable `export NCCL_SOCKET_IFNAME=eth0`, replacing
 `eth0` with your actual network device.
 To run high-performance distributed training, you must prepare your hardware environment to be
 able to run RDMA-enabled network communication. Please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
 note for details.
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@ -1,151 +0,0 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 __all__ = ['parse_args', ]
 # Model names accepted by the --model option.
 BENCHMARK_MODELS = [
     "machine_translation", "resnet", "se_resnext", "vgg", "mnist",
     "stacked_dynamic_lstm", "resnet_with_preprocess"
 ]
 def parse_args(argv=None):
     """Parse the command-line options of the Fluid benchmark.

     Args:
         argv: Optional list of argument strings to parse. When left as
             None, argparse falls back to the process command line, which
             is what the original zero-argument call did.

     Returns:
         The argparse.Namespace holding one attribute per option below.
     """
     # The text is passed as ``description``; passed positionally it would
     # become ``prog`` and be printed as the program name in the usage line.
     parser = argparse.ArgumentParser(description='Fluid model benchmarks.')
     parser.add_argument(
         '--model',
         type=str,
         choices=BENCHMARK_MODELS,
         default='resnet',
         help='The model to run benchmark with.')
     parser.add_argument(
         '--batch_size', type=int, default=32, help='The minibatch size.')
     #  args related to learning rate
     parser.add_argument(
         '--learning_rate', type=float, default=0.001, help='The learning rate.')
     parser.add_argument(
         '--skip_batch_num',
         type=int,
         default=5,
         help='The first num of minibatch num to skip, for better performance test'
     )
     parser.add_argument(
         '--iterations', type=int, default=80, help='The number of minibatches.')
     parser.add_argument(
         '--pass_num', type=int, default=100, help='The number of passes.')
     parser.add_argument(
         '--data_format',
         type=str,
         default='NCHW',
         choices=['NCHW', 'NHWC'],
         help='The data format, now only supports NCHW.')
     parser.add_argument(
         '--device',
         type=str,
         default='GPU',
         choices=['CPU', 'GPU'],
         help='The device type.')
     parser.add_argument(
         '--gpus',
         type=int,
         default=1,
         help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
     # this option is available only for vgg and resnet.
     # NOTE(review): unclear whether the comment above refers to --cpus or to
     # --data_set below -- confirm against the benchmark driver.
     parser.add_argument(
         '--cpus',
         type=int,
         default=1,
         help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
     parser.add_argument(
         '--data_set',
         type=str,
         default='flowers',
         choices=['cifar10', 'flowers', 'imagenet'],
         help='Optional dataset for benchmark.')
     parser.add_argument(
         '--infer_only', action='store_true', help='If set, run forward only.')
     parser.add_argument(
         '--use_cprof', action='store_true', help='If set, use cProfile.')
     parser.add_argument(
         '--use_nvprof',
         action='store_true',
         help='If set, use nvprof for CUDA.')
     parser.add_argument(
         '--no_test',
         action='store_true',
         help='If set, do not test the testset during training.')
     parser.add_argument(
         '--memory_optimize',
         action='store_true',
         help='If set, optimize runtime memory before start.')
     parser.add_argument(
         '--use_fake_data',
         action='store_true',
         help='If set, omit the actual read data operators.')
     parser.add_argument(
         '--profile', action='store_true', help='If set, profile a few steps.')
     parser.add_argument(
         '--update_method',
         type=str,
         default='local',
         choices=['local', 'pserver', 'nccl2'],
         help='Choose parameter update method, can be local, pserver, nccl2.')
     # store_true flags already default to False, so no explicit default is
     # given for the two options below.
     parser.add_argument(
         '--no_split_var',
         action='store_true',
         help='Whether split variables into blocks when update_method is pserver')
     parser.add_argument(
         '--async_mode',
         action='store_true',
         help='Whether start pserver in async mode to support ASGD')
     parser.add_argument(
         '--use_reader_op',
         action='store_true',
         help='Whether to use reader op, and must specify the data path if set this to true.'
     )
     parser.add_argument(
         '--data_path',
         type=str,
         default="",
         help='Directory that contains all the training recordio files.')
     parser.add_argument(
         '--test_data_path',
         type=str,
         default="",
         help='Directory that contains all the test data (NOT recordio).')
     parser.add_argument(
         '--use_inference_transpiler',
         action='store_true',
         help='If set, use inference transpiler to optimize the program.')
     parser.add_argument(
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
     parser.add_argument(
         '--reduce_strategy',
         type=str,
         choices=['reduce', 'all_reduce'],
         default='all_reduce',
         help='Specify the reduce strategy, can be reduce, all_reduce')
     parser.add_argument(
         '--fuse_broadcast_op',
         action='store_true',
         help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
     )
     return parser.parse_args(argv)
--- a/benchmark/fluid/check_env.sh
+++ b/benchmark/fluid/check_env.sh
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
--- a/benchmark/fluid/imagenet_reader.py
+++ b/benchmark/fluid/imagenet_reader.py
--- a/Show More
+++ b/Show More