Benchmark/Integrate benchmark scripts (#10707)
* wip integrate benchmark scripts
* testing nlp models
* k8s script to start dist benchmark job
* update script
* done support all models
* add README.md
* update by comment
* clean up
* follow comments
parent
530556dd97
commit
55d3951bed
@ -0,0 +1,60 @@
# Fluid Benchmark

This directory contains several model configurations and tools used to run
Fluid benchmarks for local and distributed training.

## Run the Benchmark

To start, run the following command to get the full help message:

```bash
python fluid_benchmark.py --help
```

Currently supported `--model` arguments include:

* mnist
* resnet
  * you can choose a different dataset using `--data_set cifar10` or
    `--data_set flowers`.
* vgg
* stacked_dynamic_lstm
* machine_translation

* Run the following command to start a benchmark job locally:

```bash
python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
```

You can choose GPU or CPU training. With GPU training, specify
`--parallel 1` to run multi-GPU training.
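
For example, a hypothetical CPU run of the ResNet model against the flowers
dataset, combining the flags documented above:

```bash
python fluid_benchmark.py --model resnet --data_set flowers --device CPU --with_test
```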
* Run distributed training with parameter servers:
  * start parameter servers:

```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
```

  * start trainers:

```bash
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
```

* Run distributed training using NCCL2:

```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
```

## Run Distributed Benchmark on Kubernetes Cluster

We provide a script `kube_gen_job.py` that generates Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:

```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
```

The yaml files are then generated under the directory `myjob`; you can run:

```bash
kubectl create -f myjob/
```

The job should then start.
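
Once the resources are created you can watch the job come up; the label
selectors below match the labels that `kube_gen_job.py` writes into the
generated yaml (pservers run as a ReplicaSet, trainers as a Job):

```bash
kubectl get pods -l paddle-job-pserver=myjob   # parameter server pods
kubectl get pods -l paddle-job=myjob           # trainer pods
kubectl logs -f <one-trainer-pod-name>         # follow a trainer's output
```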
@ -0,0 +1,190 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import yaml
import copy
import argparse
import random
import os
from kube_templates import pserver, trainer, envs


def parse_args():
    parser = argparse.ArgumentParser(description='Generate dist job yamls.')

    parser.add_argument(
        '--jobname', default="paddlejob", help='unique job name')
    parser.add_argument(
        '--cpu', default=1, type=int, help='CPU cores per trainer node')
    parser.add_argument(
        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
    parser.add_argument(
        '--gpu', default=0, type=int, help='num of GPUs per node')
    parser.add_argument(
        '--image',
        default="bootstrapper:5000/fluid_benchmark:gpu",
        help='docker image to use for the job')
    parser.add_argument(
        '--pservers', default=1, type=int, help='num of pservers')
    parser.add_argument(
        '--trainers', default=1, type=int, help='num of trainers')
    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
    parser.add_argument(
        '--psmemory', default=1, type=int, help='pserver memory')
    parser.add_argument(
        '--port', default=30236, type=int, help='pserver port')
    parser.add_argument(
        '--entry', default="python train.py", help='command to run')
    parser.add_argument(
        '--fluid', default=1, type=int, help='whether this is a fluid job')
    parser.add_argument(
        '--rdma', action='store_true', help='whether to mount rdma libs')
    parser.add_argument(
        '--disttype',
        default="pserver",
        type=str,
        choices=['pserver', 'nccl2', 'local'],
        help='pserver or nccl2 or local')

    args = parser.parse_args()
    return args


def gen_job():
    ps = pserver
    tn = trainer
    args = parse_args()

    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
    tn_container = tn["spec"]["template"]["spec"]["containers"][0]

    if args.fluid == 1:
        ps_container["command"] = \
            ["paddle_k8s", "start_fluid"]
        tn_container["command"] = \
            ["paddle_k8s", "start_fluid"]
    ps["metadata"]["name"] = args.jobname + "-pserver"
    ps["spec"]["template"]["metadata"]["labels"][
        "paddle-job-pserver"] = args.jobname
    tn["metadata"]["name"] = args.jobname + "-trainer"
    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname

    ps_container["image"] = args.image
    tn_container["image"] = args.image

    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"

    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
    if args.gpu > 0:
        tn_container["resources"]["requests"][
            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
        tn_container["resources"]["limits"][
            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)

    ps["spec"]["replicas"] = int(args.pservers)
    tn["spec"]["parallelism"] = int(args.trainers)
    tn["spec"]["completions"] = int(args.trainers)
    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
    ps_container["ports"][0]["containerPort"] = args.port
    spreadport = random.randint(40000, 60000)
    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
    tn_container["ports"][0]["containerPort"] = spreadport

    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
    envs.append({"name": "ENTRY", "value": args.entry})
    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
    # NOTE: the directories below are cluster specific, please modify
    # these settings before you run on your own cluster.
    envs.append({
        "name": "LD_LIBRARY_PATH",
        "value":
        "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
    })

    volumes = [{
        "name": "nvidia-driver",
        "hostPath": {
            "path": "/usr/local/nvidia/lib64"
        }
    }]
    volumeMounts = [{
        "mountPath": "/usr/local/nvidia/lib64",
        "name": "nvidia-driver"
    }]

    if args.rdma:
        volumes.extend([{
            "name": "ibetc",
            "hostPath": {
                "path": "/etc/libibverbs.d"
            }
        }, {
            "name": "iblibs",
            "hostPath": {
                "path": "/usr/local/rdma"
            }
        }, {
            "name": "valgrind",
            "hostPath": {
                "path": "/usr/lib64/mlnx_ofed/valgrind"
            }
        }])
        volumeMounts.extend([{
            "mountPath": "/etc/libibverbs.d",
            "name": "ibetc"
        }, {
            "mountPath": "/usr/local/rdma",
            "name": "iblibs"
        }, {
            "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
            "name": "valgrind"
        }])
    # append shm for NCCL2
    volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
    volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})

    tn["spec"]["template"]["spec"]["volumes"] = volumes
    tn_container["volumeMounts"] = volumeMounts

    # NOTE: use copies so the pserver and trainer containers do not share
    # (and cross-pollute) the same env list object.
    ps_container["env"] = copy.deepcopy(envs)
    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
    tn_container["env"] = copy.deepcopy(envs)
    if args.disttype == "pserver":
        tn_container["env"].append({
            "name": "TRAINING_ROLE",
            "value": "TRAINER"
        })
    elif args.disttype == "nccl2" or args.disttype == "local":
        # NCCL2 has no training role, set to plain WORKER
        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})

    os.mkdir(args.jobname)
    if args.disttype == "pserver":
        with open("%s/pserver.yaml" % args.jobname, "w") as fn:
            yaml.dump(ps, fn)

    with open("%s/trainer.yaml" % args.jobname, "w") as fn:
        yaml.dump(tn, fn)


if __name__ == "__main__":
    gen_job()
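
A usage sketch (job name and resource values are hypothetical): for
`--disttype nccl2` the script writes only `trainer.yaml`, so no parameter
servers need to be sized:

```bash
python kube_gen_job.py --jobname mnist-nccl2 --cpu 8 --gpu 8 --memory 40 \
    --trainers 4 --disttype nccl2 \
    --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method nccl2"
kubectl create -f mnist-nccl2/
```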
@ -0,0 +1,58 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pserver import pserver
from trainer import trainer

__all__ = ["pserver", "trainer", "envs"]

envs = [
    # envs that don't need to change
    {
        "name": "GLOG_v",
        "value": "0"
    },
    {
        "name": "GLOG_logtostderr",
        "value": "1"
    },
    {
        "name": "TOPOLOGY",
        "value": ""
    },
    {
        "name": "TRAINER_PACKAGE",
        "value": "/workspace"
    },
    {
        "name": "PADDLE_INIT_NICS",
        "value": "eth2"
    },
    {
        "name": "NAMESPACE",
        "valueFrom": {
            "fieldRef": {
                "fieldPath": "metadata.namespace"
            }
        }
    },
    {
        "name": "POD_IP",
        "valueFrom": {
            "fieldRef": {
                "fieldPath": "status.podIP"
            }
        }
    }
]
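
A minimal sketch of how this shared env list renders into container YAML
(assuming PyYAML is installed and this `kube_templates` package is importable):

```python
# render the shared env list the same way kube_gen_job.py does,
# to inspect the YAML that ends up in each container spec
import yaml
from kube_templates import envs

print(yaml.dump({"env": envs}, default_flow_style=False))
```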
@ -0,0 +1,58 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pserver = {
    "apiVersion": "extensions/v1beta1",
    "kind": "ReplicaSet",
    "metadata": {
        "name": "jobname-pserver"
    },
    "spec": {
        "replicas": 1,
        "template": {
            "metadata": {
                "labels": {
                    "paddle-job-pserver": "jobname"
                }
            },
            "spec": {
                "hostNetwork": True,
                "imagePullSecrets": [{
                    "name": "job-registry-secret"
                }],
                "containers": [{
                    "name": "pserver",
                    "image": "",
                    "imagePullPolicy": "Always",
                    "ports": [{
                        "name": "jobport-1",
                        "containerPort": 1
                    }],
                    "env": [],
                    "command": ["paddle_k8s", "start_pserver"],
                    "resources": {
                        "requests": {
                            "memory": "10Gi",
                            "cpu": "4"
                        },
                        "limits": {
                            "memory": "10Gi",
                            "cpu": "4"
                        }
                    }
                }]
            }
        }
    }
}
@ -0,0 +1,70 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

trainer = {
    "apiVersion": "batch/v1",
    "kind": "Job",
    "metadata": {
        "name": "jobname-trainer"
    },
    "spec": {
        "parallelism": 4,
        "completions": 4,
        "template": {
            "metadata": {
                "labels": {
                    "paddle-job": "jobname"
                }
            },
            "spec": {
                "hostNetwork": True,
                "imagePullSecrets": [{
                    "name": "job-registry-secret"
                }],
                "restartPolicy": "Never",
                "containers": [{
                    "name": "trainer",
                    "image": "",
                    "imagePullPolicy": "Always",
                    # to let container set rlimit
                    "securityContext": {
                        "privileged": True
                        # TODO(wuyi): use below specific cap instead of privileged,
                        # using privileged will cause all GPU devices to be visible
                        # in the container.
                        # "capabilities": {
                        #     "add": ["SYS_RESOURCE"]
                        # }
                    },
                    "ports": [{
                        "name": "jobport-1",
                        "containerPort": 1
                    }],
                    "env": [],
                    "command": ["paddle_k8s", "start_trainer", "v2"],
                    "resources": {
                        "requests": {
                            "memory": "10Gi",
                            "cpu": "4",
                        },
                        "limits": {
                            "memory": "10Gi",
                            "cpu": "4",
                        }
                    }
                }]
            }
        }
    }
}
@ -1,228 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import argparse
import time

import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

SEED = 1
DTYPE = "float32"

# random seed must be set before configuring the network.
# fluid.default_startup_program().random_seed = SEED


def parse_args():
    parser = argparse.ArgumentParser("mnist model benchmark.")
    parser.add_argument(
        '--batch_size', type=int, default=128, help='The minibatch size.')
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations', type=int, default=35, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=5, help='The number of passes.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    parser.add_argument(
        '--with_test',
        action='store_true',
        help='If set, test the testset during training.')
    args = parser.parse_args()
    return args


def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")

    # TODO(dzhwinter) : refine the initializer and random seed setting
    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale)))
    return predict


def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    test_pass_acc = fluid.average.WeightedAverage()
    for batch_id, data in enumerate(test_reader()):
        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
                                data)).astype(DTYPE)
        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])

        acc, weight = exe.run(inference_program,
                              feed={"pixel": img_data,
                                    "label": y_data},
                              fetch_list=[batch_acc, batch_size_tensor])
        test_pass_acc.add(value=acc, weight=weight)
    pass_acc = test_pass_acc.eval()
    return pass_acc


def run_benchmark(model, args):
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()
    start_time = time.time()
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()

    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)

    accuracy = fluid.metrics.Accuracy()
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data = np.array(
                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])

            outs = train_exe.run(
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[
                    avg_cost.name, batch_acc.name, batch_size_tensor.name
                ]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.update(
                value=np.array(np.mean(outs[1])),
                weight=np.mean(np.array(outs[2])))
            iters += 1
            num_samples += len(y_data)
            loss = np.mean(np.array(outs[0]))
            acc = np.mean(np.array(outs[1]))
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                  (pass_id, iters, loss, acc))

        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed

        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                     inference_program)
        exit(0)


def print_arguments(args):
    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
                                vars(args)['device'] == 'GPU')
    print('----------- mnist Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    if args.use_nvprof and args.device == 'GPU':
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            run_benchmark(cnn_model, args)
    else:
        run_benchmark(cnn_model, args)
@ -0,0 +1,17 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = [
    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
]
@ -0,0 +1,94 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import argparse
import time
import cProfile

import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

SEED = 1
DTYPE = "float32"

# random seed must be set before configuring the network.
# fluid.default_startup_program().random_seed = SEED


def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")

    # TODO(dzhwinter) : refine the initializer and random seed setting
    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale)))
    return predict


def get_model(args):
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = cnn_model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()

    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
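
The `get_model(args)` functions in this package all return the same six-tuple,
which keeps the driver generic. A minimal sketch of how a driver such as
`fluid_benchmark.py` (whose diff is not shown here) might consume it locally;
the feed construction mirrors the training loop of the deleted `mnist.py`
above, and `args` is assumed to carry `batch_size` and `device`:

```python
import numpy as np
import paddle.fluid as fluid

# hypothetical driver loop; the real fluid_benchmark.py also handles
# pserver/nccl2 transpilation, profiling and testing
avg_cost, infer_prog, opt, train_reader, test_reader, batch_acc = get_model(args)
opt.minimize(avg_cost)

place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

for batch_id, data in enumerate(train_reader()):
    img = np.array(map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
    lbl = np.array(map(lambda x: x[1], data)).astype("int64").reshape([-1, 1])
    loss, = exe.run(fluid.default_main_program(),
                    feed={"pixel": img, "label": lbl},
                    fetch_list=[avg_cost])
    print("batch %d, loss %f" % (batch_id, np.mean(np.array(loss))))
```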
@ -0,0 +1,161 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import numpy as np
import time

import cProfile, pstats, StringIO

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler


def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
    conv1 = fluid.layers.conv2d(
        input=input,
        filter_size=filter_size,
        num_filters=ch_out,
        stride=stride,
        padding=padding,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv1, act=act)


def shortcut(input, ch_out, stride):
    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
    if ch_in != ch_out:
        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
    else:
        return input


def basicblock(input, ch_out, stride):
    short = shortcut(input, ch_out, stride)
    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')


def bottleneck(input, ch_out, stride):
    short = shortcut(input, ch_out * 4, stride)
    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')


def layer_warp(block_func, input, ch_out, count, stride):
    res_out = block_func(input, ch_out, stride)
    for i in range(1, count):
        res_out = block_func(res_out, ch_out, 1)
    return res_out


def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):

    cfg = {
        # standard ResNet-18 uses two basic blocks per stage
        18: ([2, 2, 2, 2], basicblock),
        34: ([3, 4, 6, 3], basicblock),
        50: ([3, 4, 6, 3], bottleneck),
        101: ([3, 4, 23, 3], bottleneck),
        152: ([3, 8, 36, 3], bottleneck)
    }
    stages, block_func = cfg[depth]
    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
    pool1 = fluid.layers.pool2d(
        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
    pool2 = fluid.layers.pool2d(
        input=res4,
        pool_size=7,
        pool_type='avg',
        pool_stride=1,
        global_pooling=True)
    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
    return out


def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
    assert (depth - 2) % 6 == 0

    n = (depth - 2) // 6

    conv1 = conv_bn_layer(
        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
    res1 = layer_warp(basicblock, conv1, 16, n, 1)
    res2 = layer_warp(basicblock, res1, 32, n, 2)
    res3 = layer_warp(basicblock, res2, 64, n, 2)
    pool = fluid.layers.pool2d(
        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
    return out


def get_model(args):
    model = resnet_cifar10
    if args.data_set == "cifar10":
        class_dim = 10
        if args.data_format == 'NCHW':
            dshape = [3, 32, 32]
        else:
            dshape = [32, 32, 3]
        model = resnet_cifar10
    else:
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet

    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    predict = model(input, class_dim)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
@ -0,0 +1,104 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function

import sys
import time
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools


def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    conv1 = conv_block(input, 64, 2, [0.3, 0])
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


def get_model(args):
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)

    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
@ -1,228 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function

import sys
import time
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    '--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The first num of minibatch num to skip, for better performance test')
parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
    '--learning_rate',
    type=float,
    default=1e-3,
    help="Learning rate for training.")
parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
parser.add_argument(
    '--device',
    type=str,
    default='GPU',
    choices=['CPU', 'GPU'],
    help="The device type.")
parser.add_argument(
    '--data_format',
    type=str,
    default='NCHW',
    choices=['NCHW', 'NHWC'],
    help='The data order, now only support NCHW.')
parser.add_argument(
    '--data_set',
    type=str,
    default='cifar10',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, test the testset during training.')
args = parser.parse_args()


def vgg16_bn_drop(input):
    def conv_block(input, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    conv1 = conv_block(input, 64, 2, [0.3, 0])
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2


def main():
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    # test
    def test(exe):
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            acc, weight = exe.run(inference_program,
                                  feed={"pixel": img_data,
                                        "label": y_data},
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()

    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])

            loss, acc, weight = train_exe.run(
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[
                    avg_cost.name, batch_acc.name, batch_size_tensor.name
                ])
            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
            iters += 1
            num_samples += len(y_data)
            loss = np.mean(np.array(loss))
            acc = np.mean(np.array(acc))
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

            # pass_train_acc = accuracy.eval()
            train_losses.append(loss)
            train_accs.append(acc)
        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        exit(0)


def print_arguments():
    print('----------- vgg Configuration Arguments -----------')
    for arg, value in sorted(vars(args).iteritems()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == "__main__":
    print_arguments()
    main()