Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into mpi_enabled

7 years ago · d2ba05a671
parent 10669f1fe7 a79676e8bf
commit d2ba05a671
765 changed files with 28689 additions and 7707 deletions
--- a/.gitignore
+++ b/.gitignore
@ -25,12 +25,3 @@ third_party/
 # clion workspace.
 cmake-build-*
 # generated while compiling
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/fluid/pybind/pybind.h
 python/paddle/version.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,3 +1,4 @@
 repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
    sha: v1.0.1
    hooks:
@ -25,6 +26,14 @@
        entry: bash ./.clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: local
    hooks:
    -   id: cpplint-cpp-source
        name: cpplint
        description: Check C++ code style using cpplint.py.
        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/.travis.yml
+++ b/.travis.yml
@ -34,7 +34,7 @@ addons:
      - automake
      - libtool
      - ccache
-  ssh_known_hosts: 52.76.173.135
+  ssh_known_hosts: 13.229.163.131
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -39,6 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_TENSORRT    "Compile PaddlePaddle with TensorRT support."   OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@ -53,8 +54,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
-# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
+option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
@ -109,7 +109,7 @@ if (WITH_C_API AND WITH_PYTHON)
 endif()
 if (WITH_C_API)
-  set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
+  set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
 endif()
 if(MOBILE_INFERENCE)
@ -147,6 +147,7 @@ include(external/cares)
 include(external/grpc)
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
@ -181,6 +182,11 @@ if(WITH_GPU)
    include(cuda)
 endif(WITH_GPU)
 # TensorRT depends on GPU.
 if (NOT WITH_GPU)
  set(WITH_TENSORRT OFF)
 endif()
 if(WITH_AMD_GPU)
    find_package(HIP)
    include(hip)
--- a/11
+++ b/11
@ -1,6 +1,6 @@
 # An image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 # Install TensorRT
 # The unnecessary files have been removed to make the library small.
 RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
    tar -xz -C /usr/local && \
    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr
 # git credential to skip password typing
 RUN git config --global credential.helper store
@ -57,7 +64,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@ -36,11 +36,41 @@
 - Trainer Count: 100
 - Metrics: mini-batch / sec
-| Batch Size | 32 | 64 | 128 | 256 |
+
-| -- | -- | -- | -- | -- |
+<table>
-| PaddlePaddle Fluid | - | - | - | - |
+<thead>
-| PaddlePaddle v2 | - | - | - | - |
+<tr>
-| TensorFlow | - | - | - | - |
+<th>Batch Size </th>
 <th> 32</th>
 <th>64</th>
 <th>128 </th>
 <th>256</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td>-</td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 </tbody>
 </table>
 ### Measure the Performance for Different PServer Count
@ -48,11 +78,41 @@
 - Batch Size: 64
 - Metrics: mini-batch / sec
-| PServer Count | 10 | 20 | 40 | 60 |
+
-| -- | -- | -- | -- | -- |
+<table>
-| PaddlePaddle Fluid | - | - | - | - |
+<thead>
-| PaddlePaddle v2 | - | - | - | - |
+<tr>
-| TensorFlow | - | - | - | - |
+<th>PServer Count  </th>
 <th>10</th>
 <th>20</th>
 <th>40 </th>
 <th>60</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td>-</td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 </tbody>
 </table>
 ### Measure Parallel Efficiency By Increasing Trainer Count
@ -67,11 +127,69 @@ The parallel efficiency is:
 $E = \frac{S}{N}$
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
+<table>
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
+<tr>
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
+<th>Trainer Counter  </th>
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+<th>1</th>
 <th>10</th>
 <th>20 </th>
 <th>30</th>
 <th>40</th>
 <th>50</th>
 <th>60 </th>
 <th>70</th>
 <th>80</th>
 <th>90</th>
 <th>100 </th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 </tr>
 </tbody>
 </table>
 ## Reproduce the benchmark
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+<tr>
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+<th>Batch Size </th>
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+<th> 32</th>
 <th>64</th>
 <th>128 </th>
 <th>256</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td> 15.44 </td>
 <td> 16.32 </td>
 <td> 16.74 </td>
 <td> 16.79 </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td> 15.97 </td>
 <td> 17.04 </td>
 <td> 17.60 </td>
 <td> 17.83 </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> 9.09 </td>
 <td> 9.10 </td>
 <td> 9.24 </td>
 <td> 8.66 </td>
 </tr>
 </tbody>
 </table>
 ### Different Batch Size
@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Trainer Count: 20
 - Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+<tr>
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+<th>Batch Size </th>
-| TensorFlow | - | - | - | - |
+<th> 32</th>
-
+<th>64</th>
 <th>128 </th>
 <th>256</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td> 190.20 </td>
 <td> 222.15 </td>
 <td> 247.40 </td>
 <td> 258.18 </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td> 170.96 </td>
 <td> 233.71 </td>
 <td> 256.14 </td>
 <td> 329.23 </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 </tr>
 </tbody>
 </table>
 ### Accelerate Rate
@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec
-| Trainer Count | 20 | 40 | 80 | 100 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+<tr>
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+<th>Trainer Count </th>
-| TensorFlow | - | - | - | - |
+<th>20</th>
 <th>40</th>
 <th>80</th>
 <th>100</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td> 263.29 (78.64%) </td>
 <td> 518.80 (77.47%) </td>
 <td> 836.26 (62.44%) </td>
 <td> 1019.29 (60.89%) </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2 (need more tests)   </td>
 <td> 326.85 (92.85%) </td>
 <td> 534.58 (75.93%) </td>
 <td> 853.30 (60.60%) </td>
 <td> 1041.99 (59.20%) </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 </tr>
 </tbody>
 </table>
 ### Different Pserver Count
@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec
-| PServer Count | 3 | 6 |10 | 20 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+<tr>
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+<th>PServer Count </th>
-| TensorFlow | - | - | - | - |
+<th>3</th>
 <th>6</th>
 <th>10</th>
 <th>20</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid(should fix in next PR) </td>
 <td> 589.1 </td>
 <td> 592.6 </td>
 <td> 656.4 </td>
 <td> 655.8 </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2 (need more tests)   </td>
 <td> 593.4 </td>
 <td> 791.3 </td>
 <td> 729.7 </td>
 <td> 821.7 </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 </tr>
 </tbody>
 </table>
 *The performance gap between Fluid and v2 comes from the network interference.*
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@ -0,0 +1,224 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import numpy as np
 import argparse
 import time
 import paddle.v2 as paddle
 import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 SEED = 1
 DTYPE = "float32"
 # random seed must set before configuring the network.
 # fluid.default_startup_program().random_seed = SEED
 def parse_args():
    """Define the mnist benchmark command line and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes batch_size, skip_batch_num,
        iterations, pass_num, device, infer_only, use_cprof, use_nvprof and
        with_test.
    """
    cli = argparse.ArgumentParser("mnist model benchmark.")

    # Integer options as (flag, default, help); registration order is the
    # order shown by --help.
    int_options = (
        ('--batch_size', 128, 'The minibatch size.'),
        ('--skip_batch_num', 5,
         'The first num of minibatch num to skip, for better performance test'
         ),
        ('--iterations', 35, 'The number of minibatches.'),
        ('--pass_num', 5, 'The number of passes.'), )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    cli.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')

    # Boolean switches; each one is False unless given on the command line.
    switches = (
        ('--infer_only', 'If set, run forward only.'),
        ('--use_cprof', 'If set, use cProfile.'),
        ('--use_nvprof', 'If set, use nvprof for CUDA.'),
        ('--with_test', 'If set, test the testset during training.'), )
    for flag, help_text in switches:
        cli.add_argument(flag, action='store_true', help=help_text)

    return cli.parse_args()
 def cnn_model(data):
    """Build the benchmark CNN: two conv+pool stages and a softmax classifier.

    Args:
        data: input layer fed to the first conv/pool stage.

    Returns:
        The softmax ``fc`` output layer with 10 units.
    """
    # Two identical 5x5 conv + 2x2 pool stages, differing only in the number
    # of filters (20, then 50).
    features = data
    for num_filters in (20, 50):
        features = fluid.nets.simple_img_conv_pool(
            input=features,
            filter_size=5,
            num_filters=num_filters,
            pool_size=2,
            pool_stride=2,
            act="relu")

    # TODO(dzhwinter) : refine the initializer and random seed setting
    num_classes = 10
    # Product of every dimension after the first one of the conv output.
    fan_in = 1
    for dim in features.shape[1:]:
        fan_in = fan_in * dim
    init_scale = (2.0 / (fan_in**2 * num_classes))**0.5

    weight_attr = fluid.param_attr.ParamAttr(
        initializer=fluid.initializer.NormalInitializer(
            loc=0.0, scale=init_scale))
    return fluid.layers.fc(
        input=features, size=num_classes, act="softmax", param_attr=weight_attr)
 def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
    """Run ``inference_program`` over the MNIST test set and return its accuracy.

    Args:
        exe: executor used to run the program.
        batch_acc: per-batch accuracy variable to fetch.
        batch_size_tensor: variable holding the batch size; fetched and used
            as the weight of each batch.
        inference_program: program to execute for every test batch.

    Returns:
        The weighted average of the per-batch accuracies, as produced by
        ``WeightedAverage.eval()``.

    Note:
        The batch size comes from the module-level ``args`` object, so this
        function only works after the ``__main__`` block has parsed the
        command line.
    """
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    test_pass_acc = fluid.average.WeightedAverage()
    for data in test_reader():
        # List comprehensions instead of map(): np.array() needs a real
        # sequence, which map() only returns on Python 2.
        img_data = np.array(
            [x[0].reshape([1, 28, 28]) for x in data]).astype(DTYPE)
        y_data = np.array([x[1] for x in data]).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])

        acc, weight = exe.run(inference_program,
                              feed={"pixel": img_data,
                                    "label": y_data},
                              fetch_list=[batch_acc, batch_size_tensor])
        test_pass_acc.add(value=acc, weight=weight)

    # Evaluate once after the last batch instead of on every iteration; this
    # also avoids returning an unbound local when the reader yields nothing.
    return test_pass_acc.eval()
 def run_benchmark(model, args):
    """Train ``model`` on MNIST for a bounded number of minibatches and print throughput.

    Args:
        model: callable that takes the image data layer and returns the
            prediction layer (``cnn_model`` in this file).
        args: parsed command line; this function reads use_cprof, device,
            batch_size, pass_num, skip_batch_num, iterations and with_test.

    The process is terminated with ``exit(0)`` at the end of the first pass,
    so the function only returns normally when ``args.pass_num`` is 0 or less.
    """
    profile = None
    if args.use_cprof:
        # Imported locally: only this code path needs the profiler modules.
        import cProfile
        import pstats
        profile = cProfile.Profile()
        profile.enable()

    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program: cloned before the optimizer ops are appended.
    inference_program = fluid.default_main_program().clone()

    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)

    accuracy = fluid.metrics.Accuracy()
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            # Restart the clock and the sample counter once the warm-up
            # batches are done, so they do not count towards throughput.
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            # List comprehensions instead of map(): np.array() needs a real
            # sequence, which map() only returns on Python 2.
            img_data = np.array(
                [x[0].reshape([1, 28, 28]) for x in data]).astype(DTYPE)
            y_data = np.array([x[1] for x in data]).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])

            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.update(value=outs[1], weight=outs[2])
            iters += 1
            num_samples += len(y_data)
            loss = np.array(outs[0])
            acc = np.array(outs[1])
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                  (pass_id, iters, loss, acc))

        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed

        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            # NOTE(review): the test accuracy is computed but not reported.
            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                     inference_program)
        if profile is not None:
            # Stop profiling and dump the statistics before the process exits,
            # otherwise --use_cprof produces no output at all.
            profile.disable()
            pstats.Stats(profile).sort_stats('cumulative').print_stats()
        # The benchmark is bounded by --iterations, so stop after one pass.
        exit(0)
 def print_arguments(args):
    """Normalise ``use_nvprof`` on ``args`` and print every argument, sorted by name.

    Args:
        args: argparse.Namespace with at least ``use_nvprof`` and ``device``.
            It is modified in place: ``use_nvprof`` is turned off unless the
            device is 'GPU'.
    """
    settings = vars(args)  # the namespace's own dict, so writes reach `args`
    settings['use_nvprof'] = (settings['use_nvprof'] and
                              settings['device'] == 'GPU')
    print('----------- mnist Configuration Arguments -----------')
    # items() rather than iteritems(): same result under sorted(), and it
    # also exists on Python 3.
    for arg, value in sorted(settings.items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    # `args` stays a module-level name on purpose: eval_test() reads
    # args.batch_size from it.
    args = parse_args()
    print_arguments(args)
    if not (args.use_nvprof and args.device == 'GPU'):
        run_benchmark(cnn_model, args)
    else:
        # Run the benchmark inside the CUDA profiler context, which is given
        # "cuda_profiler.txt" and the 'csv' mode.
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            run_benchmark(cnn_model, args)
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@ -0,0 +1,105 @@
 #!/bin/bash
 # This script benchmarks PaddlePaddle Fluid on
 # single thread single GPU.
 #
 # This part exports the threading, affinity and CUDA environment used by the
 # benchmark commands further down, and starts a background nvidia-smi logger
 # that appends to mem.log once per second.

 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 # Default cuDNN location; a CUDNN_PATH already set by the caller wins.
 export CUDNN_PATH="${CUDNN_PATH:-/paddle/cudnn_v5}"

 # disable openmp and mkl parallel
 # https://github.com/PaddlePaddle/Paddle/issues/7199
 export MKL_NUM_THREADS=1
 export OMP_NUM_THREADS=1

 # Threads per core as reported by lscpu; 1 means hyper-threading is off.
 ht=$(lscpu | grep "per core" | awk -F':' '{print $2}' | xargs)
 # Quote and default the value: with an empty $ht the unquoted test fails with
 # a syntax error. An empty value still takes the "HT is ON" branch, as before.
 if [ "${ht:-0}" -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
 else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
 fi

 # disable multi-gpu if have more than one
 export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH

 # only query the gpu used
 # NOTE(review): the logger is left running in the background; nothing in this
 # part of the script stops it.
 nohup stdbuf -oL nvidia-smi \
      --id=${CUDA_VISIBLE_DEVICES} \
      --query-gpu=timestamp \
      --query-compute-apps=pid,process_name,used_memory \
      --format=csv \
      --filename=mem.log  \
      -l 1 &
 # Each benchmark below runs with FLAGS_benchmark=true, line-buffered stdout,
 # and appends its combined stdout/stderr to a per-benchmark log file.
 # Script paths are relative, so run this from the directory that contains
 # the fluid/ folder.

 # mnist
 # mnist gpu, batch size 128
 FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=500 \
               2>&1 | tee -a mnist_gpu_128.log

 # vgg16
 # NOTE(review): the VGG16 benchmark script added next to this file is
 # fluid/vgg.py; confirm no fluid/vgg16.py alias is expected.
 # gpu cifar10 128
 FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a vgg16_gpu_128.log

 # flowers gpu 32
 FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
               --device=GPU \
               --batch_size=32 \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a vgg16_gpu_flowers_32.log

 # resnet50
 # NOTE(review): the ResNet benchmark script next to this file is
 # fluid/resnet.py; confirm no fluid/resnet50.py alias is expected.
 # resnet50 gpu cifar10 128
 FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --model=resnet_cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_128.log

 # resnet50 gpu flowers 64
 FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
               --model=resnet_imagenet \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_flowers_64.log

 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
 FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
               --device=GPU \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
               --hidden_dim=512 \
               --emb_dim=512 \
               --crop_size=1500 \
               2>&1 | tee -a lstm_gpu_32.log

 # seq2seq
 # seq2seq gpu, batch size 128
 # NOTE(review): the log file keeps its original "lstm_" name even though this
 # is the machine translation benchmark; rename only if nothing parses it.
 FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a lstm_gpu_128.log
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@ -0,0 +1,236 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import argparse
 import cPickle
 import os
 import random
 import time
 import numpy
 import paddle.v2 as paddle
 import paddle.v2.dataset.imdb as imdb
 import paddle.fluid as fluid
 from paddle.v2 import batch
 import paddle.fluid.profiler as profiler
 def parse_args():
    """Define the LSTM sentiment benchmark command line and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes batch_size, skip_batch_num,
        iterations, emb_dim, hidden_dim, pass_num, device, crop_size and
        with_test. The default of ``crop_size`` is taken from the CROP_SIZE
        environment variable when it is set, otherwise 1500.
    """
    cli = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")

    # Integer options as (flag, default, help); registration order is the
    # order shown by --help.
    int_options = (
        ('--batch_size', 32,
         'The sequence number of a batch data. (default: %(default)d)'),
        ('--skip_batch_num', 5,
         'The first num of minibatch num to skip, for better performance test'
         ),
        ('--iterations', 80, 'The number of minibatches.'),
        ('--emb_dim', 512,
         'Dimension of embedding table. (default: %(default)d)'),
        ('--hidden_dim', 512,
         'Hidden size of lstm unit. (default: %(default)d)'),
        ('--pass_num', 100, 'Epoch number to train. (default: %(default)d)'), )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    cli.add_argument(
        '--device',
        type=str,
        default='CPU',
        choices=['CPU', 'GPU'],
        help='The device type.')

    env_crop_size = int(os.environ.get('CROP_SIZE', '1500'))
    cli.add_argument(
        '--crop_size',
        type=int,
        default=env_crop_size,
        help='The max sentence length of input. Since this model use plain RNN,'
        ' Gradient could be explored if sentence is too long')

    cli.add_argument(
        '--with_test',
        action='store_true',
        help='If set, test the testset during training.')

    return cli.parse_args()
 # IMDB word dictionary, loaded as soon as this module is imported. Below it
 # is indexed by token string ('<unk>') and its length sizes the embedding.
 word_dict = imdb.word_dict()
 def crop_sentence(reader, crop_size):
    """Wrap ``reader`` so that it only yields sufficiently short samples.

    Args:
        reader: callable returning an iterable of samples; ``sample[0]`` is
            the sequence of word ids.
        crop_size: a sample is kept only when the number of its ids that
            differ from the '<unk>' id is strictly below this value.

    Returns:
        A reader callable that yields the kept samples unchanged.
    """
    unk_id = word_dict['<unk>']

    def __impl__():
        for sample in reader():
            # Unknown-word ids do not count towards the length limit.
            known_count = sum(1 for token in sample[0] if token != unk_id)
            if known_count < crop_size:
                yield sample

    return __impl__
 def main():
    """Build the dynamic-RNN sentiment network, train it on IMDB and print throughput.

    The command line is parsed again here via parse_args(). Training stops
    once ``args.iterations`` minibatches have run, and the process exits with
    ``exit(0)`` at the end of the first pass.
    """
    args = parse_args()
    lstm_size = args.hidden_dim
    # Word-id input (lod_level=1), embedded and projected to lstm_size.
    data = fluid.layers.data(
        name="words", shape=[1], lod_level=1, dtype='int64')
    sentence = fluid.layers.embedding(
        input=data, size=[len(word_dict), args.emb_dim])
    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        word = rnn.step_input(sentence)
        # Hidden and cell memories, both initialised to 0.0.
        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
        def gate_common(
                ipt,
                hidden,
                size, ):
            # Pre-activation of one gate: fc(input) with bias plus fc(hidden)
            # without bias. Every call creates its own pair of fc layers.
            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
            gate = fluid.layers.sums(input=[gate0, gate1])
            return gate
        forget_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        input_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        output_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        cell_gate = fluid.layers.tanh(
            x=gate_common(word, prev_hidden, lstm_size))
        # cell = forget_gate * prev_cell + input_gate * cell_gate
        cell = fluid.layers.sums(input=[
            fluid.layers.elementwise_mul(
                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
                    x=input_gate, y=cell_gate)
        ])
        # hidden = output_gate * tanh(cell)
        hidden = fluid.layers.elementwise_mul(
            x=output_gate, y=fluid.layers.tanh(x=cell))
        rnn.update_memory(prev_cell, cell)
        rnn.update_memory(prev_hidden, hidden)
        rnn.output(hidden)
    # Classify from the 'last' pooling of the RNN output sequence.
    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
    loss = fluid.layers.cross_entropy(
        input=logit,
        label=fluid.layers.data(
            name='label', shape=[1], dtype='int64'))
    loss = fluid.layers.mean(x=loss)
    # add acc
    # NOTE(review): the 'label' data layer is declared a second time here with
    # the same name, shape and dtype as for the loss above.
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
                shape=[1], dtype='int64'), total=batch_size_tensor)
    # Cloned before the optimizer is applied.
    # NOTE(review): inference_program is not used again in this function, and
    # args.with_test is never read.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])
    adam = fluid.optimizer.Adam()
    adam.minimize(loss)
    fluid.memory_optimize(fluid.default_main_program())
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    # Shuffled IMDB training data, filtered by crop_sentence() and batched.
    train_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.train(word_dict), args.crop_size),
            buf_size=25000),
        batch_size=args.batch_size)
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            # Restart the clock and the counter once the warm-up batches are
            # done, so they do not count towards the reported throughput.
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            tensor_words = to_lodtensor([x[0] for x in data], place)
            label = numpy.array([x[1] for x in data]).astype("int64")
            label = label.reshape((-1, 1))
            loss_np, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={"words": tensor_words,
                      "label": label},
                fetch_list=[loss, batch_acc, batch_size_tensor])
            iters += 1
            # Adds the length of every sequence, so num_samples counts word
            # ids rather than sentences, although the summary below labels
            # the figure as examples.
            for x in data:
                num_samples += len(x[0])
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss_np, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # Terminates the process after the first pass.
        exit(0)
 def to_lodtensor(data, place):
    """Pack a list of sequences into a single ``fluid.LoDTensor``.

    Args:
        data: list of sequences; they are concatenated along axis 0.
        place: device place handed to ``LoDTensor.set``.

    Returns:
        A LoDTensor holding the concatenated values as an int64 column
        (one value per row) with a single LoD level of cumulative sequence
        offsets starting at 0.
    """
    # Running offsets: [0, len(s0), len(s0) + len(s1), ...].
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))

    flat = numpy.concatenate(data, axis=0).astype("int64")
    flat = flat.reshape([len(flat), 1])

    tensor = fluid.LoDTensor()
    tensor.set(flat, place)
    tensor.set_lod([offsets])
    return tensor
 def print_arguments(args):
    """Print every parsed argument as ``name: value``, sorted by name.

    Args:
        args: argparse.Namespace (or any object supporting ``vars()``).
    """
    print('----------- lstm Configuration Arguments -----------')
    # items() rather than iteritems(): same result under sorted(), and it
    # also exists on Python 3.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 # Script entry point: echo the parsed configuration, then run the benchmark.
 if __name__ == '__main__':
    # Parsed here only so the configuration can be printed; main() calls
    # parse_args() again for its own use.
    args = parse_args()
    print_arguments(args)
    main()
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@ -0,0 +1,224 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """VGG16 benchmark in Fluid"""
 from __future__ import print_function
 import sys
 import time
 import numpy as np
 import paddle.v2 as paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
 # Command-line interface of the VGG16 benchmark. The parser uses the module
 # docstring as its description.
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    '--batch_size', type=int, default=128, help="Batch size for training.")
 # Warm-up mini-batches, excluded from the throughput measurement.
 parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The first num of minibatch num to skip, for better performance test')
 parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
    '--learning_rate',
    type=float,
    default=1e-3,
    help="Learning rate for training.")
 parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
 parser.add_argument(
    '--device',
    type=str,
    default='GPU',
    choices=['CPU', 'GPU'],
    help="The device type.")
 # NOTE(review): the help text says only NCHW is supported, yet 'NHWC' is an
 # accepted choice and main() builds an NHWC data shape for it -- confirm which
 # one is intended.
 parser.add_argument(
    '--data_format',
    type=str,
    default='NCHW',
    choices=['NCHW', 'NHWC'],
    help='The data order, now only support NCHW.')
 parser.add_argument(
    '--data_set',
    type=str,
    default='cifar10',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
 parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, test the testset during training.')
 # Parsed at import time; the functions below read this module-level `args`.
 args = parser.parse_args()
 def vgg16_bn_drop(input):
    """Build the VGG16 feature extractor with batch norm and dropout.

    Five convolution groups (each followed by 2x2 max pooling) are stacked on
    `input`, followed by dropout -> fc(512) -> batch_norm(relu) -> dropout ->
    fc(512).

    Args:
        input: the image variable fed to the first convolution group.

    Returns:
        The output variable of the second 512-unit fully connected layer.
    """

    def conv_block(input, num_filter, groups, dropouts):
        # One group: `groups` 3x3 relu convolutions with batch norm and the
        # given per-convolution dropout rates, closed by a 2x2 max pool.
        return fluid.nets.img_conv_group(
            input=input,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')

    # (filters, convolutions in the group, dropout rate after each convolution)
    group_specs = (
        (64, 2, [0.3, 0]),
        (128, 2, [0.4, 0]),
        (256, 3, [0.4, 0.4, 0]),
        (512, 3, [0.4, 0.4, 0]),
        (512, 3, [0.4, 0.4, 0]), )
    features = input
    for num_filter, groups, dropouts in group_specs:
        features = conv_block(features, num_filter, groups, dropouts)

    dropped = fluid.layers.dropout(x=features, dropout_prob=0.5)
    hidden = fluid.layers.fc(input=dropped, size=512, act=None)
    normalized = fluid.layers.batch_norm(input=hidden, act='relu')
    dropped_again = fluid.layers.dropout(x=normalized, dropout_prob=0.5)
    return fluid.layers.fc(input=dropped_again, size=512, act=None)
 def main():
    """Train VGG16 with Fluid on the selected dataset and print throughput.

    All settings are read from the module-level `args`. The function builds
    the network, an inference clone and an Adam optimizer, then trains for at
    most `args.iterations` mini-batches. Timing restarts once
    `args.skip_batch_num` warm-up batches have run. After the first pass it
    prints the mean loss/accuracy and examples per second, optionally
    evaluates the test set, and terminates the process with exit(0).
    """
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # Inference program: cloned before the optimizer ops are appended.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    optimizer.minimize(avg_cost)
    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # Data readers
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    def batch_to_arrays(data):
        """Turn a list of (image, label) samples into float32/int64 feeds."""
        # List comprehensions rather than map(): on Python 3 map() returns an
        # iterator, which np.array() would not expand into a batch array.
        img_data = np.array(
            [sample[0].reshape(data_shape) for sample in data]).astype("float32")
        y_data = np.array([sample[1] for sample in data]).astype("int64")
        return img_data, y_data.reshape([-1, 1])

    def test(exe):
        """Return the sample-weighted accuracy over the whole test reader."""
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data, y_data = batch_to_arrays(data)
            acc, weight = exe.run(inference_program,
                                  feed={"pixel": img_data,
                                        "label": y_data},
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()

    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                # Warm-up is over: restart the clock and the sample counter.
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            img_data, y_data = batch_to_arrays(data)
            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            # Record every batch so the pass summary below is a real mean
            # rather than just the last batch's values.
            train_losses.append(loss)
            train_accs.append(acc)
            iters += 1
            num_samples += len(y_data)
            # `acc` is the accuracy of the current batch (the batch_acc fetch).
            print("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                  (pass_id, iters, loss, acc))

        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            # NOTE(review): the test accuracy is computed but never reported.
            pass_test_acc = test(exe)
        # The benchmark ends after the first pass; the iteration cap above,
        # not pass_num, bounds the amount of work.
        exit(0)
 def print_arguments():
    """Print every option of the module-level `args` as ``name: value``.

    Options are listed in sorted name order between a header and footer line.
    """
    print('----------- vgg Configuration Arguments -----------')
    # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 # Script entry point: echo the parsed options, then run the benchmark.
 # Both calls take no parameters and rely on the module-level `args`.
 if __name__ == "__main__":
    print_arguments()
    main()
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@ -0,0 +1,180 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import argparse
 import time
 import numpy as np
 import tensorflow as tf
 import paddle.v2 as paddle
 DTYPE = tf.float32
 def parse_args():
    """Parse the command-line options of the mnist benchmark.

    Returns:
        The argparse namespace with `batch_size`, `iterations`, `pass_num`
        and `device`.
    """
    parser = argparse.ArgumentParser("mnist model benchmark.")
    # The integer options all have the same shape, so register them from a
    # table; the order matches the order shown by --help.
    int_options = (
        ('--batch_size', 128, 'The minibatch size.'),
        ('--iterations', 35, 'The number of minibatches.'),
        ('--pass_num', 5, 'The number of passes.'), )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    return parser.parse_args()
 def run_benchmark(args):
    """Train a LeNet-style CNN on MNIST with TensorFlow and print timings.

    The graph is conv(5x5,20)-relu-pool, conv(5x5,50)-relu-pool and a
    10-way fully connected softmax layer, trained with Adam. For each pass
    the function prints per-batch loss/error/elapsed time, then the streaming
    training accuracy, the test accuracy and the pass duration.

    Args:
        args: parsed options; `device`, `batch_size` and `pass_num` are read.
    """

    def weight_variable(dtype, shape):
        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
        return tf.Variable(initial)

    def bias_variable(dtype, shape):
        initial = tf.constant(0.1, shape=shape, dtype=dtype)
        return tf.Variable(initial)

    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
    with tf.device(device):
        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
        labels = tf.placeholder(tf.int64, shape=(None, ))

        # conv1, relu, pool1
        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
        conv1_bias = bias_variable(DTYPE, [20])
        conv1 = tf.nn.conv2d(
            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
        pool1 = tf.nn.max_pool(
            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")

        # conv2, relu, pool2
        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
        conv2_bias = bias_variable(DTYPE, [50])
        conv2 = tf.nn.conv2d(
            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
        pool2 = tf.nn.max_pool(
            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")

        # FC: flatten every non-batch dimension of pool2.
        pool_shape = pool2.get_shape().as_list()
        # Plain loop instead of reduce(), which is not a builtin on Python 3.
        hidden_dim = 1
        for dim in pool_shape[1:]:
            hidden_dim *= dim
        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
        fc_bias = bias_variable(DTYPE, [10])
        logits = tf.matmul(reshape, fc_weights) + fc_bias

        # Get prediction
        prediction = tf.nn.softmax(logits)

        # Loss
        one_hot_labels = tf.one_hot(labels, depth=10)
        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
        avg_cost = tf.reduce_mean(cost)

        # Get accuracy
        correct = tf.equal(tf.argmax(prediction, 1), labels)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        # Streaming accuracy metric plus an op that resets its local variables.
        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
            g_accuracy = tf.metrics.accuracy(
                labels, tf.argmax(
                    prediction, axis=1))
            # Named metric_vars so the builtin vars() is not shadowed.
            metric_vars = tf.contrib.framework.get_variables(
                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
            g_accuracy_reset_op = tf.variables_initializer(metric_vars)

        # Optimizer
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001, beta1=0.9, beta2=0.999)
        train_op = opt.minimize(avg_cost)

    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)

    def to_feed(data):
        """Build the feed dict for one batch of (flat image, label) samples."""
        # Each image is reshaped to [1, 28, 28] and transposed to
        # [28, 28, 1] to match the `images` placeholder. List comprehensions
        # are used because map() returns an iterator on Python 3.
        images_data = np.array([
            np.transpose(
                sample[0].reshape([1, 28, 28]), axes=[1, 2, 0])
            for sample in data
        ]).astype("float32")
        labels_data = np.array([sample[1] for sample in data]).astype("int64")
        return {images: images_data, labels: labels_data}

    def eval_test():
        """Return the streaming accuracy over the whole test reader."""
        # `sess` is the session opened below; it exists by the time this runs.
        sess.run(g_accuracy_reset_op)
        for batch_id, data in enumerate(test_reader()):
            loss, acc, g_acc = sess.run(
                [avg_cost, accuracy, g_accuracy], feed_dict=to_feed(data))
        # g_accuracy is a (value, update_op) pair; index 1 is the updated value.
        return g_acc[1]

    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_g)
        sess.run(init_l)
        for pass_id in range(args.pass_num):
            sess.run(g_accuracy_reset_op)

            pass_start = time.time()
            for batch_id, data in enumerate(train_reader()):
                feed = to_feed(data)
                start = time.time()
                _, loss, acc, g_acc = sess.run(
                    [train_op, avg_cost, accuracy, g_accuracy],
                    feed_dict=feed)
                end = time.time()

                # time.time() differences are already in seconds; the former
                # division by 1000 produced a meaningless value.
                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                      (pass_id, batch_id, loss, 1 - acc, end - start))

            pass_end = time.time()

            test_avg_acc = eval_test()

            print(
                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc, pass_end - pass_start))
 def print_arguments(args):
    """Print every parsed option as ``name: value``, sorted by option name.

    Args:
        args: namespace object (e.g. returned by argparse) whose attributes
            are listed between a header and a footer line.
    """
    print('-----------  Configuration Arguments -----------')
    # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 # Script entry point: parse the options, echo them, then run the benchmark.
 if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    run_benchmark(args)
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@ -0,0 +1,220 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import numpy as np
 import argparse
 import time
 import tensorflow as tf
 import paddle.v2 as paddle
 def parse_args():
    """Parse the command-line options of the stacked LSTM benchmark.

    Returns:
        The argparse namespace with `batch_size`, `stacked_num`,
        `embedding_dim`, `hidden_dim`, `pass_num`, `learning_rate` and
        `infer_only`.
    """
    parser = argparse.ArgumentParser("LSTM model benchmark.")
    # The integer options all have the same shape, so register them from a
    # table; the order matches the order shown by --help.
    int_options = (
        ('--batch_size', 32,
         'The sequence number of a batch data. (default: %(default)d)'),
        ('--stacked_num', 5,
         'Number of lstm layers to stack. (default: %(default)d)'),
        ('--embedding_dim', 512,
         'Dimension of embedding table. (default: %(default)d)'),
        ('--hidden_dim', 512,
         'Hidden size of lstm unit. (default: %(default)d)'),
        ('--pass_num', 10, 'Epoch number to train. (default: %(default)d)'), )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.0002,
        help='Learning rate used to train. (default: %(default)f)')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    return parser.parse_args()
 def print_arguments(args):
    """Print every parsed option as ``name: value``, sorted by option name.

    Args:
        args: namespace object (e.g. returned by argparse) whose attributes
            are listed between a header and a footer line.
    """
    print('-----------  Configuration Arguments -----------')
    # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 def dynamic_lstm_model(dict_size,
                       embedding_dim,
                       hidden_dim,
                       stacked_num,
                       class_num=2,
                       is_train=True):
    """Build a stacked dynamic-LSTM sequence classifier graph.

    Args:
        dict_size: number of rows of the word embedding table.
        embedding_dim: width of each embedding vector.
        hidden_dim: number of units of every LSTM layer.
        stacked_num: number of LSTM layers to stack.
        class_num: number of output classes.
        is_train: when False, only the inference outputs are built.

    Returns:
        If `is_train` is False: ``((word_idx, sequence_length), softmax)``.
        Otherwise: ``((word_idx, sequence_length, label), avg_loss, acc,
        g_acc, reset_op)`` where `g_acc` is the tf.metrics.accuracy pair and
        `reset_op` re-initializes that metric's local variables.
    """
    word_idx = tf.placeholder(tf.int64, shape=[None, None])
    sequence_length = tf.placeholder(tf.int64, shape=[None, ])

    embedding_weights = tf.get_variable('word_embeddings',
                                        [dict_size, embedding_dim])
    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)

    # One cell object per layer. Repeating a single object
    # ([lstm_cell] * stacked_num) makes every layer share the same weights on
    # TensorFlow >= 1.2 instead of giving each layer its own parameters.
    cells = [
        tf.nn.rnn_cell.LSTMCell(
            num_units=hidden_dim, use_peepholes=False)
        for _ in range(stacked_num)
    ]
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells)

    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
    _, final_state = tf.nn.dynamic_rnn(
        cell=stacked_cell,
        inputs=embedding,
        dtype=tf.float32,
        sequence_length=sequence_length)

    w = tf.Variable(
        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
    bias = tf.Variable(
        tf.constant(
            value=0.0, shape=[class_num], dtype=tf.float32))
    # Classify from the hidden state (h) of the top layer.
    prediction = tf.matmul(final_state[-1][1], w) + bias

    if not is_train:
        return (word_idx, sequence_length), tf.nn.softmax(prediction)

    label = tf.placeholder(tf.int64, shape=[None, ])
    # The one-hot depth follows class_num so it always matches the logits.
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(label, class_num), logits=prediction)
    avg_loss = tf.reduce_mean(loss)

    correct_count = tf.equal(tf.argmax(prediction, 1), label)
    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))

    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
        # Named metric_vars so the builtin vars() is not shadowed.
        metric_vars = tf.contrib.framework.get_variables(
            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
        reset_op = tf.variables_initializer(metric_vars)

    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
 def padding_data(data, padding_size, value):
    """Return `data` padded with `value` and cut to `padding_size` items.

    A new list is returned; the input list is not modified.
    """
    # Appending padding_size fillers always yields at least padding_size
    # items, so the slice below is what fixes the final length.
    filler = [value] * padding_size
    padded = data + filler
    return padded[:padding_size]
 def train(args):
    """Train the stacked dynamic LSTM on IMDB and print accuracy/throughput.

    For every pass the function trains on the shuffled training set, printing
    per-batch loss and accuracy, then evaluates on the test set and prints
    the test accuracy, words per second and seconds per pass.

    Args:
        args: parsed options; `embedding_dim`, `hidden_dim`, `stacked_num`,
            `learning_rate`, `batch_size` and `pass_num` are read.
    """
    word_dict = paddle.dataset.imdb.word_dict()
    dict_size = len(word_dict)

    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)

    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    train_op = adam_optimizer.minimize(avg_loss)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=25000),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.test(word_dict), buf_size=25000),
        batch_size=args.batch_size)

    def make_feed(data):
        """Build (feed_dict, sequence_length) for a batch of (words, label)."""
        # Real lists, not map(): the sequences are iterated twice below, which
        # would silently yield nothing the second time on Python 3.
        sequences = [sample[0] for sample in data]
        sequence_length = np.array(
            [len(seq) for seq in sequences]).astype('int64')
        # Pad every sequence with 0 up to the longest one in the batch.
        maxlen = np.max(sequence_length)
        word_idx = np.array(
            [padding_data(seq, maxlen, 0) for seq in sequences]).astype('int64')
        label = np.array([sample[1] for sample in data]).astype('int64')
        feed = {
            feeding_list[0]: word_idx,
            feeding_list[1]: sequence_length,
            feeding_list[2]: label
        }
        return feed, sequence_length

    def do_validation(sess):
        """Return the streaming accuracy over the whole test reader."""
        sess.run(reset_op)
        for data in test_reader():
            feed, _ = make_feed(data)
            # Evaluation only: train_op must not be fetched here, otherwise
            # the model would be optimized on the test set.
            fetch_g_acc = sess.run(g_acc, feed_dict=feed)
        # g_acc is a (value, update_op) pair; index 1 is the updated value.
        return fetch_g_acc[1]

    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_l)
        sess.run(init_g)

        for pass_id in range(args.pass_num):
            # clear accuracy local variable
            sess.run(reset_op)
            pass_start_time = time.time()
            words_seen = 0

            for batch_id, data in enumerate(train_reader()):
                feed, sequence_length = make_feed(data)
                words_seen += np.sum(sequence_length)
                _, loss, fetch_acc, fetch_g_acc = sess.run(
                    [train_op, avg_loss, acc, g_acc], feed_dict=feed)

                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))

            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            words_per_sec = words_seen / time_consumed
            test_acc = do_validation(sess)
            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
                  (pass_id, test_acc, words_per_sec, time_consumed))
 if __name__ == '__main__':
    # Script entry point: parse and echo the options, then train unless
    # --infer_only was given (inference-only mode currently does nothing).
    args = parse_args()
    print_arguments(args)
    if not args.infer_only:
        train(args)
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -62,29 +62,33 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
  "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+if(NOT CMAKE_CROSSCOMPILING)
-  ${REFERENCE_CBLAS_ROOT}/include
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  /usr/include
+    ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include/cblas
+    /usr/include
-)
+    /usr/include/cblas
-
+  )
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
+
-  ${REFERENCE_CBLAS_ROOT}/lib
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  /usr/lib
+    ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib/blas/reference/
+    /usr/lib
-  /usr/lib/reference/
+    /usr/lib/blas/reference/
-)
+    /usr/lib/reference/
  )
 else()
  # Disable the finding of reference cblas under host's system path
  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
 endif()
 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
 find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
  set(CBLAS_FOUND ON)
  set(CBLAS_PROVIDER REFERENCE)
  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()
 ExternalProject_Add(
    extern_grpc
    DEPENDS protobuf zlib
    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.10.x"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
 INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL           "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
@ -1,67 +0,0 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 if(NOT WITH_GPU)
  return()
 endif()
 include(ExternalProject)
 set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
 include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
 # Select the build/install commands handed to ExternalProject_Add for nccl.
 if(WITH_DSO)
  # If we use DSO, we do not build nccl, just download the dependencies
  # (empty commands disable the corresponding ExternalProject steps).
  set(NCCL_BUILD_COMMAND "")
  set(NCCL_INSTALL_COMMAND "")
  set(NCCL_INSTALL_DIR "")
 else()
  # otherwise, we build nccl and link it.
  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
  # Note: cuda 8.0 is needed to make nccl
  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
  # NOTE(review): each command is stored as ONE quoted string and is later
  # expanded inside quotes, so it presumably reaches ExternalProject_Add as a
  # single argument ("make -j 8") rather than a program plus arguments --
  # verify that the build step actually runs.
  set(NCCL_BUILD_COMMAND "make -j 8")
  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
 endif()
 # Fetch nccl v1.3.4-1 from GitHub into ${NCCL_SOURCE_DIR}. The update,
 # configure and test steps are disabled (empty commands); the build and
 # install steps use the NCCL_*_COMMAND variables, which are empty when
 # WITH_DSO is set.
 ExternalProject_Add(
    extern_nccl
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
    GIT_TAG         "v1.3.4-1"
    PREFIX          "${NCCL_SOURCE_DIR}"
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""
    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
    TEST_COMMAND      ""
 )
 # Define the `nccl` target that other targets depend on.
 if(WITH_DSO)
  # With DSO nothing is linked, so `nccl` is only a placeholder target.
  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
    # For CMake older than 3.3.0 a static library built from a generated
    # one-line C file is used instead of an INTERFACE library.
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
    add_library(nccl STATIC ${dummyfile})
  else()
    add_library(nccl INTERFACE)
  endif()
 else()
  # Without DSO, `nccl` is the static archive installed by extern_nccl.
  add_library(nccl STATIC IMPORTED GLOBAL)
  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
 endif()
 # Make sure the external project runs before anything that uses `nccl`.
 add_dependencies(nccl extern_nccl)
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
    return()
-ENDIF()
+endif()
 include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio
-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
 set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 ExternalProject_Add(
    extern_snappy
@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
    return()
 ENDIF()
@ -21,9 +20,11 @@ include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio
-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
 set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 ExternalProject_Add(
        extern_snappystream
@ -51,8 +52,9 @@ ExternalProject_Add(
 )
 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
-        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
 include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
 include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
 add_dependencies(snappystream extern_snappystream)
--- a/Show More
+++ b/Show More