Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into change_manylinux1_Docker

7 years ago · ba84a6b7ed
parent e38eca26e2 d06849305a
commit ba84a6b7ed
1988 changed files with 66462 additions and 15448 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -23,7 +23,7 @@ repos:
    -   id: clang-format-with-version-check
        name: clang-format
        description: Format files with ClangFormat.
-        entry: bash ./.clang_format.hook -i
+        entry: bash ./tools/codestyle/clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: local
@ -52,7 +52,7 @@ repos:
    hooks:
    -   id: copyright_checker
        name: copyright_checker
-        entry: python ./.copyright.hook
+        entry: python ./tools/codestyle/copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
--- a/.travis.yml
+++ b/.travis.yml
@ -27,15 +27,6 @@ script:
    # 43min timeout
    paddle/scripts/paddle_docker_build.sh ${JOB}
    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
  - |
    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
    # For document only
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
    export DOCS_DIR=`pwd`
    cd ..
    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
  email:
    on_success: change
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -22,6 +22,7 @@
 | jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
 | kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |
@ -45,6 +46,7 @@
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
 | velconia | Qi-Yang Min |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -55,12 +55,25 @@ option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
-option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
+option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
 # PY_VERSION
 if(NOT PY_VERSION)
  set(PY_VERSION 2.7)
 endif()
 set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@ -98,6 +111,11 @@ if(ANDROID OR IOS)
    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 if (APPLE OR WIN32)
    set(WITH_MKL OFF CACHE STRING
        "Disable MKL for building on mac and windows" FORCE)
 endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
  "A path setting third party libraries download & build directories.")
@ -120,6 +138,12 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
 endif()
 if(WITH_MKL)
  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
  if (MKL_SPLIT_GEMM)
    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
  endif()
 endif()
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
    if (WITH_MKL AND AVX2_FOUND)
@ -129,9 +153,15 @@ if (NOT DEFINED WITH_MKLDNN)
        set(WITH_MKLDNN OFF)
    endif()
 endif()
 if (REPLACE_ENFORCE_GLOG)
  add_definitions("-DREPLACE_ENFORCE_GLOG")
 endif()
 ########################################################################################
 include(external/mklml)     # download mklml package
 include(external/xbyak)     # download xbyak package
 include(external/libxsmm)   # download, build, install libxsmm
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@ -147,11 +177,40 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 if(WITH_DISTRIBUTE)
    if(WITH_GRPC)
        include(external/grpc)
        message(STATUS "Use grpc framework.")
    else()
        message(STATUS "Use brpc framework.")
        include(external/leveldb)
        include(external/brpc)
    endif()
 endif()
 if(WITH_BRPC_RDMA)
    message(STATUS "Use brpc with rdma.")
    if(WITH_GRPC)
        message(FATAL_ERROR "Can't use grpc with brpc rdma.")
    endif()
    if(NOT WITH_DISTRIBUTE)
        message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
    endif()
 endif()
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
    include(external/anakin)
 endif()
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
 include(configure)          # add paddle env configuration
@ -167,7 +226,7 @@ include(inference_lib)      # add paddle fluid inference libraries
 include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
@ -180,11 +239,6 @@ set(EXTERNAL_LIBS
    ${PYTHON_LIBRARIES}
 )
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
 endif(WITH_GPU)
 if(WITH_AMD_GPU)
    find_package(HIP)
    include(hip)
@ -194,6 +248,10 @@ if(WITH_MKLML)
    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()
 if(WITH_LIBXSMM)
    list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
 endif()
 if(WITH_MKLDNN)
    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
@ -208,7 +266,7 @@ add_subdirectory(proto)
 if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
    # "add_subdirectory(go)" should be placed after the following loine,
    # because it depends on paddle/optimizer.
-    add_subdirectory(paddle/optimizer)
+    add_subdirectory(paddle/legacy/optimizer)
 endif()
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
@ -233,7 +291,3 @@ if(WITH_DOC)
    find_python_module(recommonmark REQUIRED)
    add_subdirectory(doc)
 endif()
 if (WITH_CONTRIB)
    add_subdirectory(paddle/contrib)
 endif()
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -159,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the
 - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
 - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
 - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
--- a/6
+++ b/6
@ -23,8 +23,8 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
-    apt-get install -y --allow-downgrades \
+    apt-get install -y --allow-downgrades patchelf \
-    git python-pip python-dev openssh-server bison \
+    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
    pip install opencv-python
 #For docstring checker
-RUN pip install pylint pytest astroid isort
+RUN pip install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
--- a/README.md
+++ b/README.md
@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@ -19,6 +18,22 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 ### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
 pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
 pip install paddlepaddle-gpu==0.14.0.post87
 # Linux GPU cuda8cudnn5
 pip install paddlepaddle-gpu==0.14.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
 ## Features
 - **Flexibility**
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
 fluid/models/*.pyc
 fluid/logs
 fluid/nohup.out
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@ -0,0 +1,31 @@
 FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 # Use UBUNTU_MIRROR can speed up apt-get speed.
 # ARG UBUNTU_MIRROR
 # RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
 RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
 # IMPORTANT:
 # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
 # exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
 RUN pip install -U pip
 RUN pip install -U kubernetes paddlepaddle
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
 RUN pip uninstall -y paddlepaddle && mkdir /workspace
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
 RUN chmod +x /usr/bin/paddle_k8s
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl 
 ENV LD_LIBRARY_PATH=/usr/local/lib
 ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
 ADD models/ /workspace/models/
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@ -28,10 +28,14 @@ Currently supported `--model` argument include:
    ```
    You can choose to use GPU/CPU training. With GPU training, you can specify
    `--gpus <gpu_num>` to run multi GPU training.
    You can set async mode parameter server. With async mode, you can specify
    `--async_mode` to train model asynchronous.
 * Run distributed training with parameter servers:
    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
    * start parameter servers:
        ```bash
        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
        sleep 15
        ```
    * start trainers:
        ```bash
@ -42,13 +46,37 @@ Currently supported `--model` argument include:
    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
    ```
 ## Prepare the RecordIO file to Achieve Better Performance
 Run the following command will generate RecordIO files like "mnist.recordio" under the path
 and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
 at any time using `fluid.batch`.
 ```bash
 python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
 ```
 ## Run Distributed Benchmark on Kubernetes Cluster
 You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
 have to start all those processes mannually on each node, which is not recommended.
 To build the Docker image, you need to choose a paddle "whl" package to run with, you may either
 download it from
 http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
 build it by your own. Once you've got the "whl" package, put it under the current directory and run:
 ```bash
 docker build -t [your docker image name]:[your docker image tag] .
 ```
 Then push the image to a Docker registry that your Kubernetes cluster can reach.
 We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
 distributed benchmark jobs to your cluster. To generate a job yaml, just run:
 ```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
 ```
 Then the yaml files are generated under directory `myjob`, you can run:
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@ -0,0 +1,134 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 __all__ = ['parse_args', ]
 BENCHMARK_MODELS = [
    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
 ]
 def parse_args():
    parser = argparse.ArgumentParser('Fluid model benchmarks.')
    parser.add_argument(
        '--model',
        type=str,
        choices=BENCHMARK_MODELS,
        default='resnet',
        help='The model to run benchmark with.')
    parser.add_argument(
        '--batch_size', type=int, default=32, help='The minibatch size.')
    #  args related to learning rate
    parser.add_argument(
        '--learning_rate', type=float, default=0.001, help='The learning rate.')
    # TODO(wuyi): add "--use_fake_data" option back.
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations', type=int, default=80, help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=100, help='The number of passes.')
    parser.add_argument(
        '--data_format',
        type=str,
        default='NCHW',
        choices=['NCHW', 'NHWC'],
        help='The data data_format, now only support NCHW.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--gpus',
        type=int,
        default=1,
        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
    # this option is available only for vgg and resnet.
    parser.add_argument(
        '--cpus',
        type=int,
        default=1,
        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
    parser.add_argument(
        '--data_set',
        type=str,
        default='flowers',
        choices=['cifar10', 'flowers'],
        help='Optional dataset for benchmark.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    parser.add_argument(
        '--no_test',
        action='store_true',
        help='If set, do not test the testset during training.')
    parser.add_argument(
        '--memory_optimize',
        action='store_true',
        help='If set, optimize runtime memory before start.')
    parser.add_argument(
        '--use_fake_data',
        action='store_true',
        help='If set ommit the actual read data operators.')
    parser.add_argument(
        '--profile', action='store_true', help='If set, profile a few steps.')
    parser.add_argument(
        '--update_method',
        type=str,
        default='local',
        choices=['local', 'pserver', 'nccl2'],
        help='Choose parameter update method, can be local, pserver, nccl2.')
    parser.add_argument(
        '--no_split_var',
        action='store_true',
        default=False,
        help='Whether split variables into blocks when update_method is pserver')
    parser.add_argument(
        '--async_mode',
        action='store_true',
        default=False,
        help='Whether start pserver in async mode to support ASGD')
    parser.add_argument(
        '--use_reader_op',
        action='store_true',
        help='Whether to use reader op, and must specify the data path if set this to true.'
    )
    parser.add_argument(
        '--data_path',
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
    parser.add_argument(
        '--use_inference_transpiler',
        action='store_true',
        help='If set, use inference transpiler to optimize the program.')
    parser.add_argument(
        '--no_random',
        action='store_true',
        help='If set, keep the random seed and do not shuffle the data.')
    args = parser.parse_args()
    return args
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@ -17,6 +17,7 @@ import copy
 import argparse
 import random
 import os
 import copy
 from kube_templates import pserver, trainer, envs
@ -108,10 +109,9 @@ def gen_job():
    tn_container["ports"][0]["containerPort"] = spreadport
    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
-    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
-    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
    envs.append({"name": "ENTRY", "value": args.entry})
    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
    # NOTE: these directories below are cluster specific, please modify
    # this settings before you run on your own cluster.
@ -166,17 +166,23 @@ def gen_job():
    tn["spec"]["template"]["spec"]["volumes"] = volumes
    tn_container["volumeMounts"] = volumeMounts
-    ps_container["env"] = envs
+    ps_container["env"] = copy.deepcopy(envs)
-    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    ps_container["env"].append({
        "name": "PADDLE_TRAINING_ROLE",
        "value": "PSERVER"
    })
    tn_container["env"] = envs
    if args.disttype == "pserver":
        tn_container["env"].append({
-            "name": "TRAINING_ROLE",
+            "name": "PADDLE_TRAINING_ROLE",
            "value": "TRAINER"
        })
    elif args.disttype == "nccl2" or args.disttype == "local":
        # NCCL2 have no training role, set to plain WORKER
-        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+        tn_container["env"].append({
            "name": "PADDLE_TRAINING_ROLE",
            "value": "WORKER"
        })
    os.mkdir(args.jobname)
    if args.disttype == "pserver":
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
        return avg_cost, feeding_list
 def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    lod_t = core.LoDTensor()
    lod_t.set(flattened_data, place)
    lod_t.set_lod([lod])
    return lod_t, lod[-1]
 def lodtensor_to_ndarray(lod_tensor):
    dims = lod_tensor.get_dims()
    ndarray = np.zeros(shape=dims).astype('float32')
@ -197,6 +182,8 @@ def lodtensor_to_ndarray(lod_tensor):
 def get_model(args):
    if args.use_reader_op:
        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
    encoder_size = 512
    decoder_size = 512
@ -221,7 +208,7 @@ def get_model(args):
    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@ -20,6 +20,7 @@ import numpy as np
 import argparse
 import time
 import cProfile
 import os
 import paddle
 import paddle.fluid as fluid
@ -65,19 +66,49 @@ def cnn_model(data):
 def get_model(args):
-    # Input data
+    if args.use_reader_op:
        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1, 1, 28, 28], (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        images, label = fluid.layers.read_file(data_file)
    else:
        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    if args.device == 'CPU' and args.cpus > 1:
        places = fluid.layers.get_places(args.cpus)
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            predict = cnn_model(pd.read_input(images))
            label = pd.read_input(label)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            batch_acc = fluid.layers.accuracy(input=predict, label=label)
            pd.write_output(avg_cost)
            pd.write_output(batch_acc)
        avg_cost, batch_acc = pd()
        avg_cost = fluid.layers.mean(avg_cost)
        batch_acc = fluid.layers.mean(batch_acc)
    else:
        # Train program
        predict = cnn_model(images)
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)
    # inference program
    inference_program = fluid.default_main_program().clone()
@ -88,7 +119,7 @@ def get_model(args):
    # Reader
    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@ -19,6 +19,7 @@ from __future__ import print_function
 import functools
 import numpy as np
 import time
 import os
 import cProfile, pstats, StringIO
@ -26,6 +27,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
 from recordio_converter import imagenet_train, imagenet_test
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@ -122,40 +124,85 @@ def get_model(args):
        else:
            dshape = [32, 32, 3]
        model = resnet_cifar10
-    else:
+        train_reader = paddle.dataset.cifar.train10()
        test_reader = paddle.dataset.cifar.test10()
    elif args.data_set == "flowers":
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
-
+        train_reader = paddle.dataset.flowers.train()
        test_reader = paddle.dataset.flowers.test()
    elif args.data_set == "imagenet":
        class_dim = 1000
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]
        model = resnet_imagenet
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
        train_reader = imagenet_train(args.data_path)
        test_reader = imagenet_test(args.data_path)
    if args.use_reader_op:
        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1] + dshape, (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        input, label = fluid.layers.read_file(data_file)
    else:
        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    predict = model(input, class_dim)
+
    if args.device == 'CPU' and args.cpus > 1:
        places = fluid.layers.get_places(args.cpus)
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            predict = model(pd.read_input(input), class_dim)
            label = pd.read_input(label)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            batch_acc = fluid.layers.accuracy(input=predict, label=label)
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            pd.write_output(avg_cost)
-    batch_acc = fluid.layers.accuracy(
+            pd.write_output(batch_acc)
-        input=predict, label=label, total=batch_size_tensor)
+
        avg_cost, batch_acc = pd()
        avg_cost = fluid.layers.mean(avg_cost)
        batch_acc = fluid.layers.mean(batch_acc)
    else:
        predict = model(input, class_dim)
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        batch_acc = fluid.layers.accuracy(input=predict, label=label)
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
+            target_vars=[batch_acc])
    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    train_reader = paddle.batch(
+    batched_train_reader = paddle.batch(
-        paddle.reader.shuffle(
+        train_reader if args.no_random else paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
+            train_reader, buf_size=5120),
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+        batch_size=args.batch_size * args.gpus,
-            buf_size=5120),
+        drop_last=True)
-        batch_size=args.batch_size)
+    batched_test_reader = paddle.batch(
-    test_reader = paddle.batch(
+        test_reader, batch_size=args.batch_size, drop_last=True)
-        paddle.dataset.cifar.test10()
+
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
-        batch_size=args.batch_size)
+                   batched_test_reader, batch_acc
    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@ -44,6 +44,9 @@ def crop_sentence(reader, crop_size):
 def get_model(args):
    if args.use_reader_op:
        raise Exception(
            "stacked_dynamic_lstm do not support reader op for now.")
    lstm_size = 512
    emb_dim = 512
    crop_size = 1500
@ -115,25 +118,10 @@ def get_model(args):
    train_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
        batch_size=args.batch_size)
    return loss, inference_program, adam, train_reader, test_reader, batch_acc
 def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
--- a/benchmark/fluid/models/vgg.py
+++ b/benchmark/fluid/models/vgg.py
@ -22,6 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
 import os
 def vgg16_bn_drop(input):
@ -65,8 +66,24 @@ def get_model(args):
        else:
            data_shape = [224, 224, 3]
-    # Input data
+    if args.use_reader_op:
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+        filelist = [
            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
        ]
        data_file = fluid.layers.open_files(
            filenames=filelist,
            shapes=[[-1] + data_shape, (-1, 1)],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            thread_num=args.gpus,
            pass_num=args.pass_num)
        data_file = fluid.layers.double_buffer(
            fluid.layers.batch(
                data_file, batch_size=args.batch_size))
        images, label = fluid.layers.read_file(data_file)
    else:
        images = fluid.layers.data(
            name='data', shape=data_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Train program
@ -95,7 +112,7 @@ def get_model(args):
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
-        batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
--- a/benchmark/fluid/recordio_converter.py
+++ b/benchmark/fluid/recordio_converter.py
@ -0,0 +1,164 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import random
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.dataset import mnist, cifar, flowers, image
 def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
                       shape_label):
    num_batches = 0
    with fluid.program_guard(fluid.Program(), fluid.Program()):
        reader = paddle.batch(py_reader(), batch_size=batch_size)
        feeder = fluid.DataFeeder(
            feed_list=[  # order is image and label
                fluid.layers.data(
                    name='image', shape=shape_data),
                fluid.layers.data(
                    name='label', shape=shape_label, dtype='int64'),
            ],
            place=fluid.CPUPlace())
        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
            outfilepath, reader, feeder)
    return num_batches
 def prepare_mnist(outpath, batch_size):
    outfilepath = os.path.join(outpath, "mnist.recordio")
    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
 def prepare_cifar10(outpath, batch_size):
    outfilepath = os.path.join(outpath, "cifar.recordio")
    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
 def prepare_flowers(outpath, batch_size):
    outfilepath = os.path.join(outpath, "flowers.recordio")
    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
                       [1])
 def default_mapper(sample):
    img, label = sample
    img = image.simple_transform(
        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
    return img.flatten().astype('float32'), label
 def imagenet_train(data_dir):
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "train.txt")) as fn:
        while 1:
            l = fn.readline()
            if not l:
                break
            img, lbl = l[:-1].split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)
    # shuffle all, this is slow
    random.shuffle(imgfilelist)
    def train_reader():
        for idx, imgfile in enumerate(imgfilelist):
            data = image.load_image(
                os.path.join(data_dir, "train", imgfile.lower()))
            label = [img2label[imgfile], ]
            yield [data, label]
    return paddle.reader.map_readers(default_mapper, train_reader)
 def imagenet_test(data_dir):
    contents = os.listdir(data_dir)
    if set(contents) != set(
        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
        raise Exception("Imagenet data contents error!")
    img2label = dict()
    imgfilelist = []
    with open(os.path.join(data_dir, "val.txt")) as fn:
        while 1:
            l = fn.readline()
            if not l:
                break
            img, lbl = l[:-1].split(" ")
            img2label[img] = int(lbl)
            imgfilelist.append(img)
    def test_reader():
        for idx, imgfile in enumerate(imgfilelist):
            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
            image_path = ".".join([base_path, "jpeg"])
            data = image.load_image(image_path)
            label = [img2label[imgfile], ]
            yield [data, label]
    return paddle.reader.map_readers(default_mapper, test_reader)
 # FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
 def convert_reader_to_recordio_files(
        filename,
        batch_per_file,
        reader_creator,
        feeder,
        compressor=core.RecordIOWriter.Compressor.Snappy,
        max_num_records=1000,
        feed_order=None):
    if feed_order is None:
        feed_order = feeder.feed_names
    f_name, f_ext = os.path.splitext(filename)
    assert (f_ext == ".recordio")
    lines = []
    f_idx = 0
    counter = 0
    for idx, batch in enumerate(reader_creator()):
        lines.append(batch)
        if idx >= batch_per_file and idx % batch_per_file == 0:
            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
            with fluid.recordio_writer.create_recordio_writer(
                    filename, compressor, max_num_records) as writer:
                for l in lines:
                    res = feeder.feed(l)
                    for each in feed_order:
                        writer.append_tensor(res[each])
                    writer.complete_append_tensor()
                    counter += 1
                lines = []
                f_idx += 1
            print("written file: ", filename)
    return counter
 def prepare_imagenet(inpath, outpath, batch_size):
    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=[
            fluid.layers.data(
                name="image", shape=[3, 224, 224]), fluid.layers.data(
                    name="label", shape=[1], dtype='int64')
        ],
        place=fluid.CPUPlace())
    outpath = os.path.join(outpath, "imagenet.recordio")
    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@ -2,6 +2,7 @@
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
 mkdir -p logs
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
      --format=csv \
      --filename=mem.log  \
      -l 1 &
 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=mnist \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=500 \
-               2>&1 | tee -a mnist_gpu_128.log
+               2>&1 | tee -a logs/mnist_gpu_128.log
 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=vgg16 \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_128.log
+               2>&1 | tee -a logs/vgg16_gpu_128.log
 # flowers gpu  128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=vgg16 \
               --device=GPU \
               --batch_size=32 \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_flowers_32.log
+               2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=resnet \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --model=resnet_cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_128.log
+               2>&1 | tee -a logs/resnet50_gpu_128.log
 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=resnet \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
               --model=resnet_imagenet \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_flowers_64.log
+               2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=stacked_dynamic_lstm \
               --device=GPU \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
-               --hidden_dim=512 \
+               2>&1 | tee -a logs/lstm_gpu_32.log
               --emb_dim=512 \
               --crop_size=1500 \
               2>&1 | tee -a lstm_gpu_32.log
 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
               --model=machine_translation \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
-               2>&1 | tee -a lstm_gpu_128.log
+               2>&1 | tee -a logs/lstm_gpu_128.log
--- a/benchmark/fluid/run_fluid_benchmark.sh
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@ -0,0 +1,9 @@
 #!/bin/bash
 PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
 sleep 15
 CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
 CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function train() {
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function clock_to_seconds() {
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function train() {
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@ -1,3 +1,5 @@
 #!/bin/bash
 set -e
 function clock_to_seconds() {
--- a/Show More
+++ b/Show More