rm space

7 years ago · 8cbb49ce63
parent e97645c959 be1f058f52
commit 8cbb49ce63
1503 changed files with 61894 additions and 15659 deletions
--- a/.gitignore
+++ b/.gitignore
@ -25,12 +25,3 @@ third_party/
 # clion workspace.
 cmake-build-*
 # generated while compiling
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/fluid/pybind/pybind.h
 python/paddle/version.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,3 +1,4 @@
 repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
    sha: v1.0.1
    hooks:
@ -25,6 +26,14 @@
        entry: bash ./.clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: local
    hooks:
    -   id: cpplint-cpp-source
        name: cpplint
        description: Check C++ code style using cpplint.py.
        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
    hooks:
--- a/.travis.yml
+++ b/.travis.yml
@ -12,51 +12,28 @@ services:
 os:
  - linux
 env:
-  - JOB=build_doc
+  - JOB=doc
  - JOB=check_style
  - JOB=build_android
 addons:
-  apt:
+  ssh_known_hosts: 13.229.163.131
    packages:
      - gcc-4.8
      - g++-4.8
      - git
      - build-essential
      - python
      - python-pip
      - python2.7-dev
      - python-wheel
      - libboost-dev
      - curl
      - swig
      - graphviz
      - clang-format-3.8
      - automake
      - libtool
      - ccache
  ssh_known_hosts: 52.76.173.135
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
    # 43min timeout
-    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
+    paddle/scripts/paddle_docker_build.sh ${JOB}
-    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
+    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
  - |
-    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
    # For document only
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
    export DOCS_DIR=`pwd`
    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc   
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
  email:
    on_success: change
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -2,12 +2,14 @@
 |---|---|
 | abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
 | baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
 | JiayiFeng | Jia-Yi Feng |
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
 | dzhwinter | Zhi-Hong Dong |
 | dragonwarrior | Long Wang |
 | dyning | Yuning Du |
 | emailweixu | Wei Xu |
 | gangliao | Gang Liao |
 | gongweibao | Wei-Bao Gong |
@ -16,6 +18,9 @@
 | hedaoyuan | Dao-Yuan He |
 | helinwang | He-Lin Wang |
 | jacquesqiao | Long-Fei Qiao |
 | jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |
@ -24,15 +29,20 @@
 | llxxxll | Yong-Feng Liu |
 | luotao01 | Tao Luo |
 | lzhao4ever | Liang Zhao |
 | mozga-intel | Mateusz Ozga |
 | NHZlX | Zhao-Long Xing |
 | Noplz | Yuan Gao |
 | pakchoi | Chuan-Jiang Song |
 | panyx0718 | Xin Pan |
 | pengli09 | Peng Li |
 | pkuyym | Ya-Ming Yang |
 | pzelazko-intel | Pawel Zelazko |
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -36,6 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
@ -52,8 +53,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
-# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
+option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         ON)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
@ -108,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
 endif()
 if (WITH_C_API)
-  set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
+  set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
 endif()
 if(MOBILE_INFERENCE)
@ -144,6 +144,9 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/grpc)
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
@ -176,8 +179,14 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
 endif(WITH_GPU)
 if(WITH_AMD_GPU)
    find_package(HIP)
    include(hip)
 endif(WITH_AMD_GPU)
 if(WITH_MKLML)
    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()
--- a/29
+++ b/29
@ -1,6 +1,8 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+# When you modify it, please be aware of cudnn-runtime version
 # and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
@ -21,7 +23,7 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
-    apt-get install -y \
+    apt-get install -y --allow-downgrades \
    git python-pip python-dev openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@ -30,7 +32,7 @@ RUN apt-get update && \
    automake locales clang-format swig doxygen cmake  \
    liblapack-dev liblapacke-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
    apt-get clean -y
 # Install Go and glide
@ -45,6 +47,17 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 # Install TensorRT
 # The following TensorRT.tar.gz is not the default official one; we made two minor changes:
 # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
 #    and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
 RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
    tar -xz -C /usr/local && \
    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr
 # git credential to skip password typing
 RUN git config --global credential.helper store
@ -53,10 +66,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version until jupyter fixes this issue.
-RUN pip install --upgrade pip && \
+
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
 RUN pip install --upgrade pip==9.0.3 && \
    pip install -U wheel && \
-    pip install -U docopt PyYAML sphinx && \
+    pip install -U docopt PyYAML sphinx==1.5.6 && \
-    pip install -U sphinx-rtd-theme==0.1.9 recommonmark
+    pip install sphinx-rtd-theme==0.1.9 recommonmark
 RUN pip install pre-commit 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
--- a/Dockerfile.android
+++ b/Dockerfile.android
@ -27,7 +27,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
    pip install -U 'protobuf==3.1.0' && \
    pip install -U wheel sphinx && \
    pip install pre-commit
--- a/README.md
+++ b/README.md
@ -75,19 +75,19 @@ We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/g
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
  You can run distributed training jobs on MPI clusters.
- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
   You can also run distributed training jobs on Kubernetes clusters.
- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
   We appreciate your contributions!
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@ -36,11 +36,41 @@
 - Trainer Count: 100
 - Metrics: mini-batch / sec
-| Batch Size | 32 | 64 | 128 | 256 |
+
-| -- | -- | -- | -- | -- |
+<table>
-| PaddlePaddle Fluid | - | - | - | - |
+<thead>
-| PaddlePaddle v2 | - | - | - | - |
+<tr>
-| TensorFlow | - | - | - | - |
+<th>Batch Size </th>
 <th> 32</th>
 <th>64</th>
 <th>128 </th>
 <th>256</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td>-</td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 </tbody>
 </table>
 ### Measure the Performance for Different PServer Count
@ -48,11 +78,41 @@
 - Batch Size: 64
 - Metrics: mini-batch / sec
-| PServer Count | 10 | 20 | 40 | 60 |
+
-| -- | -- | -- | -- | -- |
+<table>
-| PaddlePaddle Fluid | - | - | - | - |
+<thead>
-| PaddlePaddle v2 | - | - | - | - |
+<tr>
-| TensorFlow | - | - | - | - |
+<th>PServer Count  </th>
 <th>10</th>
 <th>20</th>
 <th>40 </th>
 <th>60</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td>-</td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 </tr>
 </tbody>
 </table>
 ### Measure Parallel Efficiency By Increasing Trainer Count
@ -67,11 +127,69 @@ The parallel efficiency is:
 $E = \div(S, N)$
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
+<table>
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
+<tr>
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
+<th>Trainer Counter  </th>
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+<th>1</th>
 <th>10</th>
 <th>20 </th>
 <th>30</th>
 <th>40</th>
 <th>50</th>
 <th>60 </th>
 <th>70</th>
 <th>80</th>
 <th>90</th>
 <th>100 </th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td>-  </td>
 <td>- </td>
 <td>-  </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 <td>- </td>
 <td>-</td>
 <td>- </td>
 <td>- </td>
 </tr>
 </tbody>
 </table>
 ## Reproduce the benchmark
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+<tr>
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+<th>Batch Size </th>
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+<th> 32</th>
 <th>64</th>
 <th>128 </th>
 <th>256</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td> 15.44 </td>
 <td> 16.32 </td>
 <td> 16.74 </td>
 <td> 16.79 </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td> 15.97 </td>
 <td> 17.04 </td>
 <td> 17.60 </td>
 <td> 17.83 </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> 9.09 </td>
 <td> 9.10 </td>
 <td> 9.24 </td>
 <td> 8.66 </td>
 </tr>
 </tbody>
 </table>
 ### Different Batch Size
@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Trainer Count: 20
 - Metrics: samples / sec
-| Batch Size | 32 | 64 | 128 | 256 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+<tr>
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+<th>Batch Size </th>
-| TensorFlow | - | - | - | - |
+<th> 32</th>
-
+<th>64</th>
 <th>128 </th>
 <th>256</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td> 190.20 </td>
 <td> 222.15 </td>
 <td> 247.40 </td>
 <td> 258.18 </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2  </td>
 <td> 170.96 </td>
 <td> 233.71 </td>
 <td> 256.14 </td>
 <td> 329.23 </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 </tr>
 </tbody>
 </table>
 ### Accelerate Rate
@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec
-| Trainer Count | 20 | 40 | 80 | 100 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+<tr>
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+<th>Trainer Count </th>
-| TensorFlow | - | - | - | - |
+<th>20</th>
 <th>40</th>
 <th>80</th>
 <th>100</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid</td>
 <td> 263.29 (78.64%) </td>
 <td> 518.80 (77.47%) </td>
 <td> 836.26 (62.44%) </td>
 <td> 1019.29 (60.89%) </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2 (need more tests)   </td>
 <td> 326.85 (92.85%) </td>
 <td> 534.58 (75.93%) </td>
 <td> 853.30 (60.60%) </td>
 <td> 1041.99 (59.20%) </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 </tr>
 </tbody>
 </table>
 ### Different Pserver Count
@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples/ sec
-| PServer Count | 3 | 6 |10 | 20 |
+<table>
-| -- | -- | -- | -- | -- |
+<thead>
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+<tr>
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+<th>PServer Count </th>
-| TensorFlow | - | - | - | - |
+<th>3</th>
 <th>6</th>
 <th>10</th>
 <th>20</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td> PaddlePaddle Fluid(should fix in next PR) </td>
 <td> 589.1 </td>
 <td> 592.6 </td>
 <td> 656.4 </td>
 <td> 655.8 </td>
 </tr>
 <tr>
 <td>PaddlePaddle v2 (need more tests)   </td>
 <td> 593.4 </td>
 <td> 791.3 </td>
 <td> 729.7 </td>
 <td> 821.7 </td>
 </tr>
 <tr>
 <td>TensorFlow </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 <td> - </td>
 </tr>
 </tbody>
 </table>
 *The performance gap between Fluid and v2 comes from the network interference.*
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ b/benchmark/cluster/vgg16/run_vgg_dist.sh
@ -0,0 +1,21 @@
 #!/bin/bash
 # Starts a local distributed run of the VGG16 benchmark script: one
 # parameter-server process followed by two trainer processes, all launched in
 # the background against 127.0.0.1:6174. The script does not wait for the
 # background processes before returning.
 # Update to point to the source file.
 VGG_SRC="vgg16_fluid.py"
 # Environment exported for the parameter-server process.
 export TRAINING_ROLE=PSERVER
 export TRAINERS=2
 export POD_IP=127.0.0.1
 export PADDLE_INIT_PORT=6174
 # Launch the parameter server in the background.
 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
 # Need to wait for the ps to start first.
 # NOTE(review): this is a fixed 10 second delay, not a readiness check.
 sleep 10
 echo "done start ps"
 # Environment exported for the trainer processes; only TRAINING_ROLE differs
 # from the values exported above.
 export TRAINING_ROLE=TRAINER
 export TRAINERS=2
 export POD_IP=127.0.0.1
 export PADDLE_INIT_PORT=6174
 # Two trainers, task_index 0 and 1, each restricted to one GPU
 # (CUDA_VISIBLE_DEVICES 4 and 5 respectively).
 CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
 CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@ -1,11 +1,11 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -18,12 +18,13 @@ import sys
 import time
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
+import paddle.fluid as fluid
-import paddle.v2.fluid.core as core
+import paddle.fluid.core as core
-import paddle.v2.fluid.profiler as profiler
+import paddle.fluid.profiler as profiler
 import argparse
 import functools
 import os
 from paddle.fluid import debuger
 def str2bool(v):
@ -79,6 +80,8 @@ parser.add_argument(
    type=str,
    default="",
    help="Comma-separated list of hostname:port pairs")
 parser.add_argument(
    "--profile", action='store_true', help="If set, profile a few steps.")
 # Flags for defining the tf.train.Server
 parser.add_argument(
@ -138,13 +141,14 @@ def main():
    avg_cost = fluid.layers.mean(x=cost)
    # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size)
    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(batch_acc)
        inference_program = fluid.io.get_inference_program(test_target)
    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
@ -157,56 +161,74 @@ def main():
    # test
    def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                    data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
+            outs = exe.run(inference_program,
-                    feed={"pixel": img_data,
+                           feed={"pixel": img_data,
-                          "label": y_data})
+                                 "label": y_data},
                           fetch_list=[batch_acc, batch_size])
            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-        return accuracy.eval(exe)
+        return test_pass_acc.eval()
    def train_loop(exe, trainer_prog):
        iters = 0
        ts = time.time()
        train_pass_acc = fluid.average.WeightedAverage()
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
-            with profiler.profiler("CPU", 'total') as prof:
+
            def run_step(batch_id, data):
                img_data = np.array(
                    map(lambda x: x[0].reshape(data_shape), data)).astype(
                        "float32")
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = y_data.reshape([-1, 1])
                loss, acc, b_size = exe.run(
                    trainer_prog,
                    feed={"pixel": img_data,
                          "label": y_data},
                    fetch_list=[avg_cost, batch_acc, batch_size])
                return loss, acc, b_size
            if args.profile and args.task_index == 0:
                # warmup.
                for batch_id, data in enumerate(train_reader()):
-                    ts = time.time()
+                    if batch_id > 5: break
-                    img_data = np.array(
+                    run_step(batch_id, data)
-                        map(lambda x: x[0].reshape(data_shape), data)).astype(
+                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
-                            "float32")
+                    for batch_id, data in enumerate(train_reader()):
-                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                        if batch_id > 5: break
-                    y_data = y_data.reshape([-1, 1])
+                        run_step(batch_id, data)
-
+
-                    loss, acc = exe.run(
+            for batch_id, data in enumerate(train_reader()):
-                        trainer_prog,
+                ts = time.time()
-                        feed={"pixel": img_data,
+                loss, acc, b_size = run_step(batch_id, data)
-                              "label": y_data},
+                iters += 1
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                num_samples += len(data)
-                    iters += 1
+                train_pass_acc.add(value=acc, weight=b_size)
-                    num_samples += len(data)
+                print(
-                    print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
-                        % (pass_id, iters, loss, acc,
+                                            len(data) / (time.time() - ts))
-                           len(data) / (time.time() - ts))
+                )  # The accuracy is the accumulation of batches, but not the current batch.
                    )  # The accuracy is the accumulation of batches, but not the current batch.
            pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)
-            print(
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                  "Train accuracy = %f, Test accuracy = %f\n" %
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
-                   pass_test_acc))
+                   pass_train_acc, pass_test_acc))
    if args.local:
        # Parameter initialization
@ -234,8 +256,6 @@ def main():
        t = fluid.DistributeTranspiler()
        t.transpile(
            optimize_ops,
            params_grads,
            trainer_id=args.task_index,
            pservers=args.ps_hosts,
            trainers=trainers)
@ -249,9 +269,7 @@ def main():
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            print("starting server side startup")
            exe.run(pserver_startup)
            print("starting parameter server...")
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            # Parameter initialization
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ b/benchmark/cluster/vgg16/vgg16_tf.py
@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server):
        return np.mean(test_accs)
    config = tf.ConfigProto(
-        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
        log_device_placement=True)
    config.gpu_options.allow_growth = True
    hooks = [tf.train.StopAtStepHook(last_step=1000000)]
    with tf.train.MonitoredTrainingSession(
-            master=server.target, is_chief=(args.task_index == 0),
+            master=server.target,
-            hooks=hooks) as sess:
+            is_chief=(args.task_index == 0),
            hooks=hooks,
            config=config) as sess:
        iters, num_samples, start_time = 0, 0, 0.0
        for pass_id in range(args.num_passes):
            # train
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@ -0,0 +1,224 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import numpy as np
 import argparse
 import time
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 SEED = 1
 DTYPE = "float32"
 # random seed must set before configuring the network.
 # fluid.default_startup_program().random_seed = SEED
 def parse_args():
    """Parse the command line of the mnist benchmark.

    Returns:
        argparse.Namespace with the fields batch_size, skip_batch_num,
        iterations, pass_num, device, infer_only, use_cprof, use_nvprof and
        with_test.
    """
    # (flag, add_argument keyword arguments) pairs. They are registered in
    # this order so the generated --help output lists them as before.
    option_specs = [
        ('--batch_size', dict(
            type=int, default=128, help='The minibatch size.')),
        ('--skip_batch_num', dict(
            type=int,
            default=5,
            help='The first num of minibatch num to skip, for better performance test'
        )),
        ('--iterations', dict(
            type=int, default=35, help='The number of minibatches.')),
        ('--pass_num', dict(
            type=int, default=5, help='The number of passes.')),
        ('--device', dict(
            type=str,
            default='GPU',
            choices=['CPU', 'GPU'],
            help='The device type.')),
        ('--infer_only', dict(
            action='store_true', help='If set, run forward only.')),
        ('--use_cprof', dict(
            action='store_true', help='If set, use cProfile.')),
        ('--use_nvprof', dict(
            action='store_true', help='If set, use nvprof for CUDA.')),
        ('--with_test', dict(
            action='store_true',
            help='If set, test the testset during training.')),
    ]
    cli = argparse.ArgumentParser("mnist model benchmark.")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
 def cnn_model(data):
    """Build the benchmark network: two conv+pool stages and a softmax FC layer.

    Args:
        data: input variable passed to the first conv+pool stage.

    Returns:
        The output of the final fully-connected layer (size 10, softmax).
    """
    num_classes = 10
    stage_one = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    stage_two = fluid.nets.simple_img_conv_pool(
        input=stage_one,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")
    # TODO(dzhwinter) : refine the initializer and random seed setting
    # Product of every dimension of the second stage's output shape except
    # the first one.
    flat_dim = reduce(lambda acc, dim: acc * dim, stage_two.shape[1:], 1)
    init_scale = (2.0 / (flat_dim**2 * num_classes))**0.5
    weight_init = fluid.initializer.NormalInitializer(
        loc=0.0, scale=init_scale)
    return fluid.layers.fc(
        input=stage_two,
        size=num_classes,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(initializer=weight_init))
 def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
    """Run `inference_program` over the MNIST test set and return its accuracy.

    Args:
        exe: executor used to run the inference program.
        batch_acc: variable fetched as the per-batch accuracy.
        batch_size_tensor: variable fetched as the per-batch weight.
        inference_program: program run on every test batch.

    Returns:
        The value of WeightedAverage.eval() over all test batches.

    Note:
        Reads the module-level `args` (assigned in the `__main__` block) for
        the batch size; it is not passed in as a parameter.
    """
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    test_pass_acc = fluid.average.WeightedAverage()
    for data in test_reader():
        # List comprehensions instead of map(): np.array(map(...)) only
        # builds the intended array on Python 2.
        img_data = np.array(
            [x[0].reshape([1, 28, 28]) for x in data]).astype(DTYPE)
        y_data = np.array([x[1] for x in data]).astype("int64")
        y_data = y_data.reshape([len(y_data), 1])
        acc, weight = exe.run(inference_program,
                              feed={"pixel": img_data,
                                    "label": y_data},
                              fetch_list=[batch_acc, batch_size_tensor])
        test_pass_acc.add(value=acc, weight=weight)
    # Evaluate once after the loop: the result used to be recomputed for every
    # batch and was left unbound when the reader produced no batches.
    return test_pass_acc.eval()
 def run_benchmark(model, args):
    """Build the MNIST training program around `model`, train and report speed.

    Args:
        model: callable taking the image input variable and returning the
            prediction variable (for example `cnn_model`).
        args: parsed command-line namespace. This function reads use_cprof,
            device, batch_size, pass_num, skip_batch_num, iterations and
            with_test.

    Note:
        The process calls exit(0) at the end of the first pass, so passes
        after the first are never executed.
    """
    if args.use_cprof:
        # cProfile is not imported at file level; import it locally so that
        # --use_cprof no longer fails with a NameError.
        import cProfile
        pr = cProfile.Profile()
        pr.enable()
    # Input data
    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Train program
    predict = model(images)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)
    # inference program (cloned before the optimizer is applied below)
    inference_program = fluid.default_main_program().clone()
    # Optimization
    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)
    fluid.memory_optimize(fluid.default_main_program())
    # Initialize executor
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(fluid.default_startup_program())
    # Reader
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    accuracy = fluid.metrics.Accuracy()
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            # Restart the clock and the sample counter once the warm-up
            # batches have been run, so they do not count toward throughput.
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            # List comprehensions instead of map(): np.array(map(...)) only
            # builds the intended array on Python 2.
            img_data = np.array(
                [x[0].reshape([1, 28, 28]) for x in data]).astype(DTYPE)
            y_data = np.array([x[1] for x in data]).astype("int64")
            y_data = y_data.reshape([len(y_data), 1])
            outs = exe.run(
                fluid.default_main_program(),
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
            )  # The accuracy is the accumulation of batches, but not the current batch.
            accuracy.update(value=outs[1], weight=outs[2])
            iters += 1
            num_samples += len(y_data)
            loss = np.array(outs[0])
            acc = np.array(outs[1])
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                  (pass_id, iters, loss, acc))
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
                                     inference_program)
        if args.use_cprof:
            # Stop the profiler and print what it collected before exiting;
            # previously the profile was started but never reported.
            pr.disable()
            pr.print_stats(sort='cumulative')
        exit(0)
 def print_arguments(args):
    """Normalize `use_nvprof` and print every argument as a `name: value` line.

    Args:
        args: argparse.Namespace; must have `use_nvprof` and `device`.

    Side effects:
        Overwrites `args.use_nvprof` in place so that it is only truthy when
        `args.device` is 'GPU', then prints the arguments sorted by name
        between a header line and a footer line.
    """
    # vars() returns the namespace's own dict, so this assignment updates args.
    settings = vars(args)
    settings['use_nvprof'] = (settings['use_nvprof'] and
                              settings['device'] == 'GPU')
    print('----------- mnist Configuration Arguments -----------')
    # items() instead of iteritems(): iteritems() exists only on Python 2,
    # while sorted(items()) yields the same pairs on Python 2 and 3.
    for arg, value in sorted(settings.items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    # `args` is kept as a module-level name because eval_test reads it.
    args = parse_args()
    print_arguments(args)
    if not (args.use_nvprof and args.device == 'GPU'):
        # Plain run, without the CUDA profiler.
        run_benchmark(cnn_model, args)
    else:
        # Run the benchmark inside the CUDA profiler context.
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            run_benchmark(cnn_model, args)
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@ -0,0 +1,105 @@
 #!/bin/bash
 # This script benchmarks PaddlePaddle Fluid on
 # a single thread and a single GPU.
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
 export MKL_NUM_THREADS=1
 export OMP_NUM_THREADS=1
 # Threads per core as reported by lscpu; 1 means hyper-threading is off.
 ht=$(lscpu | grep "per core" | awk -F':' '{print $2}' | xargs)
 # Quote and default the value so an empty lscpu result cannot break the
 # test expression; anything other than 1 is handled as "HT is ON".
 if [ "${ht:-0}" -eq 1 ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
        export OMP_DYNAMIC="FALSE"
    fi
 else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
        export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
 fi
 # disable multi-gpu if have more than one
 export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
 # only query the gpu used; samples are appended to mem.log once per second
 nohup stdbuf -oL nvidia-smi \
      --id=${CUDA_VISIBLE_DEVICES} \
      --query-gpu=timestamp \
      --query-compute-apps=pid,process_name,used_memory \
      --format=csv \
      --filename=mem.log  \
      -l 1 &
 # Stop the background monitor when this script exits; otherwise it keeps
 # running and writing to mem.log after the benchmarks have finished.
 gpu_monitor_pid=$!
 trap 'kill "$gpu_monitor_pid" 2>/dev/null' EXIT
 # The script paths below are relative, so run this from the directory that
 # contains the fluid/ folder. Each run appends its output to its own log.
 # mnist
 # mnist gpu mnist 128
 FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=500 \
               2>&1 | tee -a mnist_gpu_128.log
 # vgg16 (the model lives in fluid/vgg.py)
 # gpu cifar10 128
 FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a vgg16_gpu_128.log
 # flowers gpu 32
 FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
               --device=GPU \
               --batch_size=32 \
               --data_set=flowers \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a vgg16_gpu_flowers_32.log
 # resnet50 (the model lives in fluid/resnet.py)
 # resnet50 gpu cifar10 128
 FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
               --device=GPU \
               --batch_size=128 \
               --data_set=cifar10 \
               --model=resnet_cifar10 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_128.log
 # resnet50 gpu flowers 64
 FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
               --device=GPU \
               --batch_size=64 \
               --data_set=flowers \
               --model=resnet_imagenet \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a resnet50_gpu_flowers_64.log
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
 FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
               --device=GPU \
               --batch_size=32 \
               --skip_batch_num=5 \
               --iterations=30 \
               --hidden_dim=512 \
               --emb_dim=512 \
               --crop_size=1500 \
               2>&1 | tee -a lstm_gpu_32.log
 # seq2seq
 # seq2seq gpu wmb 128
 # NOTE(review): the seq2seq run logs to lstm_gpu_128.log; the name is kept
 # so existing tooling that reads it keeps working.
 FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
               --device=GPU \
               --batch_size=128 \
               --skip_batch_num=5 \
               --iterations=30 \
               2>&1 | tee -a lstm_gpu_128.log
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@ -0,0 +1,236 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import argparse
 import cPickle
 import os
 import random
 import time
 import numpy
 import paddle
 import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
 import paddle.batch as batch
 import paddle.fluid.profiler as profiler
 def parse_args():
    """Parse the command-line options of the dynamic-RNN sentiment benchmark.
    Returns the ``argparse.Namespace`` built from ``sys.argv``.  The default
    of ``--crop_size`` is taken from the ``CROP_SIZE`` environment variable
    and falls back to 1500 when that variable is not set.
    """
    cli = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
    # (flag, add_argument keywords), listed in the order shown by --help.
    option_table = [
        ('--batch_size', dict(
            type=int,
            default=32,
            help='The sequence number of a batch data. (default: %(default)d)')),
        ('--skip_batch_num', dict(
            type=int,
            default=5,
            help='The first num of minibatch num to skip, for better performance test')),
        ('--iterations', dict(
            type=int, default=80, help='The number of minibatches.')),
        ('--emb_dim', dict(
            type=int,
            default=512,
            help='Dimension of embedding table. (default: %(default)d)')),
        ('--hidden_dim', dict(
            type=int,
            default=512,
            help='Hidden size of lstm unit. (default: %(default)d)')),
        ('--pass_num', dict(
            type=int,
            default=100,
            help='Epoch number to train. (default: %(default)d)')),
        ('--device', dict(
            type=str,
            default='CPU',
            choices=['CPU', 'GPU'],
            help='The device type.')),
        ('--crop_size', dict(
            type=int,
            default=int(os.environ.get('CROP_SIZE', '1500')),
            help='The max sentence length of input. Since this model use plain RNN,'
            ' Gradient could be explored if sentence is too long')),
        ('--with_test', dict(
            action='store_true',
            help='If set, test the testset during training.')),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
 # Word dictionary returned by paddle.dataset.imdb, built once at import time.
 # It is indexed with '<unk>' and its length sizes the embedding table below.
 word_dict = imdb.word_dict()
 def crop_sentence(reader, crop_size):
    """Wrap ``reader`` so that it only yields sufficiently short samples.
    A sample is kept when the number of ids in ``item[0]`` that differ from
    the '<unk>' id is strictly below ``crop_size``.  Longer samples are
    dropped as a whole; nothing is truncated.
    """
    unknown_id = word_dict['<unk>']
    def filtered_reader():
        for sample in reader():
            known_words = sum(1 for token in sample[0] if token != unknown_id)
            if known_words < crop_size:
                yield sample
    return filtered_reader
 def main():
    """Train a hand-built LSTM sentiment classifier on IMDB and report speed.
    The network is embedding -> fc(tanh) -> a DynamicRNN block implementing
    the LSTM gates -> last-step pooling -> 2-way softmax, optimised with
    Adam.  Options come from ``parse_args()``: the timer and the sample
    counter are reset once ``--skip_batch_num`` mini-batches have run,
    training stops after ``--iterations`` mini-batches, and the process
    exits at the end of the first pass.
    """
    args = parse_args()
    lstm_size = args.hidden_dim
    data = fluid.layers.data(
        name="words", shape=[1], lod_level=1, dtype='int64')
    sentence = fluid.layers.embedding(
        input=data, size=[len(word_dict), args.emb_dim])
    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        word = rnn.step_input(sentence)
        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
        def gate_common(ipt, hidden, size):
            # Pre-activation of one gate: fc(input) + fc(previous hidden).
            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
            return fluid.layers.sums(input=[gate0, gate1])
        forget_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        input_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        output_gate = fluid.layers.sigmoid(
            x=gate_common(word, prev_hidden, lstm_size))
        cell_gate = fluid.layers.tanh(
            x=gate_common(word, prev_hidden, lstm_size))
        # cell = forget_gate * prev_cell + input_gate * cell_gate
        cell = fluid.layers.sums(input=[
            fluid.layers.elementwise_mul(
                x=forget_gate, y=prev_cell),
            fluid.layers.elementwise_mul(
                x=input_gate, y=cell_gate)
        ])
        hidden = fluid.layers.elementwise_mul(
            x=output_gate, y=fluid.layers.tanh(x=cell))
        rnn.update_memory(prev_cell, cell)
        rnn.update_memory(prev_hidden, hidden)
        rnn.output(hidden)
    last = fluid.layers.sequence_pool(rnn(), 'last')
    logit = fluid.layers.fc(input=last, size=2, act='softmax')
    # Declare the 'label' input once and share it between loss and accuracy.
    label_var = fluid.layers.data(name='label', shape=[1], dtype='int64')
    loss = fluid.layers.cross_entropy(input=logit, label=label_var)
    loss = fluid.layers.mean(x=loss)
    # add acc
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=logit, label=label_var, total=batch_size_tensor)
    # NOTE(review): the inference program is built but never run in this
    # script (--with_test is not consulted here).
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])
    adam = fluid.optimizer.Adam()
    adam.minimize(loss)
    fluid.memory_optimize(fluid.default_main_program())
    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    train_reader = batch(
        paddle.reader.shuffle(
            crop_sentence(imdb.train(word_dict), args.crop_size),
            buf_size=25000),
        batch_size=args.batch_size)
    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                # Warm-up is over: restart the clock and the sample counter.
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            tensor_words = to_lodtensor([x[0] for x in data], place)
            label = numpy.array([x[1] for x in data]).astype("int64")
            label = label.reshape((-1, 1))
            loss_np, acc, weight = exe.run(
                fluid.default_main_program(),
                feed={"words": tensor_words,
                      "label": label},
                fetch_list=[loss, batch_acc, batch_size_tensor])
            iters += 1
            # Counts the words of every sequence, not the number of
            # sequences, so the reported throughput is in words.
            num_samples += sum(len(x[0]) for x in data)
            # loss_np and acc are the values fetched for this mini-batch.
            print("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                  (pass_id, iters, loss_np, acc))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # The benchmark deliberately stops after the first pass.
        exit(0)
 def to_lodtensor(data, place):
    """Pack a list of variable-length sequences into one ``fluid.LoDTensor``.
    The sequences are concatenated along axis 0 into a single int64 column
    and the cumulative offsets ``[0, len(s0), len(s0) + len(s1), ...]`` are
    installed as the only LoD level.  ``place`` is passed through to
    ``LoDTensor.set``.
    """
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))
    column = numpy.concatenate(data, axis=0).astype("int64")
    column = column.reshape([len(column), 1])
    tensor = fluid.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor
 def print_arguments(args):
    """Print every parsed option as ``name: value``, sorted by name."""
    print('----------- lstm Configuration Arguments -----------')
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    # Echo the parsed options before training starts.
    # NOTE(review): main() calls parse_args() again on its own, so the
    # command line is parsed twice.
    args = parse_args()
    print_arguments(args)
    main()
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@ -0,0 +1,224 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """VGG16 benchmark in Fluid"""
 from __future__ import print_function
 import sys
 import time
 import numpy as np
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
 import functools
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    '--batch_size', type=int, default=128, help="Batch size for training.")
 parser.add_argument(
    '--skip_batch_num',
    type=int,
    default=5,
    help='The first num of minibatch num to skip, for better performance test')
 parser.add_argument(
    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
    '--learning_rate',
    type=float,
    default=1e-3,
    help="Learning rate for training.")
 parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
 parser.add_argument(
    '--device',
    type=str,
    default='GPU',
    choices=['CPU', 'GPU'],
    help="The device type.")
 parser.add_argument(
    '--data_format',
    type=str,
    default='NCHW',
    choices=['NCHW', 'NHWC'],
    help='The data order, now only support NCHW.')
 parser.add_argument(
    '--data_set',
    type=str,
    default='cifar10',
    choices=['cifar10', 'flowers'],
    help='Optional dataset for benchmark.')
 parser.add_argument(
    '--with_test',
    action='store_true',
    help='If set, test the testset during training.')
 args = parser.parse_args()
 def vgg16_bn_drop(input):
    """Stack five conv groups and two 512-unit fc layers on top of ``input``.
    Every conv group uses 3x3 relu convolutions with batch norm and per-layer
    dropout followed by a 2x2 max pool with stride 2.  The output of the
    second fc layer (no activation) is returned.
    """
    def conv_block(features, num_filter, groups, dropouts):
        return fluid.nets.img_conv_group(
            input=features,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type='max')
    # (filters, conv layers, dropout rate per conv layer) of the five groups.
    block_specs = [
        (64, 2, [0.3, 0]),
        (128, 2, [0.4, 0]),
        (256, 3, [0.4, 0.4, 0]),
        (512, 3, [0.4, 0.4, 0]),
        (512, 3, [0.4, 0.4, 0]),
    ]
    features = input
    for num_filter, groups, dropouts in block_specs:
        features = conv_block(features, num_filter, groups, dropouts)
    dropped = fluid.layers.dropout(x=features, dropout_prob=0.5)
    hidden = fluid.layers.fc(input=dropped, size=512, act=None)
    normed = fluid.layers.batch_norm(input=hidden, act='relu')
    dropped = fluid.layers.dropout(x=normed, dropout_prob=0.5)
    return fluid.layers.fc(input=dropped, size=512, act=None)
 def main():
    """Train VGG16 on cifar10 or flowers for a bounded number of mini-batches
    and print per-batch loss/accuracy, per-pass means and the throughput.
    All options are read from the module-level ``args``.  The timer and the
    sample counter are reset once ``--skip_batch_num`` mini-batches have
    run, training stops after ``--iterations`` mini-batches, and the process
    exits at the end of the first pass (after the optional test run).
    """
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]
    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)
    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])
    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    optimizer.minimize(avg_cost)
    fluid.memory_optimize(fluid.default_main_program())
    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(fluid.default_startup_program())
    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)
    def to_feed(data):
        # Turn one reader batch into the feed dict of the program; list
        # comprehensions keep this working where map() returns an iterator.
        img_data = np.array(
            [x[0].reshape(data_shape) for x in data]).astype("float32")
        y_data = np.array([x[1] for x in data]).astype("int64")
        y_data = y_data.reshape([-1, 1])
        return {"pixel": img_data, "label": y_data}
    # test
    def test(exe):
        # Weighted average of the batch accuracy over the whole test set.
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            acc, weight = exe.run(inference_program,
                                  feed=to_feed(data),
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()
    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                # Warm-up is over: restart the clock and the sample counter.
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            feed = to_feed(data)
            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                feed=feed,
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
            num_samples += len(feed["label"])
            # Record every mini-batch so the per-pass means below cover the
            # whole pass instead of only its last batch.
            train_losses.append(loss)
            train_accs.append(acc)
            # loss and acc are the values fetched for this mini-batch.
            print("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                  (pass_id, iters, loss, acc))
        # pass_train_acc = accuracy.eval()
        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        # The benchmark deliberately stops after the first pass.
        exit(0)
 def print_arguments():
    """Print every option of the module-level ``args`` as ``name: value``,
    sorted by name."""
    print('----------- vgg Configuration Arguments -----------')
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == "__main__":
    # Entry point: echo the module-level options, then run the benchmark.
    print_arguments()
    main()
--- a/benchmark/paddle/image/check_env.sh
+++ b/benchmark/paddle/image/check_env.sh
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@ -0,0 +1,180 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import argparse
 import time
 import numpy as np
 import tensorflow as tf
 import paddle.v2 as paddle
 DTYPE = tf.float32
 def parse_args():
    """Parse the command-line options of the TensorFlow mnist benchmark and
    return them as an ``argparse.Namespace``."""
    cli = argparse.ArgumentParser("mnist model benchmark.")
    # Integer options share one shape: (flag, default, help text).
    int_options = (
        ('--batch_size', 128, 'The minibatch size.'),
        ('--iterations', 35, 'The number of minibatches.'),
        ('--pass_num', 5, 'The number of passes.'),
    )
    for flag, default, description in int_options:
        cli.add_argument(flag, type=int, default=default, help=description)
    cli.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    return cli.parse_args()
 def run_benchmark(args):
    """Train a small CNN on mnist with TensorFlow and print timings.
    The graph has two conv/relu/max-pool stages followed by one fully
    connected layer and a softmax, optimised with Adam.  For every pass the
    per-batch loss, error and elapsed time are printed, then the streaming
    training accuracy, the test accuracy and the pass time.  Elapsed times
    are reported in seconds.
    NOTE(review): ``args.iterations`` is not consulted here; every pass
    runs over the whole training reader.
    """
    def weight_variable(dtype, shape):
        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
        return tf.Variable(initial)
    def bias_variable(dtype, shape):
        initial = tf.constant(0.1, shape=shape, dtype=dtype)
        return tf.Variable(initial)
    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
    with tf.device(device):
        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
        labels = tf.placeholder(tf.int64, shape=(None, ))
        # conv1, relu, pool1
        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
        conv1_bias = bias_variable(DTYPE, [20])
        conv1 = tf.nn.conv2d(
            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
        pool1 = tf.nn.max_pool(
            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
        # conv2, relu, pool2
        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
        conv2_bias = bias_variable(DTYPE, [50])
        conv2 = tf.nn.conv2d(
            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
        pool2 = tf.nn.max_pool(
            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
        # FC
        pool_shape = pool2.get_shape().as_list()
        # Product of all non-batch dimensions; np.prod replaces the bare
        # reduce(), which is not a builtin on Python 3.
        hidden_dim = int(np.prod(pool_shape[1:]))
        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
        fc_bias = bias_variable(DTYPE, [10])
        logits = tf.matmul(reshape, fc_weights) + fc_bias
        # Get prediction
        prediction = tf.nn.softmax(logits)
        # Loss
        one_hot_labels = tf.one_hot(labels, depth=10)
        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
        avg_cost = tf.reduce_mean(cost)
        # Get accuracy
        correct = tf.equal(tf.argmax(prediction, 1), labels)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        # metrics, g_accuracy
        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
            g_accuracy = tf.metrics.accuracy(
                labels, tf.argmax(
                    prediction, axis=1))
            # Local variables of the streaming metric; re-initialising them
            # resets the accumulated accuracy.
            metric_vars = tf.contrib.framework.get_variables(
                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
            g_accuracy_reset_op = tf.variables_initializer(metric_vars)
        # Optimizer
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001, beta1=0.9, beta2=0.999)
        train_op = opt.minimize(avg_cost)
        # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)
    def make_feed(data):
        # Reshape every sample to 28x28x1 and collect the labels; list
        # comprehensions keep this working where map() returns an iterator.
        images_data = np.array([
            np.transpose(x[0].reshape([1, 28, 28]), axes=[1, 2, 0])
            for x in data
        ]).astype("float32")
        labels_data = np.array([x[1] for x in data]).astype("int64")
        return {images: images_data, labels: labels_data}
    def eval_test():
        # Reset the streaming accuracy, run the whole test set through the
        # graph and return the accumulated accuracy.
        sess.run(g_accuracy_reset_op)
        for batch_id, data in enumerate(test_reader()):
            loss, acc, g_acc = sess.run(
                [avg_cost, accuracy, g_accuracy], feed_dict=make_feed(data))
        return g_acc[1]
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_g)
        sess.run(init_l)
        for pass_id in range(args.pass_num):
            sess.run(g_accuracy_reset_op)
            pass_start = time.time()
            for batch_id, data in enumerate(train_reader()):
                feed = make_feed(data)
                # Only the session call is timed, not the data conversion.
                start = time.time()
                _, loss, acc, g_acc = sess.run(
                    [train_op, avg_cost, accuracy, g_accuracy],
                    feed_dict=feed)
                end = time.time()
                # time.time() differences are already in seconds.
                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                      (pass_id, batch_id, loss, 1 - acc, end - start))
            pass_end = time.time()
            test_avg_acc = eval_test()
            print(
                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc, pass_end - pass_start))
 def print_arguments(args):
    """Print every parsed option as ``name: value``, sorted by name."""
    print('-----------  Configuration Arguments -----------')
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 if __name__ == '__main__':
    # Entry point: parse the options, echo them, then run the benchmark.
    args = parse_args()
    print_arguments(args)
    run_benchmark(args)
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@ -0,0 +1,220 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import numpy as np
 import argparse
 import time
 import tensorflow as tf
 import paddle.v2 as paddle
 def parse_args():
    """Build the benchmark's command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``batch_size``,
        ``stacked_num``, ``embedding_dim``, ``hidden_dim``, ``pass_num``,
        ``learning_rate`` and ``infer_only``.
    """
    cli = argparse.ArgumentParser("LSTM model benchmark.")

    # (flag, value type, default, help text) for every option taking a value.
    valued_options = (
        ('--batch_size', int, 32,
         'The sequence number of a batch data. (default: %(default)d)'),
        ('--stacked_num', int, 5,
         'Number of lstm layers to stack. (default: %(default)d)'),
        ('--embedding_dim', int, 512,
         'Dimension of embedding table. (default: %(default)d)'),
        ('--hidden_dim', int, 512,
         'Hidden size of lstm unit. (default: %(default)d)'),
        ('--pass_num', int, 10,
         'Epoch number to train. (default: %(default)d)'),
        ('--learning_rate', float, 0.0002,
         'Learning rate used to train. (default: %(default)f)'),
    )
    for flag, value_type, default_value, help_text in valued_options:
        cli.add_argument(
            flag, type=value_type, default=default_value, help=help_text)

    # Boolean switch: False unless the flag is present.
    cli.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    return cli.parse_args()
 def print_arguments(args):
    """Print every parsed option as a ``name: value`` line, sorted by name.

    Args:
        args: Namespace-like object (for example the value returned by
            ``parse_args``) whose attributes are readable through ``vars()``.
    """
    print('-----------  Configuration Arguments -----------')
    # items() exists on both Python 2 and Python 3 dicts, whereas iteritems()
    # is Python 2 only; sorted() yields the same ordering either way.
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')
 def dynamic_lstm_model(dict_size,
                        embedding_dim,
                        hidden_dim,
                        stacked_num,
                        class_num=2,
                        is_train=True):
    """Build the stacked dynamic-LSTM classification graph.

    Args:
        dict_size: Number of rows of the word-embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Number of units of every LSTM layer.
        stacked_num: Number of LSTM layers in the MultiRNNCell.
        class_num: Number of output classes (width of the logits).
        is_train: When False, only the inference graph is built.

    Returns:
        When ``is_train`` is False:
            ``((word_idx, sequence_length), softmax_prediction)``.
        Otherwise:
            ``((word_idx, sequence_length, label), avg_loss, acc, g_acc,
            reset_op)`` where ``g_acc`` is the pair returned by
            ``tf.metrics.accuracy`` and ``reset_op`` re-initializes the local
            variables created under the metric's variable scope.
    """
    # Feeds: a 2-D int64 matrix of word ids and a 1-D int64 vector of lengths.
    word_idx = tf.placeholder(tf.int64, shape=[None, None])
    sequence_length = tf.placeholder(tf.int64, shape=[None, ])

    embedding_weights = tf.get_variable('word_embeddings',
                                        [dict_size, embedding_dim])
    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)

    # Create one cell object per layer. Repeating a single cell instance
    # ([cell] * n) makes every layer share the same weights and breaks as soon
    # as embedding_dim differs from hidden_dim.
    lstm_cells = [
        tf.nn.rnn_cell.LSTMCell(
            num_units=hidden_dim, use_peepholes=False)
        for _ in range(stacked_num)
    ]
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)

    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
    _, final_state = tf.nn.dynamic_rnn(
        cell=stacked_cell,
        inputs=embedding,
        dtype=tf.float32,
        sequence_length=sequence_length)

    w = tf.Variable(
        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
    bias = tf.Variable(
        tf.constant(
            value=0.0, shape=[class_num], dtype=tf.float32))
    # Classify from the h output of the last layer's final state.
    prediction = tf.matmul(final_state[-1][1], w) + bias

    if not is_train:
        return (word_idx, sequence_length), tf.nn.softmax(prediction)

    label = tf.placeholder(tf.int64, shape=[None, ])
    # The one-hot depth must follow class_num so that it always matches the
    # width of the logits (it was previously hard-coded to 2).
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(label, class_num), logits=prediction)
    avg_loss = tf.reduce_mean(loss)

    # Per-batch accuracy.
    correct_count = tf.equal(tf.argmax(prediction, 1), label)
    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))

    # Accumulated accuracy; its local variables live in a dedicated scope so
    # that reset_op can re-initialize exactly those variables.
    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
        metric_vars = tf.contrib.framework.get_variables(
            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
        reset_op = tf.variables_initializer(metric_vars)

    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
 def padding_data(data, padding_size, value):
    """Return a copy of ``data`` that is exactly ``padding_size`` items long.

    Args:
        data: List to pad or truncate; it is not modified.
        padding_size: Length of the returned list.
        value: Filler appended when ``data`` is shorter than ``padding_size``.

    Returns:
        A new list: ``data`` followed by as many ``value`` items as needed,
        cut to ``padding_size`` elements.
    """
    filler = [value] * padding_size
    return (data + filler)[:padding_size]
 def train(args):
    """Train the stacked LSTM on the IMDB dataset and print per-pass metrics.

    For every pass the function trains on all training batches, printing the
    loss and accuracies of each batch, then evaluates on the test reader and
    prints the test accuracy together with the training throughput.

    Args:
        args: Parsed options; reads ``embedding_dim``, ``hidden_dim``,
            ``stacked_num``, ``learning_rate``, ``batch_size`` and
            ``pass_num``.
    """
    word_dict = paddle.dataset.imdb.word_dict()
    dict_size = len(word_dict)

    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)

    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    train_op = adam_optimizer.minimize(avg_loss)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=25000),
        batch_size=args.batch_size)

    test_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.test(word_dict), buf_size=25000),
        batch_size=args.batch_size)

    def make_feed(data):
        """Convert one reader batch into (feed_dict, number of real words)."""
        # Build real lists: the sequences are traversed twice below, which a
        # lazy Python 3 map object would not survive.
        sequences = [sample[0] for sample in data]
        sequence_length = np.array(
            [len(seq) for seq in sequences]).astype('int64')
        # Pad every sequence with 0 up to the longest one in this batch.
        maxlen = np.max(sequence_length)
        word_idx = np.array(
            [padding_data(seq, maxlen, 0) for seq in sequences]).astype('int64')
        label = np.array([sample[1] for sample in data]).astype('int64')
        feed = {
            feeding_list[0]: word_idx,
            feeding_list[1]: sequence_length,
            feeding_list[2]: label
        }
        return feed, np.sum(sequence_length)

    def do_validation(sess):
        """Return the accuracy accumulated over the whole test reader."""
        sess.run(reset_op)
        # Reported as NaN when the test reader yields no batch at all.
        test_acc = float('nan')
        for data in test_reader():
            feed, _ = make_feed(data)
            # Evaluate the metric only: fetching train_op here would update
            # the weights on test data.
            fetch_g_acc = sess.run(g_acc, feed_dict=feed)
            test_acc = fetch_g_acc[1]
        return test_acc

    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_l)
        sess.run(init_g)

        # range() works on Python 2 and 3; xrange() is Python 2 only.
        for pass_id in range(args.pass_num):
            # Clear the accumulated-accuracy local variables.
            sess.run(reset_op)
            pass_start_time = time.time()
            words_seen = 0

            for batch_id, data in enumerate(train_reader()):
                feed, batch_words = make_feed(data)
                words_seen += batch_words
                _, loss, fetch_acc, fetch_g_acc = sess.run(
                    [train_op, avg_loss, acc, g_acc], feed_dict=feed)
                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))

            # Throughput covers the training batches only; validation runs
            # after the clock is stopped.
            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            words_per_sec = words_seen / time_consumed

            test_acc = do_validation(sess)
            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
                  (pass_id, test_acc, words_per_sec, time_consumed))
 if __name__ == '__main__':
    # Parse and echo the configuration before doing any work.
    args = parse_args()
    print_arguments(args)
    # Training is the only implemented mode: with --infer_only the script
    # ends here without running anything.
    if not args.infer_only:
        train(args)
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
--- a/Show More
+++ b/Show More