Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-4029

8 years ago · 7be390aa83
parent 9367fa1229 502d7daff7
commit 7be390aa83
994 changed files with 65541 additions and 12192 deletions
--- a/.gitignore
+++ b/.gitignore
@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
+python/paddle/v2/framework/tests/tmp/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -31,6 +31,3 @@
    -   id: go-fmt
        types:
        - go
-    -   id: gometalinter
-        types:
-        - go
--- a/.travis.yml
+++ b/.travis.yml
@ -30,6 +30,7 @@ addons:
      - automake
      - libtool
      - ccache
+  ssh_known_hosts: 52.76.173.135
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
@ -42,6 +43,14 @@ script:
  - |
    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+  - |
+    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
+    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
+    export DOCS_DIR=`pwd`
+    cd ..
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc   
 notifications:
  email:
    on_success: change
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -86,6 +86,14 @@ if(ANDROID OR IOS)
        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKLML OFF CACHE STRING
        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+
+    # Compile PaddlePaddle mobile inference library
+    if (NOT WITH_C_API)
+        set(WITH_C_API ON CACHE STRING
+            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
+    endif()
+    set(MOBILE_INFERENCE ON)
+    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()

 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@ -97,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON)
    "different Python interpreter from compiling.")
 endif()

+if(MOBILE_INFERENCE)
+    set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
+else()
+    set(THIRD_PARTY_BUILD_TYPE Release)
+endif()
+
 ########################################################################################

 include(external/mklml)     # download mklml package
@ -112,7 +126,8 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
-include(external/pybind11)    # download pybind11
+include(external/pybind11)  # download pybind11
+include(external/nccl)

 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@ -145,7 +160,7 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
    endif(NOT WITH_DSO)
 endif(WITH_GPU)

@ -160,9 +175,11 @@ endif(USE_NNPACK)

 add_subdirectory(proto)

-# "add_subdirectory(go)" should be placed after the following loine,
-# because it depends on paddle/optimizer.
-add_subdirectory(paddle/optimizer)
+if(NOT MOBILE_INFERENCE)
+    # "add_subdirectory(go)" should be placed after the following line,
+    # because it depends on paddle/optimizer.
+    add_subdirectory(paddle/optimizer)
+endif()

 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1 +1,157 @@
-./doc/howto/dev/contribute_to_paddle_en.md
+# Contribute Code
+
+We sincerely appreciate your contribution.  This document explains our workflow and work style.
+
+## Workflow
+
+PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps guide usual contributions.
+
+1. Fork
+
+   Our development community has been growing fast; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
+
+1. Clone
+
+   To make a copy of your fork on your local computer, please run
+
+   ```bash
+   git clone https://github.com/your-github-account/paddle
+   cd paddle
+   ```
+
+1. Create the local feature branch
+
+   For daily work such as adding a new feature or fixing a bug, please open your feature branch before coding:
+
+   ```bash
+   git checkout -b my-cool-stuff
+   ```
+
+1. Commit
+
+   Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
+
+   ```bash
+   pip install pre-commit
+   pre-commit install
+   ```
+
+   Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python.
+
+   Once installed, `pre-commit` checks the style of code and documentation in every commit.  We will see something like the following when you run `git commit`:
+
+   ```
+   ➜  git commit
+   CRLF end-lines remover...............................(no files to check)Skipped
+   yapf.................................................(no files to check)Skipped
+   Check for added large files..............................................Passed
+   Check for merge conflicts................................................Passed
+   Check for broken symlinks................................................Passed
+   Detect Private Key...................................(no files to check)Skipped
+   Fix End of Files.....................................(no files to check)Skipped
+   clang-formater.......................................(no files to check)Skipped
+   [my-cool-stuff c703c041] add test file
+    1 file changed, 0 insertions(+), 0 deletions(-)
+    create mode 100644 233
+   ```
+
+1. Build and test
+
+   Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
+
+1. Keep pulling
+
+   An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and it's easier to resolve smaller conflicts.
+
+   ```bash
+   git remote add upstream https://github.com/PaddlePaddle/Paddle
+   git pull upstream develop
+   ```
+
+1. Push and file a pull request
+
+   You can "push" your local work into your forked repo:
+
+   ```bash
+   git push origin my-cool-stuff
+   ```
+
+   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
+
+   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
+
+   If your change is for fixing an issue, please write ["Fixes <issue-URL>"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request.  GitHub will close the issue when the owners merge your pull request.
+
+   Please remember to specify some reviewers for your pull request.  If you don't know who the right ones are, please follow GitHub's recommendation.
+
+
+1. Delete local and remote branches
+
+   To keep your local workspace and your fork clean, you might want to remove merged branches:
+
+   ```bash
+   git push origin :my-cool-stuff
+   git checkout develop
+   git pull upstream develop
+   git branch -d my-cool-stuff
+   ```
+
+### Code Review
+
+-  Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email.  Please do this after your pull request passes the CI.
+
+- Please answer reviewers' every comment.  If you are to follow the comment, please write "Done"; please give a reason otherwise.
+
+- If you don't want your reviewers to get overwhelmed by email notifications, you might reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
+
+- Reduce the unnecessary commits.  Some developers commit often.  It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`.
+
+
+## Coding Standard
+
+### Code Style
+
+Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
+
+Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
+
+Our build process helps to check the code style.  In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default.
+
+Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`.  To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
+
+### Unit Tests
+
+Please remember to add related unit tests.
+
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
+
+- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/).
+
+
+### Writing Logs
+
+We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
+
+For general information, please use `LOG`.  For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose).  The reason is explained [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
+
+`VLOG` requires a *verbose level* parameter.  For example:
+
+```c++
+VLOG(3) << "Operator FC is taking " << num_inputs << " inputs.";
+```
+
+When we run a PaddlePaddle application or test, we can specify a verbose threshold.  For example:
+
+```bash
+GLOG_vmodule=buddy_allocator=2 \
+GLOG_v=10 \
+python \
+../python/paddle/v2/framework/tests/test_recurrent_op.py
+```
+
+This will enable VLOG messages generated by `buddy_allocator.{h,cc}` in the verbose range of 0 to 2, and VLOG messages from all other files in the verbose range of 0 to 10, so you will see the above example VLOG message, which is in level 3.  This suggests that we output overall messages in lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
+
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
--- a/2
+++ b/2
@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
    apt-get install -y \
-    git python-pip python-dev openssh-server bison  \
+    git python-pip python-dev openssh-server bison libnccl-dev \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \
--- a/README.md
+++ b/README.md
@ -51,19 +51,19 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 - **Connected to Products**

    In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
-    PaddlePaddle has been deployed into products or service with a vast number
+    PaddlePaddle has been deployed into products and services with a vast number
    of users, including ad click-through rate (CTR) prediction, large-scale image
    classification, optical character recognition(OCR), search ranking, computer
    virus detection, recommendation, etc. It is widely utilized in products at
-    Baidu and it has achieved a significant impact. We hope you can also exploit
-    the capability of PaddlePaddle to make a huge impact for your product.
+    Baidu and it has achieved a significant impact. We hope you can also explore
+    the capability of PaddlePaddle to make an impact on your product.

 ## Installation

 It is recommended to check out the
 [Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).

 ## Documentation

@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and

 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)

-  You might want to start from this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in a Jupyter Notebook.

 - [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)

--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@ -0,0 +1,48 @@
+# Benchmark
+
+Machine:
+
+- Server
+ 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop
+ 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
+ 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
+- Desktop
+ 	- i7-6700k
+
+System: CentOS release 6.3 (Final), Docker 1.12.1.
+
+PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
+
+- MKL-DNN tag v0.10
+- MKLML 2018.0.20170720
+- OpenBLAS v0.2.20
+	 
+On each machine, we will test and compare the performance of training on a single node using MKL-DNN / MKLML / OpenBLAS respectively.
+
+## Benchmark Model
+
+### Server
+Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+
+Input image size - 3 * 224 * 224, Time: images/second
+
+- VGG-19
+
+| BatchSize    | 64    | 128  | 256     |
+|--------------|-------| -----| --------|
+| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
+| MKLML        | 11.02 | 12.86 | 15.33  |
+| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+
+
+chart on batch size 128
+TBD
+
+ - ResNet
+ - GoogLeNet
+
+### Laptop
+TBD
+### Desktop
+TBD
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 def process(settings, file_list):
    for i in xrange(1024):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class)
+        lab = random.randint(0, settings.num_class - 1)
        yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@ -0,0 +1,213 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg("layer_num", int, 50)
+is_test = get_config_arg("is_test", bool, False)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+
+#######################Network Configuration #############
+def conv_bn_layer(name,
+                  input,
+                  filter_size,
+                  num_filters,
+                  stride,
+                  padding,
+                  channels=None,
+                  active_type=ReluActivation()):
+    """
+    A wrapper for conv layer with batch normalization layers.
+    Note:
+    conv layer has no activation.
+    """
+
+    tmp = img_conv_layer(
+        name=name + "_conv",
+        input=input,
+        filter_size=filter_size,
+        num_channels=channels,
+        num_filters=num_filters,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=False)
+    return batch_norm_layer(
+        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+
+
+def bottleneck_block(name, input, num_filters1, num_filters2):
+    """
+    A wrapper for bottlenect building block in ResNet.
+    Last conv_bn_layer has no activation.
+    Addto layer has activation of relu.
+    """
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=1,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[input, last_name], act=ReluActivation())
+
+
+def mid_projection(name, input, num_filters1, num_filters2, stride=2):
+    """
+    A wrapper for middile projection in ResNet.
+    projection shortcuts are used for increasing dimensions,
+    and other shortcuts are identity
+    branch1: projection shortcuts are used for increasing
+    dimensions, has no activation.
+    branch2x: bottleneck building block, shortcuts are identity.
+    """
+    # stride = 2
+    branch1 = conv_bn_layer(
+        name=name + '_branch1',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=stride,
+        padding=0,
+        active_type=LinearActivation())
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2a',
+        input=input,
+        filter_size=1,
+        num_filters=num_filters1,
+        stride=stride,
+        padding=0)
+    last_name = conv_bn_layer(
+        name=name + '_branch2b',
+        input=last_name,
+        filter_size=3,
+        num_filters=num_filters1,
+        stride=1,
+        padding=1)
+
+    last_name = conv_bn_layer(
+        name=name + '_branch2c',
+        input=last_name,
+        filter_size=1,
+        num_filters=num_filters2,
+        stride=1,
+        padding=0,
+        active_type=LinearActivation())
+
+    return addto_layer(
+        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
+
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
+    """
+    A wrapper for 50,101,152 layers of ResNet.
+    res2_num: number of blocks stacked in conv2_x
+    res3_num: number of blocks stacked in conv3_x
+    res4_num: number of blocks stacked in conv4_x
+    res5_num: number of blocks stacked in conv5_x
+    """
+    # For ImageNet
+    # conv1: 112x112
+    tmp = conv_bn_layer(
+        "conv1",
+        input=img,
+        filter_size=7,
+        channels=3,
+        num_filters=64,
+        stride=2,
+        padding=3)
+    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
+
+    # conv2_x: 56x56
+    tmp = mid_projection(
+        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
+    for i in xrange(2, res2_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
+
+    # conv3_x: 28x28
+    tmp = mid_projection(
+        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
+    for i in xrange(2, res3_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res3_" + str(i),
+            input=tmp,
+            num_filters1=128,
+            num_filters2=512)
+
+    # conv4_x: 14x14
+    tmp = mid_projection(
+        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
+    for i in xrange(2, res4_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res4_" + str(i),
+            input=tmp,
+            num_filters1=256,
+            num_filters2=1024)
+
+    # conv5_x: 7x7
+    tmp = mid_projection(
+        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
+    for i in xrange(2, res5_num + 1, 1):
+        tmp = bottleneck_block(
+            name="res5_" + str(i),
+            input=tmp,
+            num_filters1=512,
+            num_filters2=2048)
+
+    tmp = img_pool_layer(
+        name='avgpool',
+        input=tmp,
+        pool_size=7,
+        stride=1,
+        pool_type=AvgPooling())
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 50:
+    resnet = deep_res_net(3, 4, 6, 3)
+elif layer_num == 101:
+    resnet = deep_res_net(3, 4, 23, 3)
+elif layer_num == 152:
+    resnet = deep_res_net(3, 8, 36, 3)
+else:
+    print("Wrong layer number.")
+
+lbl = data_layer(name="label", size=num_class)
+loss = cross_entropy(name='loss', input=resnet, label=lbl)
+inputs(img, lbl)
+outputs(loss)
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@ -0,0 +1,49 @@
+set -e
+
+function train() {
+  # Run one timed `paddle train` job on CPU and tee its output to a log file.
+  #   $1: topology; "${topology}.py" is used as the config file
+  #   $2: layer_num, forwarded to the config through --config_args
+  #   $3: batch size, forwarded to the config through --config_args
+  #   $4: "True" or "False", forwarded to --use_mkldnn
+  unset OMP_NUM_THREADS MKL_NUM_THREADS
+  export OMP_DYNAMIC="FALSE"
+  export KMP_AFFINITY="granularity=fine,compact,0,0"
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  # Quote the flag so that a missing 4th argument reaches the error branch
+  # instead of breaking the `[ ... ]` expression.
+  if [ "$use_mkldnn" == "True" ]; then
+    thread=1
+    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ "$use_mkldnn" == "False" ]; then
+    thread=`nproc`
+    # each trainer uses only 1 core to avoid conflicts
+    export OMP_NUM_THREADS=1
+    export MKL_NUM_THREADS=1
+    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    # Report the offending 4th argument and stop with a failure status.
+    echo "Wrong input $use_mkldnn, use True or False."
+    exit 1
+  fi
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log}
+}
+
+# Create a placeholder train.list only when it is missing.  It is a regular
+# file, so test with -f; the previous -d test was always true and overwrote
+# any existing list.
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+# Directory that receives the per-run logs written by train().
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# Benchmark VGG-19 and ResNet-50 at each batch size, first with MKL-DNN
+# enabled and then with it disabled.
+for use_mkldnn in True False; do
+  for batchsize in 64 128 256; do
+    train vgg 19 $batchsize $use_mkldnn
+    train resnet 50  $batchsize $use_mkldnn
+  done
+done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -0,0 +1,103 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg('layer_num', int, 19)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+    "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.001 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def vgg_network(vgg_num=3):
+    tmp = img_conv_group(
+        input=img,
+        num_channels=3,
+        conv_padding=1,
+        conv_num_filter=[64, 64],
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_size=2,
+        pool_stride=2,
+        pool_type=MaxPooling())
+
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=[128, 128],
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    channels = []
+    for i in range(vgg_num):
+        channels.append(256)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    channels = []
+    for i in range(vgg_num):
+        channels.append(512)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 16:
+    vgg = vgg_network(3)
+elif layer_num == 19:
+    vgg = vgg_network(4)
+else:
+    print("Wrong layer number.")
+
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=vgg, label=lab)
+outputs(loss)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -24,6 +24,10 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)

+if(WITH_TESTING)
+    add_definitions(-DPADDLE_WITH_TESTING)
+endif(WITH_TESTING)
+
 if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
@ -49,19 +53,20 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)

 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)

    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
+
    FIND_PACKAGE(CUDA REQUIRED)

    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+        message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
    endif()

    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
+        message(FATAL_ERROR "Paddle needs cudnn to compile")
    endif()

    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
    set(IOS_ARCH "arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    set(IOS_ARCH "i386;x86_64")
-  elseif(IOS_PLATFORM STREQUAL "WATCHOS")
-    set(IOS_ARCH armv7k)
+    # FIXME(liuyiqun): support "i386;x86_64" future
+    set(IOS_ARCH "x86_64")
  endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@ -8,7 +8,7 @@ ExternalProject_Add(
    extern_eigen3
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         "master"
+    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
    PREFIX          ${EIGEN_SOURCE_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -36,6 +36,7 @@ ExternalProject_Add(
    # change this back to the official Github repo once my PR is
    # merged.
    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
+    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@ -45,11 +46,11 @@ ExternalProject_Add(
                    -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )

 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@ -31,6 +31,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_TAG         v0.3.5
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@ -43,12 +44,12 @@ ExternalProject_Add(
                    -DWITH_GFLAGS=ON
                    -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
                    -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )

 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@ -56,11 +56,11 @@ IF(WITH_TESTING)
                        -DBUILD_GMOCK=ON
                        -Dgtest_disable_pthreads=ON
                        -Dgtest_force_shared_crt=ON
-                        -DCMAKE_BUILD_TYPE=Release
+                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                        ${EXTERNAL_OPTIONAL_ARGS}
        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=Release
+                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    )

    ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
--- a/cmake/external/nccl.cmake
+++ b/cmake/external/nccl.cmake
@ -0,0 +1,84 @
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads NVIDIA NCCL as an external project and defines the `nccl` target.
+#   WITH_DSO on : the sources are only checked out; nothing is built or linked.
+#   WITH_DSO off: NCCL is built with make and imported as a static library.
+# The whole file is skipped when WITH_GPU is off.
+if(NOT WITH_GPU)
+  return()
+endif()
+
+include(ExternalProject)
+
+set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
+
+# ExternalProject checks the sources out under <PREFIX>/src/<target name>.
+include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
+
+if(WITH_DSO)
+  # If we use DSO, we do not build nccl, just download the dependencies
+  set(NCCL_BUILD_COMMAND "")
+  set(NCCL_INSTALL_COMMAND "")
+  set(NCCL_INSTALL_DIR "")
+else()
+  # otherwise, we build nccl and link it.
+  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
+  # Note: cuda 8.0 is needed to make nccl
+  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
+  # Each command is a CMake list with one element per argument. A single
+  # quoted string such as "make -j 8" would be passed to ExternalProject as
+  # one argument, i.e. as the name of the program to execute.
+  set(NCCL_BUILD_COMMAND make -j 8)
+  set(NCCL_INSTALL_COMMAND make install "PREFIX=${NCCL_INSTALL_DIR}")
+endif()
+
+ExternalProject_Add(
+    extern_nccl
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
+    GIT_TAG         "v1.3.4-1"
+    PREFIX          "${NCCL_SOURCE_DIR}"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    # NCCL v1 is built by the Makefile in its source root, so make must run
+    # in the source tree instead of the separate (empty) build directory.
+    BUILD_IN_SOURCE   1
+    # Quoted on purpose: when the variable is empty (WITH_DSO) this passes "",
+    # which disables the step instead of falling back to the default command.
+    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
+    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
+    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
+    TEST_COMMAND      ""
+)
+
+if(WITH_DSO)
+  # Placeholder target that only carries the dependency on extern_nccl.
+  # CMake older than 3.3 gets a dummy static library instead of an INTERFACE one.
+  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
+    add_library(nccl STATIC ${dummyfile})
+  else()
+    add_library(nccl INTERFACE)
+  endif()
+else()
+  # Static library expected to be placed here by the install step above.
+  add_library(nccl STATIC IMPORTED GLOBAL)
+  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
+               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
+endif()
+
+# Make sure NCCL is fetched (and built, if requested) before anything that depends on nccl.
+add_dependencies(nccl extern_nccl)
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
            ${OPTIONAL_ARGS}
            -Dprotobuf_BUILD_TESTS=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_INSTALL_LIBDIR=lib
        CMAKE_CACHE_ARGS
            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
            ${OPTIONAL_CACHE_ARGS}
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@ -1,8 +1,26 @@
-INCLUDE(ExternalProject)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

-SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+if(NOT WITH_PYTHON)
+    return()
+endif()
+
+include(ExternalProject)

-INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)

 ExternalProject_Add(
        extern_pybind
@ -17,14 +35,12 @@ ExternalProject_Add(
        TEST_COMMAND      ""
 )

-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
    add_library(pybind STATIC ${dummyfile})
 else()
    add_library(pybind INTERFACE)
 endif()

 add_dependencies(pybind extern_pybind)
-
-LIST(APPEND external_project_dependencies pybind)
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@ -35,6 +35,7 @@ ExternalProject_Add(
    extern_warpctc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
    PREFIX          ${WARPCTC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@ -48,9 +49,9 @@ ExternalProject_Add(
                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                    -DBUILD_SHARED=ON
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -42,11 +42,11 @@ ExternalProject_Add(
                    -DBUILD_SHARED_LIBS=OFF
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_MACOSX_RPATH=ON
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )

 LIST(APPEND external_project_dependencies zlib)
--- a/Show More
+++ b/Show More