update design doc

7 years ago · fab63cc612
parent 1ce2d61341 9cfa5ce30f
commit fab63cc612
975 changed files with 37627 additions and 25205 deletions
--- a/.clang-format
+++ b/.clang-format
@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
--- a/.gitignore
+++ b/.gitignore
@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
 python/paddle/version.py
--- a/.travis.yml
+++ b/.travis.yml
@ -42,7 +42,7 @@ before_install:
 script:
  - |
    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
  - |
    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -20,6 +20,8 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 include(system)
 project(paddle CXX C Go)
 message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
 message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
@ -54,7 +56,9 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@ -67,9 +71,6 @@ if(ANDROID OR IOS)
    if(ANDROID)
        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
        elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
            # TODO: support glog for Android api 16 ~ 19 in the future
            message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
        endif()
    endif()
@ -83,6 +84,8 @@ if(ANDROID OR IOS)
        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKL OFF CACHE STRING
        "Disable MKL when cross-compiling for Android and iOS" FORCE)
    set(WITH_GOLANG OFF CACHE STRING
        "Disable golang when cross-compiling for Android and iOS" FORCE)
    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)
@ -133,6 +136,8 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/nccl)
 include(external/cares)
 include(external/grpc)
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@ -194,6 +199,10 @@ if(WITH_GOLANG)
 endif(WITH_GOLANG)
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 add_subdirectory(paddle)
 if(WITH_PYTHON)
  add_subdirectory(python)
--- a/2
+++ b/2
@ -29,7 +29,7 @@ RUN apt-get update && \
    automake locales clang-format swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
    apt-get clean -y
 # Install Go and glide
--- a/README.md
+++ b/README.md
@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
    examples:
      - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
      - Highly optimized recurrent networks which can handle **variable-length**
      sequence without padding.
      - Optimized local and distributed training for models with high dimensional
@ -61,32 +61,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 It is recommended to check out the
-[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
 ## Documentation
-We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
-[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
  You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
  You can run distributed training jobs on MPI clusters.
- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
   You can also run distributed training jobs on Kubernetes clusters.
- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
   We appreciate your contributions!
--- a/RELEASE.cn.md
+++ b/RELEASE.cn.md
@ -1,3 +1,62 @@
 # v0.11.0版本
 ## PaddlePaddle Fluid
 - PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中，不再有*模型*这个概念，应用也不再包含一个用于描述Operator图或者一系列层的符号描述，而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流，例如 if-else-then或者for，而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如：
  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
 - 在v0.11.0版本中，我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中，我们将提升和优化Executor成为一个调试器，就像GDB。并可能提供一些编译器，这个编译器会读取一个上文所描述的应用然后编译成一个等价的
 源代码，这个源代码可以被nvcc编译成可以使用CUDA的二进制，或者被icc编译成可以充分利用Intel CPU的二进制。
 ## 新特点
 * 发布 `PaddlePaddle Fluid`。
 * 增加了用于模型预测的C-API。
 * 用Fluid API实现了一个简单的GAN的例子。
 * 增加了关于性能调优的文档。
 * 为`paddle.v2.dataset`下载数据集提供了重试机制.
 * C++中使用protobuf-lite替换protobuf减少了二进制的大小。
 * 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment).
 * 基于Bazel API利用cmake实现了一个新的构建系统函数库。
 * 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库.
 * [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
  - 完成了 11个 MKL-DNN 层: Convolution, Fully connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。
  - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet
  - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。
 * 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign)
 * 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod)
 * 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance)
 * 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq)
 * 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score)
 * 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice)
 * 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
 * 增加移动端友好的网页
 ## 改进
 * 使用一个Python`whl`包即可安装.
 * [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。
 * 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。
 * 删除了有一些bug的BarrierStat。
 * 清理和删除了paddle::Parameter中未使用的函数。
 * 删除了ProtoDataProvider。
 * Huber loss同时支持回归和分类。
 * 为sequence pooling 层增加`stride`参数。
 * v2 API自动使用cudnn batch normalization。
 * 可以使用一个固定的参数名共享BN层的参数。
 * 2D convolution operation支持variable-dimension input特性。
 * 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。
 * 优化网页导航。
 ## 错误修复
 * 修复ROI pooling的Bug. cc9a761
 * 修复当label是dense vector时AUC变成0的问题. #5274
 * 修复WarpCTC 层的Bug.
 # v0.10.0版本
 我们非常高兴发布了PaddlePaddle V0.10.0版，并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
--- a/RELEASE.md
+++ b/RELEASE.md
@ -1,3 +1,75 @@
 # Release v0.11.0
 ## PaddlePaddle Fluid
 - Release 0.11.0 includes a new feature *PaddlePaddle Fluid*.  Fluid is
  designed to allow users to program like PyTorch and TensorFlow Eager Execution.
  In these systems, there is no longer the concept *model* and applications
  do not include a symbolic description of a graph of operators nor a sequence
  of layers. Instead, applications look exactly like a usual program that
  describes a process of training or inference.  The difference between
  Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's
  control-flow, `if-then-else` nor `for`.  Instead, Fluid provides its
  C++ implementations and their Python binding using the `with` statement.  For an example
  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
 - In 0.11.0, we provide a C++ class `Executor` to run a Fluid program.
 Executor works like an interpreter. In future versions, we will improve
 `Executor` into a debugger like GDB, and we might provide some compilers,
 which, for example, takes an application like the above one, and outputs
 an equivalent C++ source program, which can be compiled using
 [`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html)
 to generate binaries that use CUDA, or using
 [`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries
 that make full use of Intel CPUs.
 ## New Features
 * Release `PaddlePaddle Fluid`.
 * Add C-API for model inference
 * Use fluid API to create a simple GAN demo.
 * Add a developer guide about performance tuning.
 * Add retry when download `paddle.v2.dataset`.
 * Linking protobuf-lite not protobuf in C++. Reduce the binary size.
 * Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released.
 * A new style cmake functions for Paddle. It is based on Bazel API.
 * Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`.
 * [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
  - Complete 11 MKL-DNN layers: Convolution, Fully connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN.
  - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet
  - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML.
 * Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign).
 * Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod).
 * Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance).
 * Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq).
 * Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score).
 * Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice).
 * Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
 * Add mobile friendly webpages.
 ## Improvements
 * Build and install using a single `whl` package.
 * [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标).
 * Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices.
 * Remove buggy BarrierStat.
 * Clean and remove unused functions in paddle::Parameter.
 * Remove ProtoDataProvider.
 * Huber loss supports both regression and classification.
 * Add the `stride` parameter  for sequence pooling layers.
 * Enable v2 API use cudnn batch normalization automatically.
 * The BN layer's parameters can be shared by using a fixed parameter name.
 * Support variable-dimension input feature for 2D convolution operation.
 * Refine cmake about CUDA to automatically detect GPU architecture.
 * Improved website navigation.
 ## Bug Fixes
 * Fix bug in ROI pooling. cc9a761
 * Fix AUC being zero when the label is a dense vector. #5274
 * Fix bug in WarpCTC layer.
 # Release v0.10.0
 We are glad to release version 0.10.0.  In this version, we are happy to release the new 
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@ -2,28 +2,27 @@
 Machine:
- Server
+- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
- 	- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop: TBD
 - Laptop
 	- DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
 	- i5 MacBook Pro (Retina, 13-inch, Early 2015)
 - Desktop
 	- i7-6700k
 System: CentOS release 6.3 (Final), Docker 1.12.1.
-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
+PaddlePaddle: (TODO: will rerun after 0.11.0)
-
+- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
- MKL-DNN tag v0.10
+  - MKL-DNN tag v0.11
- MKLML 2018.0.20170720
+  - MKLML 2018.0.1.20171007
- OpenBLAS v0.2.20
+- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
  - OpenBLAS v0.2.20
 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
 ## Benchmark Model
 ### Server
 #### Training
 Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 Note that the speeds below include forward, backward and parameter update time, so they cannot be compared directly with the results of the Caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only covers forward and backward. The parameter update time becomes very significant when the weights are large, especially on AlexNet.
 Input image size - 3 * 224 * 224, Time: images/second
@ -31,18 +30,78 @@ Input image size - 3 * 224 * 224, Time: images/second
 | BatchSize    | 64    | 128  | 256     |
 |--------------|-------| -----| --------|
-| OpenBLAS     | 7.82  | 8.62  | 10.34  | 
+| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
-| MKLML        | 11.02 | 12.86 | 15.33  |
+| MKLML        | 12.12 | 13.70 | 16.18  |
-| MKL-DNN      | 27.69 | 28.8 | 29.27  |
+| MKL-DNN      | 28.46 | 29.83 | 30.44  |
 <img src="figs/vgg-cpu-train.png" width="500">
-chart on batch size 128
+ - ResNet-50
-TBD
+
 | BatchSize    | 64    | 128   | 256    |
 |--------------|-------| ------| -------|
 | OpenBLAS     | 25.22 | 25.68 | 27.12  | 
 | MKLML        | 32.52 | 31.89 | 33.12  |
 | MKL-DNN      | 81.69 | 82.35 | 84.08  |
 <img src="figs/resnet-cpu-train.png" width="500">
 - ResNet
 - GoogLeNet
 | BatchSize    | 64    | 128   | 256    |
 |--------------|-------| ------| -------|
 | OpenBLAS     | 89.52 | 96.97 | 108.25 | 
 | MKLML        | 128.46| 137.89| 158.63 |
 | MKL-DNN      | 250.46| 264.83| 269.50 |
 <img src="figs/googlenet-cpu-train.png" width="500">
 - Alexnet
 | BatchSize    | 64     | 128    | 256    |
 |--------------|--------| ------ | -------|
 | OpenBLAS     | 2.13   | 2.45   | 2.68   | 
 | MKLML        | 66.37  | 105.60 | 144.04 |
 | MKL-DNN      | 399.00 | 498.94 | 626.53 | 
 chart TBD
 #### Inference
 Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 - VGG-19
 | BatchSize | 1     | 2     | 4     | 8     | 16    |
 |-----------|-------|-------|-------|-------|-------|
 | OpenBLAS  | 1.07  | 1.08  | 1.06  | 0.88  | 0.65  |
 | MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
 | MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
 - ResNet-50
 | BatchSize | 1     | 2      | 4      | 8      | 16     |
 |-----------|-------|--------|--------|--------|--------|
 | OpenBLAS  | 3.35  | 3.19   | 3.09   | 2.55   | 1.96   |
 | MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
 | MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
 - GoogLeNet
 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
 | OpenBLAS  | 12.04  | 11.31  | 10.00  | 9.07   | 4.34   |
 | MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
 | MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
 - Alexnet
 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
 | OpenBLAS  |    |   |   |   |    |
 | MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
 | MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
 chart TBD
 ### Laptop
 TBD
 ### Desktop
 TBD
--- a/benchmark/figs/googlenet-cpu-train.png
+++ b/benchmark/figs/googlenet-cpu-train.png
--- a/benchmark/figs/resnet-cpu-train.png
+++ b/benchmark/figs/resnet-cpu-train.png
--- a/benchmark/figs/vgg-cpu-train.png
+++ b/benchmark/figs/vgg-cpu-train.png
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@ -6,10 +6,24 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 gp = get_config_arg('layer_num', int, 1)
 is_infer = get_config_arg("is_infer", bool, False)
 num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
    'is_infer': is_infer,
    'num_samples': num_samples
 }
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
    "test.list" if is_infer else None,
    module="provider",
    obj="process",
    args=args)
 settings(
    batch_size=batch_size,
@ -31,7 +45,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
@ -40,11 +54,11 @@ net = img_conv_layer(
    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
 net = fc_layer(
@ -59,6 +73,9 @@ net = fc_layer(
    layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-lab = data_layer('label', num_class)
+if is_infer:
-loss = cross_entropy(input=net, label=lab)
+    outputs(net)
-outputs(loss)
+else:
    lab = data_layer('label', num_class)
    loss = cross_entropy(input=net, label=lab)
    outputs(loss)
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@ -5,10 +5,24 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
-
+use_gpu = get_config_arg('use_gpu', bool, True)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+is_infer = get_config_arg("is_infer", bool, False)
 num_samples = get_config_arg('num_samples', int, 2560)
 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
    'is_infer': is_infer,
    'num_samples': num_samples
 }
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
    "test.list" if is_infer else None,
    module="provider",
    obj="process",
    args=args)
 settings(
    batch_size=batch_size,
@ -16,6 +30,8 @@ settings(
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))
 conv_projection = conv_projection if use_gpu else img_conv_layer
 def inception2(name, input, channels, \
    filter1,
    filter3R, filter3,
@ -138,12 +154,11 @@ def inception(name, input, channels, \
    cat = concat_layer(
        name=name,
        input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
        act=ReluActivation())
    return cat
 lab = data_layer(name="label", size=1000)
 data = data_layer(name="input", size=3 * height * width)
 # stage 1
@ -221,6 +236,10 @@ pool5 = img_pool_layer(
 dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
 out3 = fc_layer(
    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
 loss3 = cross_entropy(name='loss3', input=out3, label=lab)
-outputs(loss3)
+if is_infer:
    outputs(out3)
 else:
    lab = data_layer(name="label", size=num_class)
    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
    outputs(loss3)
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@ -13,14 +13,21 @@ def initHook(settings, height, width, color, num_class, **kwargs):
        settings.data_size = settings.height * settings.width * 3
    else:
        settings.data_size = settings.height * settings.width
-
+    settings.is_infer = kwargs.get('is_infer', False)
-    settings.slots = [dense_vector(settings.data_size), integer_value(1)]
+    settings.num_samples = kwargs.get('num_samples', 2560)
    if settings.is_infer:
        settings.slots = [dense_vector(settings.data_size)]
    else:
        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(1024):
+    for i in xrange(settings.num_samples):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class - 1)
+        if settings.is_infer:
-        yield img.astype('float32'), int(lab)
+            yield img.astype('float32')
        else:
            lab = random.randint(0, settings.num_class - 1)
            yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@ -6,11 +6,23 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
-is_test = get_config_arg("is_test", bool, False)
+is_infer = get_config_arg("is_infer", bool, False)
-
+num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+
 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
    'is_infer': is_infer,
    'num_samples': num_samples
 }
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
    "test.list" if is_infer else None,
    module="provider",
    obj="process",
    args=args)
 settings(
    batch_size=batch_size,
@ -45,7 +57,10 @@ def conv_bn_layer(name,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(
-        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+        name=name + "_bn",
        input=tmp,
        act=active_type,
        use_global_stats=is_infer)
 def bottleneck_block(name, input, num_filters1, num_filters2):
@ -207,7 +222,9 @@ elif layer_num == 152:
 else:
    print("Wrong layer number.")
-lbl = data_layer(name="label", size=num_class)
+if is_infer:
-loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(resnet)
-inputs(img, lbl)
+else:
-outputs(loss)
+    lbl = data_layer(name="label", size=num_class)
    loss = cross_entropy(name='loss', input=resnet, label=lbl)
    outputs(loss)
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@ -0,0 +1,87 @@
 set -e
 # Convert a clock string "HH:MM:SS[.fraction]" into seconds.
 #
 # Arguments:
 #   $1  clock string whose fields are separated by ':'
 # Output:
 #   the total number of seconds, printed with two decimals on stdout
 #
 # The fields are read by awk as input data rather than being spliced into
 # the awk program text, so a missing field counts as 0 instead of producing
 # an awk syntax error, and leading zeros are always read as decimal.
 function clock_to_seconds() {
   echo "$1" | awk -F ':' '{printf "%.2f\n", $3 + $2 * 60 + $1 * 3600}'
 }
 # Run one inference benchmark with MKL-DNN or MKLML and report its FPS.
 #
 # Arguments:
 #   $1  topology   - config name; "${topology}.py" is passed to paddle
 #   $2  layer_num  - forwarded to the config through --config_args
 #   $3  bs         - batch size
 #   $4  use_mkldnn - "True" or "False"; anything else aborts the script
 #                    with a non-zero status
 #
 # If models/<topology>-<layer_num>/pass-00000/ does not exist, a one-pass
 # model is trained first so that there is something to load. The test job
 # output is tee'd to a log file under logs/, and the computed FPS is both
 # printed and appended to that log.
 function infer() {
   unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
   topology=$1
   layer_num=$2
   bs=$3
   use_mkldnn=$4
   if [ "$use_mkldnn" == "True" ]; then
     thread=1
     log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
   elif [ "$use_mkldnn" == "False" ]; then
     thread=`nproc`
     # cap the trainer count at the batch size
     if [ $thread -gt $bs ]; then
       thread=$bs
     fi
     log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
   else
     echo "Wrong input $4, use True or False."
     # invalid usage must not be reported as success
     exit 1
   fi

   models_in="models/${topology}-${layer_num}/pass-00000/"
   if [ ! -d "$models_in" ]; then
     echo "Training model ${topology}_${layer_num}"
     paddle train --job=train \
       --config="${topology}.py" \
       --use_mkldnn=True \
       --use_gpu=False \
       --trainer_count=1 \
       --num_passes=1 \
       --save_dir="models/${topology}-${layer_num}" \
       --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
       > /dev/null 2>&1
     echo "Done"
   fi
   # one log line every 256 samples, whatever the batch size
   log_period=$((256 / bs))
   paddle train --job=test \
     --config="${topology}.py" \
     --use_mkldnn=$use_mkldnn \
     --use_gpu=False \
     --trainer_count=$thread \
     --log_period=$log_period \
     --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
     --init_model_path="$models_in" \
     2>&1 | tee "${log}"

   # Measure the time spanned by the last 5 log periods (5 * 256 = 1280
   # samples); everything logged before that is treated as warm-up.
   # NOTE(review): assumes the timestamp is the second whitespace-separated
   # field of each log line and that the run does not cross midnight.
   start=`tail -n 7 "${log}" | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   end=`tail -n 2 "${log}" | head -n 1 | awk -F ' ' '{print $2}' | xargs`
   start_sec=`clock_to_seconds $start`
   end_sec=`clock_to_seconds $end`
   fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
   echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> "${log}"
   echo "FPS: $fps images/sec" 2>&1 | tee -a "${log}"
 }
 # Create placeholder list files when they are missing.
 for list_file in train.list test.list; do
   if [ ! -f "$list_file" ]; then
     echo " " > "$list_file"
   fi
 done

 # Output directories for the benchmark logs and the saved models.
 [ -d "logs" ] || mkdir logs
 [ -d "models" ] || mkdir -p models

 # inference benchmark: every model at every batch size, first with
 # MKL-DNN enabled and then with it disabled
 for use_mkldnn in True False; do
   for batchsize in 1 2 4 8 16; do
     for spec in "vgg 19" "resnet 50" "googlenet v1" "alexnet 2"; do
       # $spec is intentionally unquoted: it expands to <topology> <layer_num>
       infer $spec $batchsize $use_mkldnn
     done
   done
 done
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@ -8,13 +8,13 @@ function train() {
  use_mkldnn=$4
  if [ $4 == "True" ]; then
    thread=1
-    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
  elif [ $4 == "False" ]; then
    thread=`nproc`
    # each trainer_count use only 1 core to avoid conflict
-    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
+    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
-    echo "Wrong input $3, use True or False."
+    echo "Wrong input $4, use True or False."
    exit 0
  fi
  args="batch_size=${bs},layer_num=${layer_num}"
@ -28,18 +28,25 @@ function train() {
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log} 
  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
-if [ ! -d "train.list" ]; then
+if [ ! -f "train.list" ]; then
  echo " " > train.list
 fi
 if [ ! -d "logs" ]; then
  mkdir logs
 fi
 # training benchmark
 for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn
-    train resnet 50  $batchsize $use_mkldnn
+    train resnet 50 $batchsize $use_mkldnn
    train googlenet v1 $batchsize $use_mkldnn
    train alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@ -0,0 +1,68 @@
 set -e
 # Convert a clock string "HH:MM:SS[.frac]" into seconds and print the result
 # with two decimals (e.g. "01:02:03.5" -> "3723.50").
 #   $1 - timestamp whose fields are separated by ':'
 # The timestamp is fed to awk as input data rather than spliced into the awk
 # program text, so zero-padded fields are always read as decimal numbers and a
 # malformed value cannot turn into an awk syntax error.
 function clock_to_seconds() {
  echo "$1" | awk -F ':' '{printf "%.2f\n", $3 + $2 * 60 + $1 * 3600}'
 }
 # Run one OpenBLAS CPU inference benchmark with `paddle train --job=test`
 # and append the measured throughput (FPS) to the run's log file.
 #   $1 - topology name; "${topology}.py" is used as the paddle config
 #   $2 - layer_num, forwarded through --config_args
 #   $3 - batch size
 # Requires a saved model in models/<topology>-<layer_num>/pass-00000/ and
 # exits with a non-zero status when that directory is missing.
 function infer() {
  topology=$1
  layer_num=$2
  bs=$3
  # Use one trainer per core, but never more trainers than the batch size.
  trainers=`nproc`
  if [ $trainers -gt $bs ]; then
    trainers=$bs
  fi
  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
  # Share the cores among the trainers, with at least one thread each.
  threads=$((`nproc` / trainers))
  if [ $threads -eq 0 ]; then
    threads=1
  fi
  export OPENBLAS_NUM_THREADS=$threads
  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d "$models_in" ]; then
    echo "./run_mkl_infer.sh to save the model first"
    # A missing model is a failure, so report it through the exit status.
    exit 1
  fi
  # Log once every 32 samples regardless of the batch size.
  log_period=$((32 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$trainers \
    --log_period=$log_period \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
    --init_model_path=$models_in \
    2>&1 | tee ${log}
  # Measure the time spent on the last 5 log periods, i.e. 160 (= 32 * 5)
  # samples; everything before that is treated as warm-up time.
  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds $start`
  end_sec=`clock_to_seconds $end`
  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec);" >> ${log}
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
 # First-run setup: placeholder data lists and the log directory.
 for list_file in train.list test.list; do
  [ -f "$list_file" ] || echo " " > "$list_file"
 done
 [ -d "logs" ] || mkdir logs
 # inference benchmark: every network at every batch size
 for batchsize in 1 2 4 8 16; do
  for bench_net in "vgg 19" "resnet 50" "googlenet v1" "alexnet 2"; do
    # Left unquoted on purpose: "<topology> <layer_num>" splits into two args.
    infer $bench_net $batchsize
  done
 done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@ -0,0 +1,41 @@
 set -e
 # Time one OpenBLAS CPU training run with `paddle train --job=time` and
 # append the resulting throughput (FPS) to the run's log file.
 #   $1 - topology name; "${topology}.py" is used as the paddle config
 #   $2 - layer_num, forwarded through --config_args
 #   $3 - batch size
 function train() {
  topology=$1
  layer_num=$2
  bs=$3
  # each trainer_count use only 1 core to avoid conflict
  export OPENBLAS_NUM_THREADS=1
  thread=$(nproc)
  config="${topology}.py"
  args="batch_size=${bs},layer_num=${layer_num}"
  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
  paddle train --job=time \
    --config=$config \
    --use_mkldnn=False \
    --use_gpu=False \
    --trainer_count=$thread \
    --log_period=3 \
    --test_period=30 \
    --config_args=$args \
    2>&1 | tee ${log}
  # The 8th field of the last log line has the form "avg=<value>".
  avg_time=$(tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//')
  fps=$(awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}')
  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }
 # First-run setup: placeholder train list and the log directory.
 [ -f "train.list" ] || echo " " > train.list
 [ -d "logs" ] || mkdir logs
 # training benchmark: every network at every batch size
 for batchsize in 64 128 256; do
  for bench_net in "vgg 19" "resnet 50" "googlenet v1" "alexnet 2"; do
    # Left unquoted on purpose: "<topology> <layer_num>" splits into two args.
    train $bench_net $batchsize
  done
 done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@ -6,10 +6,23 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
 num_samples = get_config_arg('num_samples', int, 2560)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
    'is_infer': is_infer,
    'num_samples': num_samples
 }
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
    "test.list" if is_infer else None,
    module="provider",
    obj="process",
    args=args)
 settings(
    batch_size=batch_size,
@ -98,6 +111,9 @@ elif layer_num == 19:
 else:
    print("Wrong layer number.")
-lab = data_layer('label', num_class)
+if is_infer:
-loss = cross_entropy(input=vgg, label=lab)
+    outputs(vgg)
-outputs(loss)
+else:
    lab = data_layer('label', num_class)
    loss = cross_entropy(input=vgg, label=lab)
    outputs(loss)
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -3,7 +3,7 @@
 # It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
  set(CBLAS_LIBRARIES ${MKLML_LIB})
-  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DPADDLE_WITH_MKLML)
  add_definitions(-DLAPACK_FOUND)
  message(STATUS "Found cblas and lapack in MKLML "
@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
  return()
 endif()
 ## Then find atlas.
 set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
 set(ATLAS_INCLUDE_SEARCH_PATHS
        ${ATLAS_ROOT}/include
        /usr/include
        /usr/include/atlas)
 set(ATLAS_LIB_SEARCH_PATHS
        ${ATLAS_ROOT}/lib
        /usr/lib
        /usr/lib/blas/atlas
        /usr/lib/atlas
        /usr/lib/atlas-base   # special for ubuntu 14.04.
    )
 find_path(ATLAS_INC_DIR NAMES cblas.h
  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
  PATHS ${ATLAS_LIB_SEARCH_PATHS})
 find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
  PATHS ${ATLAS_LIB_SEARCH_PATHS})
 if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
  set(CBLAS_FOUND ON)
  set(CBLAS_PROVIDER ATLAS)
  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
  add_definitions(-DPADDLE_USE_ATLAS)
  add_definitions(-DLAPACK_FOUND)
  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
  return()
 endif()
 ## Then find openblas.
 set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -24,6 +24,11 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 if(WITH_ARM_FP16)
    # Preprocessor switch seen by the sources when WITH_ARM_FP16 is ON.
    add_definitions(-DPADDLE_ARM_FP16)
    # -march is a compiler option, not a preprocessor definition, so it is
    # passed with add_compile_options() rather than add_definitions().
    add_compile_options(-march=armv8.2-a+fp16+simd)
 endif(WITH_ARM_FP16)
 if(WITH_TESTING)
    # Preprocessor switch seen by the sources when WITH_TESTING is ON.
    add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@ -0,0 +1,45 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Nothing to do for mobile inference builds or when WITH_DISTRIBUTE is off.
 if(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
    return()
 endif()
 include(ExternalProject)
 # NOTE: c-ares is needed when linking with grpc.
 set(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
 set(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
 set(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
 include_directories(${CARES_INCLUDE_DIR})
 # Fetch the pinned tag, then configure, build and install a static-only
 # library from inside the source tree.
 ExternalProject_Add(
    extern_cares
    PREFIX          ${CARES_SOURCES_DIR}
    GIT_REPOSITORY  "https://github.com/c-ares/c-ares.git"
    GIT_TAG         "cares-1_13_0"
    UPDATE_COMMAND  ""
    BUILD_IN_SOURCE 1
    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
    BUILD_COMMAND   make -j8
    INSTALL_COMMAND make install
 )
 # Expose the installed archive as an imported target that is ordered after
 # the external build.
 add_library(cares STATIC IMPORTED GLOBAL)
 set_target_properties(cares PROPERTIES
    IMPORTED_LOCATION "${CARES_INSTALL_DIR}/lib/libcares.a")
 add_dependencies(cares extern_cares)
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
    extern_gflags
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(yiwang): The annoying warnings mentioned in
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
-    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
+    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
    # gflags.  I fired a PR https://github.com/gflags/gflags/pull/230
    # to fix it.  Before it gets accepted by the gflags team, we use
    # my personal fork, which contains above fix, temporarily.  Let's
    # change this back to the official Github repo once my PR is
    # merged.
    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
    PREFIX          ${GFLAGS_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
--- a/Show More
+++ b/Show More