Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into detection_map

8 years ago · a0b57ac781
parent a606775f7b b1869f1695
commit a0b57ac781
330 changed files with 12057 additions and 2354 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -39,7 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
@ -156,6 +156,7 @@ include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
+include(inference_lib)      # add paddle fluid inference libraries


 include_directories("${PADDLE_SOURCE_DIR}")
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@ -0,0 +1,18 @@
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+RUN apt-get update && apt-get install -y python
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@ -0,0 +1,76 @@
+# Performance for Distributed vgg16
+
+## Test Result
+
+### Hardware Infomation
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz		: 2101.000
+- cache size	: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Accelerate Rate
+
+- Pserver Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+
+### Different Pserver Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples/ sec
+
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fuild and v2 comes from the network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable Verbos Logs
+
+Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_fluid"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16v2job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16v2job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16v2job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16v2job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "256"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "2"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@ -0,0 +1,154 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS"))
+ts = 0
+
+
+def vgg(input, nums, class_dim):
+    def conv_block(input, num_filter, groups, num_channels=None):
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_type=paddle.pooling.Max())
+
+    assert len(nums) == 5
+    # the channel of input feature is 3
+    conv1 = conv_block(input, 64, nums[0], 3)
+    conv2 = conv_block(conv1, 128, nums[1])
+    conv3 = conv_block(conv2, 256, nums[2])
+    conv4 = conv_block(conv3, 512, nums[3])
+    conv5 = conv_block(conv4, 512, nums[4])
+
+    fc_dim = 512
+    fc1 = paddle.layer.fc(input=conv5,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=fc1,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    out = paddle.layer.fc(input=fc2,
+                          size=class_dim,
+                          act=paddle.activation.Softmax())
+    return out
+
+
+def vgg13(input, class_dim):
+    nums = [2, 2, 2, 2, 2]
+    return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+    nums = [2, 2, 3, 3, 3]
+    return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+    nums = [2, 2, 4, 4, 4]
+    return vgg(input, nums, class_dim)
+
+
+def main():
+    global ts
+    paddle.init(use_gpu=False)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+    extra_layers = None
+    # NOTE: for v2 distributed training need averaging updates.
+    learning_rate = 1e-3 / NODE_COUNT
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create optimizer
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+                                                         BATCH_SIZE),
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp", )
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar.train10(),
+            # To use other data, replace the above line with:
+            # reader.train_reader('train.list'),
+            buf_size=1000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        cifar.test10(),
+        # To use other data, replace the above line with:
+        # reader.test_reader('val.list'),
+        batch_size=BATCH_SIZE)
+
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+
+    # End batch and end pass event handler
+    def event_handler(event):
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    time.time() - ts)
+        if isinstance(event, paddle.event.EndPass):
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@ -21,6 +21,7 @@ set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOO
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)

 include_directories(${BOOST_INCLUDE_DIR})

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@ -28,9 +28,3 @@ endif()
 add_dependencies(eigen3 extern_eigen3)

 LIST(APPEND external_project_dependencies eigen3)
-
-IF(NOT WITH_C_API AND WITH_FLUID)
-    INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
-    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
-ENDIF()
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)

 LIST(APPEND external_project_dependencies gflags)

-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
  INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
  IF(ANDROID)
    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)

 LIST(APPEND external_project_dependencies glog)

-IF(WITH_C_API OR WITH_FLUID)
+IF(WITH_C_API)
  INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
  IF(ANDROID)
    INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
    SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
        CACHE FILEPATH "protoc library." FORCE)

-    IF(WITH_C_API OR WITH_FLUID)
+    IF(WITH_C_API)
        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
        IF(ANDROID)
            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@ -52,6 +52,7 @@ ExternalProject_Add(
                    -DWITH_TORCH=OFF
                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                    -DBUILD_SHARED=ON
+                    -DBUILD_TESTS=OFF
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    ${EXTERNAL_OPTIONAL_ARGS}
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
    endif()
    if (cc_library_DEPS)
+      # Don't need link libwarpctc.so
+      if ("${cc_library_DEPS};" MATCHES "warpctc;")
+        list(REMOVE_ITEM cc_library_DEPS warpctc)
+        add_dependencies(${TARGET_NAME} warpctc)
+      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
  if(WITH_TESTING)
    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)

@ -457,12 +468,12 @@ endfunction()

 function(py_test TARGET_NAME)
  if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
+    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -0,0 +1,90 @@
+# make package for paddle fluid shared and static library
+function(copy TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DSTS DEPS)
+    cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
+    list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
+    if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
+        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
+    endif()
+    math(EXPR len "${copy_lib_SRCS_len} - 1")
+    
+    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
+    foreach(index RANGE ${len})
+        list(GET copy_lib_SRCS ${index} src)
+        list(GET copy_lib_DSTS ${index} dst)
+        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
+        if(IS_DIRECTORY ${src})
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
+        else()
+            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
+        endif()
+    endforeach()
+endfunction()
+
+# third party
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+copy(eigen3_lib
+  SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
+  DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+copy(gflags_lib
+  SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+copy(glog_lib
+  SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+)
+
+IF(NOT PROTOBUF_FOUND)
+    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    copy(protobuf_lib
+      SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+      DSTS ${dst_dir} ${dst_dir}/lib
+    )
+ENDIF(NOT PROTOBUF_FOUND)
+
+# paddle fluid module
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(module "framework")
+copy(framework_lib DEPS framework_py_proto 
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+
+set(module "memory")
+copy(memory_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+)
+
+set(module "inference")
+copy(inference_lib DEPENDS paddle_fluid_shared
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
+set(module "platform")
+copy(platform_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
+)
+
+set(module "string")
+copy(string_lib
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
+)
+
+add_custom_target(inference_lib_dist DEPENDS 
+  inference_lib framework_lib memory_lib platform_lib string_lib
+  gflags_lib glog_lib protobuf_lib eigen3_lib)
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                  ${SPHINX_CACHE_DIR_CN}
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
--- a/doc/api/CMakeLists.txt
+++ b/doc/api/CMakeLists.txt
@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
    :noindex:

+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
+
 Norm Layer
 ==========

@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
    :noindex:

+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+    
 Recurrent Layer Group
 =====================

@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:

+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+    
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
    :noindex:

+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
+
 Sampling Layers
 ===============

@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
    :noindex:

-Factorization Machine Layer
-============================
-
-factorization_machine
---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
-
 ..  _api_v2.layer_costs:

 Cost Layers
@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
    :noindex:

+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
+    
 Check Layer
 ============

@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
    :noindex:

-Miscs
-=====
-
-dropout
--------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========

 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
-
-gated_unit
-----------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-
-Detection output Layer
-======================
-
-detection_output
----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
+
+wmt16
+++++
+
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
@ -1,9 +1,14 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-DataFeeder
+data_feeder
 ===========

 DataFeeder
-----------
-..  automodule:: paddle.v2.fluid.data_feeder
-    :members: DataFeeder
+----------
+
+..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+    :members:
    :noindex:
+
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
@ -1,9 +1,21 @@
-===========
-Evaluator
-===========
-
-Evaluator
-----------
-..  automodule:: paddle.v2.fluid.evaluator
-    :members: Evaluator
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=========
+evaluator
+=========
+
+Accuracy
+--------
+
+..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+    :members:
    :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+    :members:
+    :noindex:
+
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
@ -1,9 +1,32 @@
-===========
-Executor
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+executor
+========

 Executor
+--------
+
+..  autoclass:: paddle.v2.fluid.executor.Executor
+    :members:
+    :noindex:
+
+global_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.global_scope
+    :noindex:
+
+scope_guard
 -----------
-..  automodule:: paddle.v2.fluid.executor
-    :members: Executor
+
+..  autofunction:: paddle.v2.fluid.executor.scope_guard
+    :noindex:
+
+switch_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.switch_scope
    :noindex:
+
--- a/doc/api/v2/fluid/gen_doc.py
+++ b/doc/api/v2/fluid/gen_doc.py
@ -0,0 +1,109 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.v2.fluid as fluid
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--submodules', nargs="*")
+    parser.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return parser.parse_args()
+
+
+class DocGenerator(object):
+    def __init__(self, module_name, stream=sys.stdout):
+        self.stream = stream
+        self.module_name = module_name
+        if not hasattr(fluid, module_name):
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        else:
+            self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+        self._print_header_(module_name, dot='=', is_title=True)
+
+    def print_submodule(self, submodule_name):
+        submodule = getattr(self.module, submodule_name)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+
+        for item in submodule.__all__:
+            self.print_item(item)
+
+    def print_current_module(self):
+        for item in self.module.__all__:
+            self.print_item(item)
+
+    def print_section(self, name):
+        self._print_header_(name, dot='=', is_title=False)
+
+    def print_item(self, name):
+        item = getattr(self.module, name)
+        if isinstance(item, types.TypeType):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            raise RuntimeError("Unsupported item {0}".format(name))
+
+    def print_class(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+    :members:
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def print_method(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def _print_header_(self, name, dot, is_title):
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+
+
+def main():
+    args = parse_arg()
+    gen = DocGenerator(args.module)
+    if args.submodules is None:
+        gen.print_current_module()
+    else:
+        for submodule_name in args.submodules:
+            gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+    main()
--- a/Show More
+++ b/Show More