Merge remote-tracking branch 'ups/develop' into refine/set_num_threads

fix conflicts
7 years ago · 3df99e72ab
parent e3a96300bb 4ed0b62476
commit 3df99e72ab
1034 changed files with 6151 additions and 2344 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -61,8 +61,11 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
+option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
+option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@ -131,6 +134,10 @@ if (NOT DEFINED WITH_MKLDNN)
        set(WITH_MKLDNN OFF)
    endif()
 endif()
+
+if (REPLACE_ENFORCE_GLOG)
+  add_definitions("-DREPLACE_ENFORCE_GLOG")
+endif()
 ########################################################################################

 include(external/mklml)     # download mklml package
@ -153,12 +160,24 @@ include(external/cares)
 if(WITH_DISTRIBUTE)
    if(WITH_GRPC)
        include(external/grpc)
+        message(STATUS "Use grpc framework.")
    else()
+        message(STATUS "Use brpc framework.")
        include(external/leveldb)
        include(external/brpc)
    endif()
 endif()

+if(WITH_BRPC_RDMA)
+    message(STATUS "Use brpc with rdma.")
+    if(WITH_GRPC)
+        message(FATAL_ERROR "Can't use grpc with brpc rdma.")
+    endif()
+    if(NOT WITH_DISTRIBUTE)
+        message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
+    endif()
+endif()
+
 include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
@ -178,7 +197,7 @@ include(inference_lib)      # add paddle fluid inference libraries


 include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")

@ -222,7 +241,7 @@ add_subdirectory(proto)
 if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
    # "add_subdirectory(go)" should be placed after the following loine,
    # because it depends on paddle/optimizer.
-    add_subdirectory(paddle/optimizer)
+    add_subdirectory(paddle/legacy/optimizer)
 endif()

 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -159,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the
 - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
 - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
 - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
--- a/README.md
+++ b/README.md
@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)

@ -19,6 +18,8 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.

+### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
 ## Features

 - **Flexibility**
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@ -122,5 +122,13 @@ def parse_args():
        type=str,
        default="",
        help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--use_inference_transpiler',
+        action='store_true',
+        help='If set, use inference transpiler to optimize the program.')
+    parser.add_argument(
+        '--no_random',
+        action='store_true',
+        help='If set, keep the random seed and do not shuffle the data.')
    args = parser.parse_args()
    return args
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@ -131,6 +131,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
    exe = fluid.Executor(place)
    exe.run(startup_prog)

+    # Use inference_transpiler to speedup
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.itervalues()
@ -181,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
        # evaluation
        if not args.no_test and batch_acc and not args.use_reader_op:
+            if args.use_inference_transpiler:
+                t = fluid.InferenceTranspiler()
+                t.transpile(infer_prog, place)
+
            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                 batch_acc)
            print(", Test Accuracy: %f" % pass_test_acc)
@ -311,6 +316,8 @@ def main():
    args = parse_args()
    print_arguments(args)
    print_paddle_envs()
+    if args.no_random:
+        fluid.default_startup_program().random_seed = 1

    # the unique trainer id, starting from 0, needed by trainer
    # only
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@ -197,12 +197,12 @@ def get_model(args):
    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)

    batched_train_reader = paddle.batch(
-        paddle.reader.shuffle(
+        train_reader if args.no_random else paddle.reader.shuffle(
            train_reader, buf_size=5120),
        batch_size=args.batch_size * args.gpus,
        drop_last=True)
    batched_test_reader = paddle.batch(
-        train_reader, batch_size=args.batch_size, drop_last=True)
+        test_reader, batch_size=args.batch_size, drop_last=True)

    return avg_cost, inference_program, optimizer, batched_train_reader,\
                   batched_test_reader, batch_acc
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@ -83,18 +83,20 @@ else()
  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
 endif()

-find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
+if(WITH_SYSTEM_BLAS)
+  find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
+  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})

-if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER REFERENCE)
-  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
-  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
-  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+    set(CBLAS_FOUND ON)
+    set(CBLAS_PROVIDER REFERENCE)
+    set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
+    set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+    add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+    message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  endif()
 endif()

 if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@ -174,3 +174,7 @@ endif(WITH_GOLANG)
 if(WITH_GRPC)
    add_definitions(-DPADDLE_WITH_GRPC)
 endif(WITH_GRPC)
+
+if(WITH_BRPC_RDMA)
+    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
+endif(WITH_BRPC_RDMA)
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@ -26,13 +26,15 @@ function(fetch_include_recursively root_dir)
    endforeach()
 endfunction()

-# download library
-message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
-execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
+    # download library
+    message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+endif()

 if (WITH_ANAKIN)
    message(STATUS "Anakin for inference is enabled")
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@ -14,6 +14,15 @@

 INCLUDE(ExternalProject)

+find_library(SSL_LIBRARY NAMES ssl)
+ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+
+find_library(CRYPTO_LIBRARY NAMES crypto)
+ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
+
+
 SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
 SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
 SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})

 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")

 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
    extern_brpc
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/brpc/brpc"
-    GIT_TAG         "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2"
+    GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
+    GIT_TAG         "7dc04defad1fd4173aae170c3fcbde131b65155a"
    PREFIX          ${BRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@ -42,6 +51,8 @@ ExternalProject_Add(
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    -DCMAKE_PREFIX_PATH=${prefix_path}
                    -DBRPC_WITH_GLOG=ON
+                    -DIOBUF_WITH_HUGE_BLOCK=ON
+                    -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
                    ${EXTERNAL_OPTIONAL_ARGS}
    LIST_SEPARATOR |
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
@ -49,7 +60,7 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
 ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)

+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+# for building inference libs
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
  set(libs ${ARGN})
  list(REMOVE_DUPLICATES libs)
@ -243,13 +257,14 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    if (${cc_test_SERIAL})
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    endif()
  endif()
 endfunction(cc_test)
@ -309,11 +324,12 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
    endif()
  endif()
 endfunction(nv_test)
@ -561,7 +577,7 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
 # make package for paddle fluid shared and static library
 function(copy TARGET)
    set(options "")
@ -149,21 +136,33 @@ copy(memory_lib
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
 )

-set(module "inference")
-copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
-)
+set(inference_deps paddle_fluid_shared paddle_fluid)

 if(WITH_CONTRIB)
-   set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
-   copy(contrib_inference_lib DEPS paddle_inference_api
+    message(STATUS "installing contrib")
+    set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+    if (WITH_ANAKIN AND WITH_GPU)
+        copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+            SRCS
+            ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
+            ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
+            DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
+        list(APPEND inference_deps contrib_anakin_inference_lib)
+   endif()
+
+  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
-        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
-        DSTS ${contrib_dst_dir} ${contrib_dst_dir}
-   )
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
+        DSTS ${contrib_dst_dir} ${contrib_dst_dir})
+  list(APPEND inference_deps contrib_inference_lib)
 endif()

+set(module "inference")
+copy(inference_lib DEPS ${inference_deps}
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+)
+
 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@ -1468,6 +1468,14 @@ argmax
 ..  autofunction:: paddle.fluid.layers.argmax
    :noindex:

+.. _api_fluid_layers_argsort:
+
+argsort
+-------
+
+..  autofunction:: paddle.fluid.layers.argsort
+    :noindex:
+
 .. _api_fluid_layers_ones:

 ones
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows:

 ## Slicing of LoD Tensors

+
 When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.

 For example, the <2>-slice of above example is
@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is
 10  12
  ||
 ```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. 
+Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(lod)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted 
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
--- a/doc/fluid/design/dist_train/dist_train_nccl2.md
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@ -0,0 +1,35 @@
+# Distributed Training with NCCL2
+
+We design a pattern that can enable training with `ParallelExecutor` and
+using [NCCL2](https://developer.nvidia.com/nccl) as it's collective
+communication library.
+
+In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
+to do multi GPU training. And if we initialize NCCL2 communicators as
+ranks in a distributed environment, we can simply run the `ParallelExecutor`
+as a distributed program! The only thing that may be different than in
+the single node version is that we need to broadcast the NCCL unique ID
+to all the nodes, and initialize communicators using that ID, so NCCL2
+will know each other as ranks.
+
+To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
+so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in
+what ever platform you like.
+
+It have two running modes:
+
+1. Generate and broadcast mode, which should be used on trainer 0;
+1. Listen and fetch mode, which should be used on trainers other than 0.
+
+In both two modes, this op can save the NCCL ID into current scope as a
+persistable variable, Then we can insert this op at the end of
+"startup program" of fluid, so that all workers can get the same ID to
+initialize NCCL communicator objects.
+
+<img src="src/ncc2_design.png">
+
+The above figure indicates the general process when training with NCCL2
+distributed. Each trainer have the number of communicators equal to the
+number of GPUs, but the ranks should match the global ranks number: here
+we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should
+be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@ -119,6 +119,32 @@ optimization algorithm $f$ runs on the storage service.
 - Con: the storage service needs to be able to run the optimization
  algorithm.

+## Distributed Sparse Table in Fluid
+
+For another design, we can implement a distributed sparse table in Fluid,
+and don't need to maintain an external storage component while training.
+
+You may need to read Fluid [Distributed Training Architecture](./distributed_architecture.md)
+and [Parameter Server](./parameter_server.md) before going on.
+
+![fluid lookup remote table](./src/fluid_lookup_remote_table.png)
+
+Partition a large table into multiple pserver instances
+1. `DistributeTranspiler` would split the table partitioned into some small
+table blocks with some partitioned algorithms such as
+[RoundRobin](https://en.wikipedia.org/wiki/Round-robin_scheduling),
+[Hash](https://en.wikipedia.org/wiki/Hash) and etc...
+1. For some cases, the range of input `Ids` is very wide and unpredictable, so the sparse
+table would be able to fill a new value for the id that didn't appear before with
+zero, uniform random or Gaussian distribution.
+
+For each Trainer's training process:
+1. In the forward pass, we use `pre-fetch` op to pre-fetch parameter blocks according to the
+input `Ids` from PServers instead of the local `lookup_table` op, and then merge the blocks
+into a parameter `W`.
+1. Compute `GRAD@W'` in the backward pass using the pre-fetched `W` and send it to PServer to
+execute the optimize pass.
+
 ## Conclusion

 Let us do the "storage service does not optimize" solution first, as a
--- a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
+++ b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
--- a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
+++ b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
--- a/doc/fluid/design/dist_train/src/ncc2_design.graffle
+++ b/doc/fluid/design/dist_train/src/ncc2_design.graffle
--- a/doc/fluid/design/dist_train/src/ncc2_design.png
+++ b/doc/fluid/design/dist_train/src/ncc2_design.png
--- a/doc/fluid/design/multi_devices/kernel_selection.md
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@ -74,10 +74,10 @@ void OperatorWithKernel::Run(
    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
      auto* trans_var = new_scope.Var(var_name);
-      auto* out = DataTransform(expected_kernel_key,
+      auto* out = TransformData(expected_kernel_key,
                                kernel_type_for_var,
                                *tensor_in);
-      CopyVariableWithTensor(...);
+      SetTensorToVariable(...);
    }
  }

--- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@ -1,4 +1,4 @@
-## 堆内存分析和优化
+# 堆内存分析和优化

 计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。

@ -20,11 +20,11 @@ Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/P

 对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。

-## 使用流程
-#### 环境
+## 环境
+
 本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。

-#### 使用流程
+## 使用流程

 - 安装google-perftools

--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@ -0,0 +1,26 @@
+# 如何使用timeline工具做性能分析
+
+1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+	**提示：**
+	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
+
+	```python
+	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+	    for pass_id in range(pass_num):
+	        for batch_id, data in enumerate(train_reader()):
+	            exe.run(fluid.default_main_program(),
+	                    feed=feeder.feed(data),
+	                    fetch_list=[])
+	            ...
+	```
+
+1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+
+1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
+
+	![chrome tracing](./tracing.jpeg)
+
+1. 结果如下图所示，可以放到来查看timetime的细节信息。
+
+	![chrome timeline](./timeline.jpeg)
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
--- a/doc/v2/design/cluster_train/large_model_dist_train.md
+++ b/doc/v2/design/cluster_train/large_model_dist_train.md
@ -52,7 +52,7 @@ In `trainer_internal.cpp:L93 trainOneBatch`:

 When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.

-In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:

 ```c++
 if (fullSize) {
--- a/Show More
+++ b/Show More