Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-default-value

Change the default value of the parameter 'drop_last' in 'paddle.batch' to False.
guochaorong-patch-1
Zhen Wang 7 years ago
commit 7b7a4afa5a
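For context, the semantics being changed can be sketched as a reader decorator in the style of paddle.batch; this is an illustrative reimplementation under that assumption, not the library code itself:

```python
def batch(reader, batch_size, drop_last=False):
    # Group samples from `reader` into lists of `batch_size`.
    def batch_reader():
        buf = []
        for sample in reader():
            buf.append(sample)
            if len(buf) == batch_size:
                yield buf
                buf = []
        # With drop_last=False (the new default), a final short batch
        # is yielded instead of being silently discarded.
        if buf and not drop_last:
            yield buf
    return batch_reader
```

With 10 samples and batch_size=3, the new default yields four batches (3, 3, 3, 1), where the old default of True yielded only three.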

@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
# PY_VERSION
if(NOT PY_VERSION)
set(PY_VERSION 2.7)
endif()
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@ -146,6 +152,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
include(external/libxsmm) # download, build, install libxsmm
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
@ -232,6 +239,10 @@ if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_LIBXSMM)
list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
@ -271,7 +282,3 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
if (WITH_CONTRIB)
add_subdirectory(paddle/contrib)
endif()

@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install opencv-python
#For docstring checker
RUN pip install pylint pytest astroid isort
RUN pip install pylint pytest astroid isort LinkChecker
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt

@ -210,7 +210,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block().clone_variable(var)
v = startup_prog.global_block()._clone_variable(var)
var.persistable = True
v.persistable = True

@ -0,0 +1,57 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
IF(NOT WITH_LIBXSMM)
return()
ENDIF()
IF(WIN32 OR APPLE OR ANDROID OR IOS)
MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
return()
ENDIF()
INCLUDE (ExternalProject)
SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
"${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
ExternalProject_Add(
extern_libxsmm
GIT_REPOSITORY "https://github.com/hfp/libxsmm.git"
GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
PREFIX ${LIBXSMM_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
INSTALL_COMMAND ""
)
ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)

@ -121,6 +121,11 @@ ELSE()
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
IF(WITH_LIBXSMM)
TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
ADD_DEPENDENCIES(cblas extern_libxsmm)
ENDIF()
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)

@ -18,8 +18,9 @@ ENDIF()
INCLUDE(python_module)
FIND_PACKAGE(PythonInterp 2.7)
FIND_PACKAGE(PythonLibs 2.7)
FIND_PACKAGE(PythonInterp ${PY_VERSION})
FIND_PACKAGE(PythonLibs ${PY_VERSION})
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})

@ -138,25 +138,24 @@ copy(memory_lib
set(inference_deps paddle_fluid_shared paddle_fluid)
if(WITH_CONTRIB)
message(STATUS "installing contrib")
set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
if (WITH_ANAKIN AND WITH_GPU)
copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
list(APPEND inference_deps contrib_anakin_inference_lib)
endif()
copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
DSTS ${contrib_dst_dir} ${contrib_dst_dir})
list(APPEND inference_deps contrib_inference_lib)
set(module "inference/api")
if (WITH_ANAKIN AND WITH_GPU)
copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
list(APPEND inference_deps anakin_inference_lib)
endif()
copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
SRCS ${src_dir}/${module}/paddle_inference_api.h
${src_dir}/${module}/demo_ci
${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
)
list(APPEND inference_deps inference_api_lib)
set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*

@ -98,13 +98,13 @@ class Block(objects):
def append_operator(self, ...):
self.ops.append(Operator(self, ...))
def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
def _prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
self.ops.prepend(Operator(self, ...))
```
`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
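A minimal sketch of that intent (hypothetical, simplified code rather than the actual framework):

```python
class Block(object):
    def __init__(self):
        self.ops = []

    def append_operator(self, op):
        self.ops.append(op)

    def _prepend_operator(self, op):
        # Initialize/load ops created by Parameter's ctor land in the
        # preamble, so they run before every other op in the block.
        self.ops.insert(0, op)
```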
### Operator

@ -78,7 +78,7 @@ def error_clip_callback(block, context):
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
fwd_var = block.var_recursive(grad_to_var[grad_n])
fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None or isinstance(error_clip,
BaseErrorClipAttr)):

@ -4,7 +4,6 @@ API
.. toctree::
:maxdepth: 1
overview.rst
model_configs.rst
data.rst
run_logic.rst

@ -35,11 +35,16 @@ PaddlePaddle needs to be built inside a Docker environment, which avoids having to separately install
# 2. Optional: build the Docker image used to compile PaddlePaddle from source
docker build -t paddle:dev .
# 3. Run the following command to build the CPU-only binaries
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Alternatively, use the image built in the optional step above (step 2 must be run first)
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
Note: the command above maps the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
Note:
- The commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
- If you compile with a manylinux image, you need to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__ via the environment variable :code:`PYTHON_ABI`.
The Python ABIs currently supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu`.
After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine.

@ -36,13 +36,18 @@ If you don't wish to use docker, you need to install several compile dependencies
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build a CPU-Only binaries
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command tries to mount the current working directory (root directory of source code)
NOTE:
- The above command tries to mount the current working directory (root directory of source code)
into the :code:`/paddle` directory inside the docker container.
- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI <https://www.python.org/dev/peps/pep-0425/#id8>`__.
Currently, the Python ABIs supported by PaddlePaddle include :code:`cp27-cp27m` and :code:`cp27-cp27mu`.
When the compilation finishes, you can find the output whl package under
build/python/dist; you can then install the whl on the local
machine or copy it to the target machine.

@ -1,16 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
add_subdirectory(inference)

@ -118,7 +118,7 @@ class Float16Transpiler:
for var in self.block.vars.keys():
if var not in args:
self.block.remove_var(var)
self.block._remove_var(var)
def _modify_feed_fetch(self):
'''
@ -165,7 +165,7 @@ class Float16Transpiler:
dtype=core.VarDesc.VarType.FP16,
shape=var.shape,
persistable=var.persistable)
self.block.insert_op(
self.block._insert_op(
i + 1,
type="cast",
inputs={"X": var},
@ -188,7 +188,7 @@ class Float16Transpiler:
persistable=var.persistable)
find_op(var)
var.op.rename_output(var_name, tmp_var_name)
self.block.insert_op(
self.block._insert_op(
i,
type="cast",
inputs={"X": tmp_var},
@ -253,4 +253,4 @@ class Float16Transpiler:
# old var will be replaced by the fp16 var in program desc
self.input_map[var.name] = fp16_var_name
self.block.remove_var(var.name)
self.block._remove_var(var.name)

@ -1,61 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
inference_api_test(simple_on_word2vec ARGS test_word2vec)
option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
if(NOT WITH_INFERENCE_DEMO)
return()
endif()
set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
function(inference_download_test_demo TARGET)
if (NOT WITH_TESTING)
return()
endif()
set(options "")
set(oneValueArgs URL)
set(multiValueArgs SRCS)
cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
message(STATUS "inference demo ${test_dir}")
if(NOT EXISTS "${test_dir}")
message(STATUS "Download ${TARGET} model from ${tests_URL}")
execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
endif()
cc_test(${TARGET} SRCS "${tests_SRCS}"
DEPS paddle_inference_api paddle_fluid
ARGS --data=${test_dir}/data.txt
--modeldir=${test_dir}/model
--refer=${test_dir}/result.txt)
endfunction()
# disable mobilenet test
#inference_download_test_demo(mobilenet_inference_demo
# SRCS vis_demo.cc
# URL ${URL_ROOT}mobilenet.tar.gz)
inference_download_test_demo(se_resnext50_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}se_resnext50.tar.gz)
inference_download_test_demo(ocr_inference_demo
SRCS vis_demo.cc
URL ${URL_ROOT}ocr.tar.gz)

@ -1,36 +0,0 @@
# Inference Demos
Input data format:
- Each line contains a single record
- Each record's format is
```
<space separated floats as data>\t<space separated ints as shape>
```
Follow the C++ codes in `vis_demo.cc`.
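For illustration, one record in this format could be produced with a small hypothetical helper:

```python
def make_record(data, shape):
    # "<space separated floats>\t<space separated ints>"
    floats = " ".join(str(float(v)) for v in data)
    ints = " ".join(str(int(d)) for d in shape)
    return floats + "\t" + ints

# A 2x2 tensor, flattened row-major:
print(make_record([0.1, 0.2, 0.3, 0.4], [2, 2]))
# -> "0.1 0.2 0.3 0.4\t2 2"
```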
## MobileNet
To execute the demo, simply run
```sh
./mobilenet_inference_demo --modeldir <model> --data <datafile>
```
## SE-ResNeXt-50
To execute the demo, simply run
```sh
./se_resnext50_inference_demo --modeldir <model> --data <datafile>
```
## OCR
To execute the demo, simply run
```sh
./ocr_inference_demo --modeldir <model> --data <datafile>
```

File diff suppressed because it is too large.

@ -276,13 +276,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
}
// Insert BCast Ops
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(&result, bcast_name, dev_id);
bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr;
#endif
if (use_gpu ||
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
// Insert BCast Ops
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
CreateBroadcastOp(&result, bcast_name, dev_id);
}
}
}
/*
Dependency graph has been constructed. However, there are still data
hazards that need to be handled.
@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
}
for (auto &varname : op.InputArgumentNames()) {
int dev_id = GetVarDeviceID(varname);
if (dev_id != -1) {
return dev_id;
}
int op_role = boost::get<int>(
op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
return -1;
}
return -1;
auto param_grad = boost::get<std::vector<std::string>>(
op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1]);
PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(),
param_grad[0]);
return dev_id;
}
int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {

@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
}
}
std::vector<framework::LoDTensor> fetch_data;
std::exception_ptr eptr;
try {
fetch_data = underlying_executor_->Run(fetch_tensors);
} catch (...) {
eptr = std::current_exception();
}
auto fetch_data = underlying_executor_->Run(fetch_tensors);
drop_scope_counter_ += 1;
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
scope->DeleteScope(local_scope);
}
}
return fetch_data;
if (eptr) {
std::rethrow_exception(eptr);
} else {
return fetch_data;
}
}
} // namespace details
} // namespace framework
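The change above captures any exception thrown by the underlying executor, finishes the scope bookkeeping, and only then rethrows. The same capture-cleanup-rethrow pattern, sketched in Python for illustration:

```python
def run_with_cleanup(run, cleanup):
    error = None
    try:
        result = run()       # may raise, like underlying_executor_->Run
    except Exception as e:
        error = e            # hold it, as with std::exception_ptr
        result = None
    cleanup()                # always bump counters / delete local scopes
    if error is not None:
        raise error          # rethrow after cleanup
    return result
```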

@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
set.clear();
};
// Clean run context
run_op_futures_.clear();
exception_.reset();
// Step 3. Execution
while (!pending_vars.empty()) {
// 1. Run All Ready ops
@ -96,16 +100,19 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
std::lock_guard<std::mutex> l(exception_mu_);
std::unique_lock<std::mutex> l(exception_mu_);
if (exception_) {
l.unlock();
for (auto &run_op_future : run_op_futures_) {
run_op_future.wait();
}
l.lock();
std::exception *exp = exception_.get();
if (dynamic_cast<platform::EOFException *>(exp)) {
auto e = *static_cast<platform::EOFException *>(exp);
exception_.reset();
throw e;
} else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
auto e = *static_cast<platform::EnforceNotMet *>(exp);
exception_.reset();
throw e;
} else {
LOG(FATAL) << "Unknown exception.";
@ -222,7 +229,7 @@ void ThreadedSSAGraphExecutor::RunOp(
}
};
if (pool_) {
pool_->enqueue(op_run);
run_op_futures_.emplace_back(pool_->enqueue(op_run));
} else {
op_run();
}
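Keeping the futures of enqueued ops lets the executor wait for all in-flight work before propagating a failure, so no worker still touches shared state while the caller unwinds. A rough Python analogue of that idea (illustrative only, not the Paddle API):

```python
from concurrent.futures import ThreadPoolExecutor

def run_ops(ops, workers=4):
    with ThreadPoolExecutor(workers) as pool:
        futures = [pool.submit(op) for op in ops]  # like run_op_futures_
        error = None
        for f in futures:
            try:
                f.result()           # blocks; re-raises the op's exception
            except Exception as e:
                error = error or e   # remember the first failure
        # Every future has settled here, so it is safe to propagate.
        if error is not None:
            raise error
```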

@ -15,6 +15,7 @@
#pragma once
#include <deque>
#include <list>
#include <string>
#include <unordered_set>
#include <utility>
@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
private:
ExecutionStrategy strategy_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
};
} // namespace details

@ -45,6 +45,7 @@ class ParallelExecutorPrivate {
#endif
bool own_local_scope_;
bool use_cuda_;
bool use_all_reduce_;
};
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor(
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
member_->use_cuda_ = exec_strategy.use_cuda_;
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1,
"If you set build_strategy.reduce with 'Reduce',"
"the number of places must be greater than 1.");
}
// Step 1. Bcast the params to devs.
// Create local scopes
@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor(
}
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
BCastParamsToGPUs(bcast_vars);
BCastParamsToDevices(bcast_vars);
}
// Startup Program has been run. All local scopes has correct parameters.
@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor(
#ifdef PADDLE_WITH_CUDA
builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
#else
PADDLE_THROW("Not compiled with CUDA");
PADDLE_THROW("Not compiled with CUDA.");
#endif
}
@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor(
member_->places_, std::move(member_->executor_)));
}
void ParallelExecutor::BCastParamsToGPUs(
void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const {
// the the initializing bcast, all vars would be bcast from device(0),
// the initializing bcast, all vars would be bcast from device(0),
// otherwise
// bcast from the specified device.
bool initializing = builder_.get() == nullptr ? true : false;
@ -202,12 +211,23 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
} else {
platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
if ((initializing && i == 0) ||
(!initializing && static_cast<int>(i) == var_dev_id))
continue;
auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
// FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
if (member_->use_all_reduce_ || member_->use_cuda_ ||
var == "@LR_DECAY_COUNTER@") {
t->Resize(dims);
t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t);
} else {
t->ShareDataWith(main_tensor);
}
}
}
}

@ -66,7 +66,7 @@ class ParallelExecutor {
void Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name);
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
private:
ParallelExecutorPrivate *member_;

@ -29,11 +29,11 @@ enum ReaderStatus { kRunning, kStopped };
class ReaderBase {
public:
void ReadNext(std::vector<LoDTensor>* out);
virtual void ReadNext(std::vector<LoDTensor>* out);
void Shutdown();
virtual void Shutdown();
void Start();
virtual void Start();
// Return the readers which are at the end of the decorating chain. Basically
// they are the readers just before the read op.
@ -42,7 +42,7 @@ class ReaderBase {
virtual ~ReaderBase();
protected:
virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
virtual void ReadNextImpl(std::vector<LoDTensor>* out) {}
virtual void ShutdownImpl() {}

@ -1,4 +1,11 @@
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor )
# analysis and tensorrt must be added before creating the static library;
# otherwise, there would be undefined references to them in the static library.
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
@ -7,12 +14,14 @@ cc_library(paddle_fluid_api
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
if(WITH_CONTRIB)
set(fluid_modules "${fluid_modules}" paddle_inference_api)
endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
if(NOT APPLE)
# TODO(liuyiqu): Temporarily disable the link flag because it is not supported on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
@ -29,9 +38,4 @@ if(WITH_TESTING)
# both tests/book and analysis depend on the models generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
endif()
add_subdirectory(analysis)
if (TENSORRT_FOUND)
add_subdirectory(tensorrt)
endif()
add_subdirectory(api)

Some files were not shown because too many files have changed in this diff.
