Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into data_reader

commit b858c10319 by weixing02

@@ -1,6 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR

@@ -62,9 +62,10 @@ endif()
 ## Then find the reference-cblas. www.netlib.org/blas/
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
+if(NOT CMAKE_CROSSCOMPILING)
 set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
   ${REFERENCE_CBLAS_ROOT}/include
   /usr/include
@@ -77,6 +78,11 @@ set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
   /usr/lib/blas/reference/
   /usr/lib/reference/
 )
+else()
+  # Disable the finding of reference cblas under host's system path
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()

 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
   ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})

@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()

 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.11.x"
     PREFIX ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""

@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#

-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
   return()
-ENDIF()
+endif()

 include (ExternalProject)

 # NOTE: snappy is needed when linking with recordio

-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")

 ExternalProject_Add(
     extern_snappy
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )

 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
-    "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})

 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)

@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#

-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
   return()
 ENDIF()
@@ -21,9 +20,11 @@ include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio

-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")

 ExternalProject_Add(
     extern_snappystream
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )

 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
-    "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})

 include_directories(${SNAPPYSTREAM_INCLUDE_DIR})  # For snappysteam to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install)  # For Paddle to include snappy stream headers.

@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
       list(REMOVE_ITEM cc_library_DEPS warpctc)
       add_dependencies(${TARGET_NAME} warpctc)
     endif()
-    if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
-      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-      # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-    else()
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-    endif()
+    target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
   endif()
@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}

@@ -1,7 +1,22 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # find all fluid modules is used for paddle fluid static library
 function(find_fluid_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
   string(FIND "${__target_path}" "fluid" pos)
   if(pos GREATER 1)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
   )
 endif()

+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+endif()
+
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")

@@ -119,7 +119,7 @@ An actual Fluid example is described [here](https://github.com/PaddlePaddle/Pad
 From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.

-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.

 ## Turing Completeness

@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()

 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+if(NOT MOBILE_INFERENCE AND NOT RPI)
   add_subdirectory(fluid)
 endif()

@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)

@@ -92,7 +92,7 @@ class BlockDesc {
   /*
    * Remove Op and its input/output variables.
-   * Note that for either input or ouput variable, if it is also an input or
+   * Note that for either input or output variable, if it is also an input or
    * output variable of other ops, we should remain it.
    */
   void RemoveOp(size_t s, size_t e);

@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"

+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
     }
   }

-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }

 std::string ComputationOpHandle::Name() const { return op_->Type(); }

@@ -14,6 +14,9 @@
 #include "paddle/fluid/framework/details/fetch_op_handle.h"

+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);

@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {

+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
 class OpHandleBase {
  private:
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);

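The constant above gives every op handle a well-known key under which the per-run local scope is published inside its parent scope. A minimal standalone sketch of that indirection, with toy types standing in for Paddle's Scope and Variable (illustration only, not the real API; the corrected spelling "@LOCAL_SCOPE@" is used here):

    #include <cassert>
    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    // Toy scope: named slots, each holding an opaque pointer.
    struct Scope {
      std::map<std::string, void *> vars;
      std::vector<std::unique_ptr<Scope>> kids;
      Scope *NewScope() {
        kids.emplace_back(new Scope);
        return kids.back().get();
      }
    };

    constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@";

    int main() {
      Scope outer;
      // Publish the per-run scope under the well-known key...
      Scope *local = outer.NewScope();
      outer.vars[kLocalExecScopeName] = local;
      // ...so any op holding only `outer` can reach the current run's scope.
      auto *run_scope = static_cast<Scope *>(outer.vars[kLocalExecScopeName]);
      assert(run_scope == local);
    }
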
@@ -15,13 +15,15 @@
 #pragma once

 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"

 namespace paddle {
 namespace framework {
 namespace details {

 class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);

@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     ready_ops.clear();
   };

-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
-
   // Step 3. Execution
   while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   PADDLE_ENFORCE(ready_ops.empty());
   PADDLE_ENFORCE(delayed_ops.empty());
   PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };

   // Wait FetchOps.
   if (!fetch_ops.empty()) {
     fetch_ops.clear();
-    sync_computation();
-  }
-
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
   }

   return fetch_data;

@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
   bool allow_op_delay_;
-
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };

 }  // namespace details

@@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name,
   if (tensor.memory_size() == 0) {
     return;
   }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&
-      tensor.type().hash_code() != typeid(double).hash_code()) {
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
+      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
     return;
   }
   PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
@@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 // Return true if the block has feed operators and holder of matching info.
 static bool has_feed_operators(
     const BlockDesc& block,
-    std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::map<std::string, const LoDTensor*>& feed_targets,
     const std::string& feed_holder_name) {
   size_t feed_count = 0;
   for (auto* op : block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_count++;
+      // The input variable's name of feed_op should be feed_holder_name.
       PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
                         "Input to feed op should be '%s'", feed_holder_name);
       std::string feed_target_name = op->Output("Out")[0];
@@ -166,7 +167,8 @@ static bool has_feed_operators(
       feed_count, feed_targets.size(),
       "The number of feed operators should match 'feed_targets'");

-  // When feed operator are present, so should be feed_holder
+  if (!feed_holder_name.empty()) {
+    // When feed operator are present, so should be feed_holder.
   auto var = block.FindVar(feed_holder_name);
   PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
                           feed_holder_name);
@@ -174,6 +176,7 @@ static bool has_feed_operators(
                  "'%s' variable should be 'FEED_MINIBATCH' type",
                  feed_holder_name);
   }
+  }

   return feed_count > 0;
 }
@@ -185,12 +188,14 @@ static bool has_feed_operators(
 // and fetch_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
-    const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block,
+    const std::map<std::string, LoDTensor*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
   for (auto* op : block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_count++;
+      // The output variable's name of fetch_op should be fetch_holder_name.
       PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
                         "Output of fetch op should be '%s'", fetch_holder_name);
       std::string fetch_target_name = op->Input("X")[0];
@@ -206,7 +211,8 @@ static bool has_fetch_operators(
       fetch_count, fetch_targets.size(),
       "The number of fetch operators should match 'fetch_targets'");

-  // When fetch operator are present, so should be fetch_holder
+  if (!fetch_holder_name.empty()) {
+    // When fetch operator are present, so should be fetch_holder.
   auto var = block.FindVar(fetch_holder_name);
   PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
                           fetch_holder_name);
@@ -214,6 +220,7 @@ static bool has_fetch_operators(
                  "'%s' variable should be 'FETCH_LIST' type",
                  fetch_holder_name);
   }
+  }

   return fetch_count > 0;
 }
@@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }

-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
-    }
-  }
-
   if (!has_fetch_ops) {
     // create fetch_holder variable
     auto* fetch_holder = global_block->Var(fetch_holder_name);
@@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }

-  Run(*copy_program, scope, 0, create_vars, create_vars);
-
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
-          GetFetchVariable(*scope, fetch_holder_name, idx);
-    }
-  }
+  auto ctx = Prepare(*copy_program, 0);
+  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
+                     feed_holder_name, fetch_holder_name);
 }
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
} }
} }
void Executor::RunPreparedContext(
ExecutorPrepareContext* ctx, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
const std::string& feed_holder_name, const std::string& fetch_holder_name) {
auto& global_block = ctx->prog_.Block(ctx->block_id_);
PADDLE_ENFORCE(
has_feed_operators(global_block, feed_targets, feed_holder_name),
"Program in ExecutorPrepareContext should has feed_ops.");
PADDLE_ENFORCE(
has_fetch_operators(global_block, fetch_targets, fetch_holder_name),
"Program in the prepared context should has fetch_ops.");
// map the data of feed_targets to feed_holder
for (auto* op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) {
std::string feed_target_name = op->Output("Out")[0];
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
}
}
RunPreparedContext(ctx, scope, create_vars, create_vars);
// obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
GetFetchVariable(*scope, fetch_holder_name, idx);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle

@@ -14,6 +14,9 @@ limitations under the License. */

 #pragma once

+#include <map>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -70,6 +73,13 @@ class Executor {
                           bool create_local_scope = true,
                           bool create_vars = true);

+  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
+                          std::map<std::string, const LoDTensor*>& feed_targets,
+                          std::map<std::string, LoDTensor*>& fetch_targets,
+                          bool create_vars = true,
+                          const std::string& feed_holder_name = "feed",
+                          const std::string& fetch_holder_name = "fetch");
+
  private:
   const platform::Place place_;
 };

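Taken together, the executor changes above expose a prepare-once, run-many workflow: Prepare() caches the program analysis, and the new RunPreparedContext() overload feeds input tensors, runs, and pulls fetched tensors in one call. A hedged usage sketch based only on the signatures visible in this diff; the program, scope, place, and tensor variables are assumed to exist, and the variable names "x" and "y" are hypothetical:

    // Feed/fetch maps are keyed by the names of feed_op outputs and
    // fetch_op inputs in the program.
    std::map<std::string, const paddle::framework::LoDTensor *> feed_targets;
    std::map<std::string, paddle::framework::LoDTensor *> fetch_targets;
    feed_targets["x"] = &input_tensor;    // hypothetical feed variable
    fetch_targets["y"] = &output_tensor;  // hypothetical fetch variable

    paddle::framework::Executor executor(place);
    auto ctx = executor.Prepare(program, /*block_id=*/0);
    // Reuse `ctx` across steps instead of re-analyzing the program each run.
    executor.RunPreparedContext(ctx.get(), &scope, feed_targets, fetch_targets);
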
@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
   }
 }

-static DDim GetDims(const Scope& scope, const std::string& name) {
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
   } else {
     return DDim({-1});
   }
@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {

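The new get_actual_dim flag matters for SelectedRows variables, whose logical ("complete") shape differs from the shape of the rows actually stored; the debug strings above now print the latter. A simplified standalone model of the distinction (toy struct, not Paddle's real SelectedRows class):

    #include <cstdio>
    #include <vector>

    struct ToySelectedRows {
      long height = 1000;                // logical row count of the full tensor
      std::vector<long> rows = {3, 42};  // indices of rows actually stored
      long width = 8;                    // width of each stored row
    };

    int main() {
      ToySelectedRows t;
      // Complete dims: the logical tensor, height x width.
      std::printf("complete dims: %ld x %ld\n", t.height, t.width);
      // Actual dims: the stored value buffer, #rows x width.
      std::printf("actual dims:   %zu x %ld\n", t.rows.size(), t.width);
    }
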
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"

 #include <string>
+#include <tuple>
 #include <vector>

 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };

 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
                                           allow_op_delay));

   // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
-    for (auto *var : main_program.Block(0).AllVars()) {
-      if (scope->FindVar(var->Name()) != nullptr) {
-        continue;
-      }
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
+  for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+                                     var->Persistable());
   }
 }
@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
     const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
   platform::RecordBlock b(0);
   SplitTensorToPlaces(feed_tensors);
+
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
+
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
+
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }

 void ParallelExecutor::SplitTensorToPlaces(

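The run path above splits variable creation by persistability: persistable variables (parameters and the like) go into the long-lived device scope, everything else into a per-run child scope that is deleted once the device streams drain. A standalone sketch of that routing over the (name, type, persistable) tuples cached in var_types_ (toy types, illustration only):

    #include <iostream>
    #include <string>
    #include <tuple>
    #include <vector>

    // Mirrors var_types_: (variable name, type id, persistable flag).
    using VarInfo = std::tuple<std::string, int, bool>;

    int main() {
      std::vector<VarInfo> var_types = {
          {"fc.w", /*type=*/1, /*persistable=*/true},    // e.g. a parameter
          {"tmp_0", /*type=*/1, /*persistable=*/false},  // e.g. an activation
      };
      for (const auto &v : var_types) {
        // Persistable variables survive across runs; the rest are recreated
        // in a fresh local scope on every call to Run().
        std::cout << std::get<0>(v) << " -> "
                  << (std::get<2>(v) ? "outer (reused) scope"
                                     : "local (per-run) scope")
                  << "\n";
      }
    }
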
@@ -14,8 +14,12 @@

 #include "paddle/fluid/framework/threadpool.h"

+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"

+DEFINE_int32(io_threadpool_size, 100,
+             "number of threads used for doing IO, default 100");
+
 namespace paddle {
 namespace framework {
@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() {
   }
 }

+std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
+std::once_flag ThreadPoolIO::io_init_flag_;
+
+ThreadPool* ThreadPoolIO::GetInstanceIO() {
+  std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
+  return io_threadpool_.get();
+}
+
+void ThreadPoolIO::InitIO() {
+  if (io_threadpool_.get() == nullptr) {
+    // TODO(typhoonzero1986): make this configurable
+    io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle

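GetInstanceIO() is the standard std::call_once lazy-singleton idiom, sized by the new io_threadpool_size gflag. The same idiom without the gflags or Paddle dependencies, as a runnable sketch (the pool is a stub and the size is hard-coded where the flag would be read):

    #include <iostream>
    #include <memory>
    #include <mutex>

    struct Pool {  // stand-in for ThreadPool
      explicit Pool(int threads) : threads_(threads) {}
      int threads_;
    };

    static std::unique_ptr<Pool> g_io_pool;
    static std::once_flag g_io_init;

    Pool *GetIOPool() {
      // call_once runs the initializer exactly once, even if many
      // threads hit the first call concurrently.
      std::call_once(g_io_init, [] { g_io_pool.reset(new Pool(100)); });
      return g_io_pool.get();
    }

    int main() { std::cout << GetIOPool()->threads_ << " IO threads\n"; }
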
@@ -14,12 +14,12 @@ limitations under the License. */

 #pragma once

-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <functional>
-#include <future>
-#include <mutex>
+#include <future>  // NOLINT
+#include <mutex>   // NOLINT
 #include <queue>
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>

 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -28,6 +28,22 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+struct ExceptionHandler {
+  mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+  explicit ExceptionHandler(
+      std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+      : future_(std::move(f)) {}
+  void operator()() const {
+    auto ex = this->future_.get();
+    if (ex != nullptr) {
+      LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                    "should use RunAndGetException to handle the exception.\n"
+                    "The default exception handler is LOG(FATAL)."
+                 << ex->what();
+    }
+  }
+};
+
 // ThreadPool maintains a queue of tasks, and runs them using a fixed
 // number of threads.
 class ThreadPool {
@@ -87,22 +103,6 @@ class ThreadPool {
   void Wait();

  private:
-  struct ExceptionHandler {
-    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-    explicit ExceptionHandler(
-        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-        : future_(std::move(f)) {}
-    void operator()() const {
-      auto ex = this->future_.get();
-      if (ex != nullptr) {
-        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                      "should use RunAndGetException to handle the exception.\n"
-                      "The default exception handler is LOG(FATAL)."
-                   << ex->what();
-      }
-    }
-  };
-
   DISABLE_COPY_AND_ASSIGN(ThreadPool);

   // If the task queue is empty and avaialbe is equal to the number of
@@ -135,6 +135,17 @@ class ThreadPool {
   std::condition_variable completed_;
 };

+class ThreadPoolIO : ThreadPool {
+ public:
+  static ThreadPool* GetInstanceIO();
+  static void InitIO();
+
+ private:
+  // NOTE: threadpool in base will be inhereted here.
+  static std::unique_ptr<ThreadPool> io_threadpool_;
+  static std::once_flag io_init_flag_;
+};
+
 // Run a function asynchronously.
 // NOTE: The function must return void. If the function need to return a value,
 // you can use lambda to capture a value pointer.
@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) {
   return ThreadPool::GetInstance()->Run(callback);
 }

+template <typename Callback>
+std::future<void> AsyncIO(Callback callback) {
+  return ThreadPoolIO::GetInstanceIO()->Run(callback);
+}
+
 }  // namespace framework
 }  // namespace paddle

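With AsyncIO in place, blocking work can be scheduled exactly like Async, but on the dedicated IO pool so it cannot starve compute tasks. A hedged usage sketch (the lambda body is hypothetical; the returned std::future<void> comes from ThreadPool::Run as shown above):

    // Inside paddle::framework:
    auto done = AsyncIO([] {
      // hypothetical IO-bound work, e.g. reading the next batch from disk
    });
    done.wait();  // block until the IO task finishes
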
@@ -1,4 +1,4 @@
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)

 cc_library(paddle_fluid_api
     SRCS io.cc
@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules})
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.

@@ -17,10 +17,16 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"

 namespace paddle {
 namespace inference {

+// Temporarily add this function for exposing framework::InitDevices() when
+// linking the inference shared library.
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
+
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);

Some files were not shown because too many files have changed in this diff.
