merge nn.py

7 years ago · 8c1e304307
parent 9dc28179a4 aa6b2bda2c
commit 8c1e304307
70 changed files with 1473 additions and 296 deletions
--- a/.gitignore
+++ b/.gitignore
@ -28,3 +28,4 @@ third_party/
 build_*
 # clion workspace.
 cmake-build-*
+model_test
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -69,6 +69,7 @@ option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
@ -179,6 +180,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/xxhash)    # download xxhash

 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
@ -301,3 +303,8 @@ if(WITH_DOC)
    find_python_module(recommonmark REQUIRED)
    add_subdirectory(doc)
 endif()
+
+if (ON_INFER)
+    message(WARNING "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+endif()
--- a/6
+++ b/6
@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \
    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
    easy_install -U pip && \
-    pip install -U wheel && \
+    pip install -U pip setuptools wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

-RUN pip3 install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip3 install opencv-python && \
-    pip install pre-commit 'ipython==5.3.0' && \
+    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python

--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@ -0,0 +1,46 @@
+INCLUDE(ExternalProject)
+
+set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
+set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
+set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
+
+IF(WITH_STATIC_LIB)
+  SET(BUILD_CMD make lib)
+ELSE()
+  SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+ENDIF()
+
+ExternalProject_Add(
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
+    GIT_TAG         "v0.6.5"
+    PREFIX          ${XXHASH_SOURCE_DIR}
+    DOWNLOAD_NAME   "xxhash"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND
+    BUILD_COMMAND     ${BUILD_CMD}
+    INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
+    TEST_COMMAND      ""
+)
+
+set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
+INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+
+add_library(xxhash STATIC IMPORTED GLOBAL)
+set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
+include_directories(${XXHASH_INCLUDE_DIR})
+add_dependencies(xxhash extern_xxhash)
+
+LIST(APPEND external_project_dependencies xxhash)
+
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
+  IF(ANDROID)
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
+  ENDIF()
+ENDIF()
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -14,6 +14,9 @@

 # make package for paddle fluid shared and static library
 function(copy TARGET)
+    if (NOT ON_INFER)
+      message(WARNING "Turn on the ON_INFER flag when building inference_lib only.")
+    endif()
    set(options "")
    set(oneValueArgs "")
    set(multiValueArgs SRCS DSTS DEPS)
@ -31,7 +34,7 @@ function(copy TARGET)
    foreach(index RANGE ${len})
        list(GET copy_lib_SRCS ${index} src)
        list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
          COMMAND mkdir -p "${dst}"
          COMMAND cp -r "${src}" "${dst}"
          COMMENT "copying ${src} -> ${dst}")
@ -67,6 +70,13 @@ copy(boost_lib
  DEPS boost
 )

+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
+copy(xxhash_lib
+  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS xxhash
+)
+
 if(NOT PROTOBUF_FOUND)
    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
    copy(protobuf_lib
@ -186,7 +196,7 @@ copy(cmake_cache
  DSTS ${FLUID_INSTALL_DIR})

 # This command generates a complete fluid library for both train and inference
-add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})

 # Following commands generate a inference-only fluid library
 # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -86,7 +86,7 @@ paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'
 paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
 paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@ -107,7 +107,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label',
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@ -174,7 +174,9 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@ -252,9 +252,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
  std::vector<ir::Node *> sorted_ret;
  for (size_t i = 0; i < ret.size(); ++i) {
    if (i < last_backward) {
-      if (boost::get<int>(ret[i]->Op()->GetAttr(
-              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-          static_cast<int>(OpRole::kOptimize)) {
+      if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kOptimize))) {
        optimize_ops.push_back(ret[i]);
      } else {
        sorted_ret.push_back(ret[i]);
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
  std::deque<ir::Node *> q_nodes;
  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
  std::unordered_set<ir::Node *> g_nodes;
+  // q_set used to record records in the queue.
+  std::unordered_set<ir::Node *> q_set;
  size_t graph_count = 0;

-  auto traverse_nodes = [&visited_nodes,
-                         &q_nodes](const std::vector<ir::Node *> &nodes) {
-    std::copy_if(
-        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
-        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+  auto traverse_nodes = [&visited_nodes, &q_nodes,
+                         &q_set](const std::vector<ir::Node *> &nodes) {
+    for (auto n : nodes) {
+      if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
+        q_nodes.push_back(n);
+        q_set.insert(n);
+      }
+    }
  };

  while (visited_nodes.size() != nodes.size()) {
    if (!q_nodes.empty()) {
      auto cur_node = q_nodes.front();
      q_nodes.pop_front();
+      q_set.erase(cur_node);
      visited_nodes.insert(cur_node);
      g_nodes.insert(cur_node);
      traverse_nodes(cur_node->inputs);
@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
      for (auto &n : nodes) {
        if (visited_nodes.count(n) == 0) {
          q_nodes.push_back(n);
+          q_set.insert(n);
          break;
        }
      }
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@ -18,6 +18,82 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+
+// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
+// directly, because there are many vector<LoDTensor> used accross the project,
+// and some of them are treated as LoDTensorArray.
+#if !defined(PADDLE_ON_INFERENCE)
+
 using LoDTensorArray = std::vector<LoDTensor>;
-}
+
+#else  // !PADDLE_ON_INFERENCE
+
+#pragma message "LoDTensorArray is replaced with the inference one."
+/*
+ * A LoDTensorArray which will not deallocate buffer when resized, fix the data
+ * diff in inference, and more performance friendly in the concurrency
+ * scenerios.
+ */
+class LoDTensorArray {
+ public:
+  LoDTensorArray() = default;
+
+  using iterator = std::vector<LoDTensor>::iterator;
+  using const_iterator = std::vector<LoDTensor>::const_iterator;
+
+  const_iterator begin() const { return array_.begin(); }
+  const_iterator end() const { return array_.begin() + size_; }
+  iterator begin() { return array_.begin(); }
+  iterator end() { return array_.begin() + size_; }
+
+  void push_back(const LoDTensor& x) {
+    if (size_ < array_.size()) {
+      array_[size_++] = x;
+    } else {
+      array_.push_back(x);
+      ++size_;
+    }
+  }
+  void resize(size_t size) {
+    if (array_.size() < size) {
+      array_.resize(size);
+    }
+    size_ = size;
+  }
+
+  void emplace_back() { array_.emplace_back(); }
+
+  void emplace_back(LoDTensor&& x) { array_.emplace_back(std::move(x)); }
+
+  LoDTensor& back() { return array_.back(); }
+
+  size_t space() const { return array_.size(); }
+
+  void reserve(size_t size) {
+    // Naive warning to tell user this array might be to large. The memory and
+    // buffer used by this TensorArray will not be deleted during the training
+    // and inference phase, so attention not to make it expand too long.
+    if (size > 800UL) {
+      LOG(WARNING) << "TensorArray has more than 800 items";
+    }
+    array_.reserve(size);
+  }
+
+  bool empty() const { return size_ == 0UL; }
+  void clear() { size_ = 0UL; }
+
+  LoDTensor& operator[](size_t id) { return array_[id]; }
+  const LoDTensor& operator[](size_t id) const { return array_[id]; }
+  LoDTensor& at(size_t id) { return array_.at(id); }
+  const LoDTensor& at(size_t id) const { return array_.at(id); }
+
+  size_t size() const { return size_; }
+
+ private:
+  size_t size_{0};
+  std::vector<LoDTensor> array_;
+};
+#endif  // !PADDLE_ON_INFERENCE
+
+}  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
    this->reserve(this->size() + size_t(end - begin));
    this->insert(this->end(), begin, end);
  }
+
+  const T *CUDAData(platform::Place place) const {
+    PADDLE_THROW(
+        "Vector::CUDAData() method is not supported in CPU-only version");
+  }
+
+  T *CUDAMutableData(platform::Place place) {
+    PADDLE_THROW(
+        "Vector::CUDAMutableData() method is not supported in CPU-only "
+        "version");
+  }
+
+  const T *Data(platform::Place place) const {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::Data() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+
+  T *MutableData(platform::Place place) {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::MutableData() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+
+  const void *Handle() const { return static_cast<const void *>(this); }
 };

 template <typename T>
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() {
  ops_.swap(ops);
 }

-void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
-#ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
-  for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
-    auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
-    for (auto *op : block->AllOps()) {
-      if (op->HasAttr("use_mkldnn")) {
-        op->SetAttr("use_mkldnn", true);
-      }
-    }
-  }
-#else
-  LOG(WARNING)
-      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
-#endif
-}
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@ -48,8 +48,6 @@ class NaiveExecutor {

  void CleanFeedFetchOps();

-  void EnableMKLDNN(const ProgramDesc& program);
-
 protected:
  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kLoss) |
               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize) |
+               static_cast<int>(OpRole::kLRSched),
           static_cast<int>(OpRole::kNotSpecified)})
      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@ -20,17 +20,20 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+//////////////////////////
+// Don't add more roles to make this too complicated!
+//////////////////////////
 enum class OpRole {
  kForward = 0x0000,
  kBackward = 0x0001,
  kOptimize = 0x0002,
  // RPC role is for send/recv releated op
-  kRPC = 0x0003,
+  kRPC = 0x0004,
  // Dist role is for split_byref/split_selected_rows/concat
  // used for distributed training.
-  kDist = 0x0004,
+  kDist = 0x0008,
  // Tag all learning rate scheduler operators.
-  kLRSched = 0x0005,
+  kLRSched = 0x0016,

  kLoss = 0x0100,
  // The default value of op's role. This should be only used for unittests and
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@ -78,6 +78,8 @@ class Scope {
  /// Drop all kids scopes belonged to this scope.
  void DropKids();

+  std::list<Scope*>& kids() const { return kids_; }
+
  /// Find if a scope exists in the kid scopes
  bool HasKid(const Scope* scope) const;

--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0,

 namespace paddle {
 namespace framework {
-
 std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
 std::once_flag ThreadPool::init_flag_;

@ -47,8 +46,7 @@ void ThreadPool::Init() {
  }
 }

-ThreadPool::ThreadPool(int num_threads)
-    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+ThreadPool::ThreadPool(int num_threads) : running_(true) {
  threads_.resize(num_threads);
  for (auto& thread : threads_) {
    // TODO(Yancey1989): binding the thread on the specify CPU number
@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads)
 ThreadPool::~ThreadPool() {
  {
    // notify all threads to stop running
+    std::lock_guard<std::mutex> l(mutex_);
    running_ = false;
    scheduled_.notify_all();
  }
@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() {
  }
 }

-void ThreadPool::Wait() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  completed_.wait(lock, [=] { return Done() == true; });
-}
-
 void ThreadPool::TaskLoop() {
-  while (running_) {
+  while (true) {
    std::unique_lock<std::mutex> lock(mutex_);
-    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });

-    if (!running_) {
-      break;
+    scheduled_.wait(
+        lock, [this] { return !this->tasks_.empty() || !this->running_; });
+
+    if (!running_ || tasks_.empty()) {
+      return;
    }
+
    // pop a task from the task queue
    auto task = std::move(tasks_.front());
    tasks_.pop();
-
-    --idle_threads_;
    lock.unlock();

    // run the task
    task();
-
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      ++idle_threads_;
-      if (Done()) {
-        completed_.notify_all();
-      }
-    }
  }
 }

--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@ -57,15 +57,6 @@ class ThreadPool {

  ~ThreadPool();

-  // Returns the number of threads created by the constructor.
-  size_t Threads() const { return total_threads_; }
-
-  // Returns the number of currently idle threads.
-  size_t IdleThreads() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    return idle_threads_;
-  }
-
  // Run pushes a function to the task queue and returns a std::future
  // object.  To wait for the completion of the task, call
  // std::future::wait().
@ -94,25 +85,13 @@ class ThreadPool {
    });
    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
    tasks_.push(std::move(task));
-    lock.unlock();
    scheduled_.notify_one();
    return f;
  }

-  // Wait until all the tasks are completed.
-  void Wait();
-
 private:
  DISABLE_COPY_AND_ASSIGN(ThreadPool);

-  // If the task queue is empty and avaialbe is equal to the number of
-  // threads, means that all tasks are completed.  Note: this function
-  // is not thread-safe.  Returns true if all tasks are completed.
-  // Note: don't delete the data member total_threads_ and use
-  // threads_.size() instead; because you'd need to lock the mutex
-  // before accessing threads_.
-  bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
-
  // The constructor starts threads to run TaskLoop, which retrieves
  // and runs tasks from the queue.
  void TaskLoop();
@ -125,14 +104,11 @@ class ThreadPool {
  static std::once_flag init_flag_;

  std::vector<std::unique_ptr<std::thread>> threads_;
-  const size_t total_threads_;
-  size_t idle_threads_;

  std::queue<Task> tasks_;
  std::mutex mutex_;
  bool running_;
  std::condition_variable scheduled_;
-  std::condition_variable completed_;
 };

 class ThreadPoolIO : ThreadPool {
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@ -19,10 +19,11 @@ limitations under the License. */

 namespace framework = paddle::framework;

-void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
-  std::vector<std::future<void>> fs;
+void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
+            std::atomic<int>* sum, int cnt) {
  for (int i = 0; i < cnt; ++i) {
-    fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
+    std::lock_guard<std::mutex> l(*mu);
+    fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
  }
 }

@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) {
 }

 TEST(ThreadPool, ConcurrentRun) {
-  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
  std::atomic<int> sum(0);
  std::vector<std::thread> threads;
+  std::vector<std::future<void>> fs;
+  std::mutex fs_mu;
  int n = 50;
  // sum = (n * (n + 1)) / 2
  for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, pool, &sum, i);
+    std::thread t(do_sum, &fs, &fs_mu, &sum, i);
    threads.push_back(std::move(t));
  }
  for (auto& t : threads) {
    t.join();
  }
-  pool->Wait();
+  for (auto& t : fs) {
+    t.wait();
+  }
  EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@ -30,7 +30,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 endif()

 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)

 if(NOT APPLE)
  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
@ -40,7 +40,7 @@ endif()

 # Create shared library
 cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    DEPS ${fluid_modules} paddle_fluid_api)
+    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) {
    passes.push_back("mkldnn_placement_pass");
  }
 #endif
+  // infer_clean_graph_pass should be the first default pass
+  // after mkldnn_placement_pass.
+  passes.push_back("infer_clean_graph_pass");
  for (auto& pass : ir_passes_) {
    if (!disabled_ir_passes_.count(pass)) {
      passes.push_back(pass);
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry<PassManager> {
  // larger fusion.
  const std::vector<std::string> all_ir_passes_{{
      // Manual update the passes here.
-      "infer_clean_graph_pass",         //
      "attention_lstm_fuse_pass",       //
      "seqconv_eltadd_relu_fuse_pass",  //
      "embedding_fc_lstm_fuse_pass",    //
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@ -18,7 +18,8 @@ if(APPLE)
 endif(APPLE)


-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}
+        )

 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
@ -31,10 +32,17 @@ function(inference_api_test TARGET_NAME)
        set(multiValueArgs ARGS)
        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-        cc_test(${TARGET_NAME}
-                SRCS ${inference_test_SRC}
-                DEPS "${inference_deps}"
-                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+	if (WITH_GPU)
+		cc_test(${TARGET_NAME}
+			SRCS ${inference_test_SRC}
+			DEPS "${inference_deps}"
+			ARGS --dirname=${PYTHON_TESTS_DIR}/book/ --fraction_of_gpu_memory_to_use=0.15)
+        else()
+		cc_test(${TARGET_NAME}
+			SRCS ${inference_test_SRC}
+			DEPS "${inference_deps}"
+			ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+	endif()
        if(inference_test_ARGS)
            set_tests_properties(${TARGET_NAME}
                    PROPERTIES DEPENDS "${inference_test_ARGS}")
@ -42,7 +50,8 @@ function(inference_api_test TARGET_NAME)
    endif(WITH_TESTING)
 endfunction(inference_api_test)

-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@ -82,6 +82,7 @@ bool AnalysisPredictor::Init(

  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();
+
  return true;
 }

@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
 }

@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(

 bool AnalysisPredictor::ZeroCopyRun() {
  executor_->Run();
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
 }

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"

@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };

 }  // namespace paddle
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@ -22,6 +22,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
 }

--- a/Show More
+++ b/Show More