Merge branch 'develop' into core_add_inference_unittest

emailweixu-patch-1
Liu Yiqun 8 years ago
commit 119da44954

@ -15,9 +15,9 @@
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
set(BOOST_VER "1.66.0")
set(BOOST_TAR "boost_1_66_0")
set(BOOST_URL "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)

@ -18,6 +18,11 @@ dynamic_lstm
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
dynamic_lstmp
-------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru

@ -2,7 +2,7 @@
## Background
Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith
There are mainly three parts that we have to consider while integrating a new device/library:
- Place and DeviceContext: indicates the device id and manages hardware resources
- Place and DeviceContext: indicate the device id and manage hardware resources
- Memory and Tensor: malloc/free data on certain device
@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de
### Place and DeviceContext
Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
#### Place
Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
```
| CPUPlace
@ -144,7 +144,7 @@ class Tensor {
};
```
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
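// (Illustrative continuation, not taken from this hunk: the delayed-allocation
// pattern described above would look roughly like this -- Resize only records
// the shape, and memory is allocated when mutable_data is called. Shape values
// are hypothetical.)
paddle::platform::CPUPlace place;
t.Resize({4, 8});                            // no allocation yet
float* data = t.mutable_data<float>(place);  // 4 * 8 floats allocated here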
@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
The interface is defined in header file.
The interface is defined in the header file.
```
template <typename DeviceContext, typename T>
@ -174,7 +174,7 @@ class MaxOutFunctor {
};
```
CPU implemention is in .cc file
CPU implementation is in .cc file
```
template <typename T>
@ -188,7 +188,7 @@ class MaxOutFunctor<platform::CPUDeviceContext, T> {
};
```
CUDA implemention is in .cu file
CUDA implementation is in .cu file
```
template <typename T>
@ -203,9 +203,9 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> {
```
We get computing handle from a concrete DeviceContext, and make compution on tensors.
We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
Fluid provides different register interfaces in op_registry.h
@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
## Advanced topics: How to switch between different Device/Library
Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
For more details, please refer to following docs:

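The header/.cc/.cu split described above can be shown with a small standalone sketch. `FakeCPUContext` and `ScaleFunctor` below are hypothetical stand-ins for `platform::CPUDeviceContext` and a math functor such as `MaxOutFunctor`; they only illustrate the structure of a device-templated functor with per-device specializations, and are not part of this commit.

```cpp
#include <iostream>
#include <vector>

// Hypothetical stand-ins for platform::CPUDeviceContext etc.; only the
// structure matters here.
struct FakeCPUContext {};
struct FakeCUDAContext {};

// Primary template, declared in a header: parameterized on the device
// context and the element type, mirroring MaxOutFunctor.
template <typename DeviceContext, typename T>
struct ScaleFunctor {
  void operator()(const DeviceContext& ctx, const std::vector<T>& in,
                  std::vector<T>* out, T factor);
};

// "CPU implementation is in the .cc file": a full specialization for the
// CPU context that does the work with a plain loop.
template <typename T>
struct ScaleFunctor<FakeCPUContext, T> {
  void operator()(const FakeCPUContext&, const std::vector<T>& in,
                  std::vector<T>* out, T factor) {
    out->resize(in.size());
    for (size_t i = 0; i < in.size(); ++i) {
      (*out)[i] = in[i] * factor;
    }
  }
};

// A CUDA specialization would live in a .cu file and launch a kernel on the
// stream held by the CUDA device context; it is omitted so this sketch
// compiles without a GPU toolchain.

int main() {
  FakeCPUContext ctx;
  std::vector<float> in = {1.f, 2.f, 3.f};
  std::vector<float> out;
  ScaleFunctor<FakeCPUContext, float>()(ctx, in, &out, 2.f);
  for (float v : out) {
    std::cout << v << " ";  // 2 4 6
  }
  std::cout << "\n";
  return 0;
}
```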
@ -26,7 +26,7 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
@ -98,3 +98,5 @@ if(NOT WITH_C_API AND WITH_FLUID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
endif()
cc_test(channel_test SRCS channel_test.cc)

@ -0,0 +1,64 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <stddef.h>  // for size_t

namespace paddle {
namespace framework {

// Channel is the abstract class of buffered and un-buffered channels.
template <typename T>
class Channel {
 public:
  virtual void Send(T*) = 0;
  virtual void Receive(T*) = 0;
  virtual size_t Cap() = 0;

  // Don't delete channels; instead, call Channel::Close.
 protected:
  virtual ~Channel() {}
};

// Forward declaration of channel implementations.
namespace details {
template <typename T>
class Buffered;
template <typename T>
class UnBuffered;
}  // namespace details

template <typename T>
Channel<T>* MakeChannel(size_t buffer_size) {
  if (buffer_size > 0) {
    return new details::Buffered<T>(buffer_size);
  }
  return new details::UnBuffered<T>();
}

template <typename T>
void CloseChannel(Channel<T>* ch) {
  if (ch->Cap() > 0) {
    delete dynamic_cast<details::Buffered<T>*>(ch);
  } else {
    delete dynamic_cast<details::UnBuffered<T>*>(ch);
  }
}

}  // namespace framework
}  // namespace paddle

#include "paddle/framework/details/buffered_channel.h"
#include "paddle/framework/details/unbuffered_channel.h"
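A minimal usage sketch of the channel interface above (the snippet itself is not part of this commit; it assumes the headers build as shown). With a non-zero capacity, a single thread can Send a few items and Receive them back in FIFO order without blocking:

```cpp
#include <iostream>

#include "paddle/framework/channel.h"

int main() {
  using paddle::framework::Channel;
  using paddle::framework::MakeChannel;
  using paddle::framework::CloseChannel;

  // capacity > 0 selects details::Buffered, so a few Sends can complete
  // before any Receive happens.
  Channel<int>* ch = MakeChannel<int>(4);

  for (int i = 0; i < 3; ++i) {
    int v = i * 10;
    ch->Send(&v);  // moves v into the channel's internal deque
  }
  for (int i = 0; i < 3; ++i) {
    int v = 0;
    ch->Receive(&v);         // pops the oldest element (FIFO order)
    std::cout << v << "\n";  // 0, 10, 20
  }

  CloseChannel(ch);  // channels are destroyed via CloseChannel, not delete
  return 0;
}
```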

@ -0,0 +1,26 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/channel.h"

#include "gtest/gtest.h"

TEST(Channel, MakeAndClose) {
  using paddle::framework::Channel;
  using paddle::framework::MakeChannel;
  using paddle::framework::CloseChannel;

  Channel<int>* ch = MakeChannel<int>(10);
  CloseChannel(ch);
}

@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) {
}
}
inline std::string DataTypeToString(const proto::DataType type) {
  using namespace paddle::framework::proto;
  switch (type) {
    case DataType::FP16:
      return "float16";
    case DataType::FP32:
      return "float32";
    case DataType::FP64:
      return "float64";
    case DataType::INT16:
      return "int16";
    case DataType::INT32:
      return "int32";
    case DataType::INT64:
      return "int64";
    case DataType::BOOL:
      return "bool";
    default:
      PADDLE_THROW("Not support type %d", type);
  }
}

inline std::ostream& operator<<(std::ostream& out,
                                const proto::DataType& type) {
  out << DataTypeToString(type);
  return out;
}
} // namespace framework
} // namespace paddle
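A small sketch of the new helper in use (hypothetical driver program, not from this commit). Mapping the enum to a readable name is presumably what turns `data_type[5]` into `data_type[float32]` in the `OpKernelType` test further down this diff:

```cpp
#include <iostream>

#include "paddle/framework/data_type.h"

int main() {
  using paddle::framework::DataTypeToString;
  namespace proto = paddle::framework::proto;

  std::cout << DataTypeToString(proto::DataType::FP32) << "\n";   // float32
  std::cout << DataTypeToString(proto::DataType::INT64) << "\n";  // int64
  return 0;
}
```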

@ -0,0 +1,82 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <condition_variable>
#include <deque>
#include <mutex>

#include "paddle/framework/channel.h"

namespace paddle {
namespace framework {
namespace details {

template <typename T>
class Buffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  virtual void Send(T*);
  virtual void Receive(T*);
  virtual size_t Cap() { return cap_; }

 private:
  size_t cap_;
  std::mutex mu_;
  std::condition_variable empty_cond_var_;
  std::condition_variable full_cond_var_;
  std::deque<T> channel_;

  Buffered(size_t cap) : cap_(cap) {}
  virtual ~Buffered();

  void NotifyAllSenders(std::unique_lock<std::mutex>*);
};

template <typename T>
void Buffered<T>::Send(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
  full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; });
  channel_.push_back(std::move(*item));
  lock.unlock();
  empty_cond_var_.notify_one();
}

template <typename T>
void Buffered<T>::Receive(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
  empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
  *item = std::move(channel_.front());
  channel_.pop_front();
  NotifyAllSenders(&lock);
}

template <typename T>
Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
  channel_.clear();
  NotifyAllSenders(&lock);
}

template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
  lock->unlock();
  full_cond_var_.notify_one();
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
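Because `Send` blocks on `full_cond_var_` when the deque is full and `Receive` blocks on `empty_cond_var_` when it is empty, the buffered channel can hand work between threads. A sketch under those assumptions (not part of the commit):

```cpp
#include <iostream>
#include <thread>

#include "paddle/framework/channel.h"

int main() {
  using paddle::framework::Channel;
  using paddle::framework::MakeChannel;
  using paddle::framework::CloseChannel;

  // Capacity 2: the producer can run at most two items ahead of the consumer
  // before Send blocks.
  Channel<int>* ch = MakeChannel<int>(2);

  std::thread producer([ch]() {
    for (int i = 0; i < 5; ++i) {
      int v = i;
      ch->Send(&v);  // blocks while the internal deque already holds 2 items
    }
  });

  std::thread consumer([ch]() {
    for (int i = 0; i < 5; ++i) {
      int v = 0;
      ch->Receive(&v);  // blocks until the producer has pushed something
      std::cout << "got " << v << "\n";
    }
  });

  producer.join();
  consumer.join();
  CloseChannel(ch);
  return 0;
}
```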

@ -0,0 +1,52 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <condition_variable>
#include <deque>
#include <mutex>

#include "paddle/framework/channel.h"

namespace paddle {
namespace framework {
namespace details {

template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  virtual void Send(T*);
  virtual void Receive(T*);
  virtual size_t Cap() { return 0; }

 private:
  UnBuffered() {}
  virtual ~UnBuffered();
};

template <typename T>
void UnBuffered<T>::Send(T* channel_element) {}

template <typename T>
void UnBuffered<T>::Receive(T*) {}

template <typename T>
UnBuffered<T>::~UnBuffered() {}

}  // namespace details
}  // namespace framework
}  // namespace paddle

@ -170,8 +170,6 @@ static bool has_feed_operators(
feed_targets.find(feed_target_name) != feed_targets.end(),
"Feed operator output name '%s' cannot be found in 'feed_targets'",
feed_target_name);
} else {
break;
}
}
@ -270,8 +268,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
} else {
break;
}
}

@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
ASSERT_EQ(
paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
"CUDNN]");
}
TEST(OpKernelType, Hash) {

@ -18,6 +18,9 @@ limitations under the License. */
namespace paddle {
namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
auto *b = desc_.add_blocks();
b->set_parent_idx(parent.ID());
@ -64,5 +67,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
}
}
const std::vector<std::string> ProgramDesc::GetFeedVarNames() {
  BlockDesc *global_block = blocks_[0].get();
  std::vector<std::string> feed_var_names;
  for (auto *op : global_block->AllOps()) {
    if (op->Type() == "feed") {
      feed_var_names.insert(feed_var_names.begin(), op->Output("Out")[0]);
    }
  }
  return feed_var_names;
}

const std::vector<std::string> ProgramDesc::GetFetchVarNames() {
  BlockDesc *global_block = blocks_[0].get();
  std::vector<std::string> fetch_var_names;
  for (auto *op : global_block->AllOps()) {
    if (op->Type() == "fetch") {
      fetch_var_names.push_back(op->Input("X")[0]);
    }
  }
  return fetch_var_names;
}
} // namespace framework
} // namespace paddle

@ -45,6 +45,10 @@ class ProgramDesc {
proto::ProgramDesc *Proto();
const std::vector<std::string> GetFeedVarNames();
const std::vector<std::string> GetFetchVarNames();
private:
proto::ProgramDesc desc_;

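A hedged sketch of how the two new accessors would typically be used: rebuild a `ProgramDesc` from its serialized form and list the variables its feed/fetch operators touch. The model path is illustrative, and the snippet is not part of this commit:

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include "paddle/framework/program_desc.h"

int main() {
  // Read the serialized program (path is illustrative).
  std::ifstream fin("/path/to/model/__model__", std::ios::in | std::ios::binary);
  std::ostringstream buffer;
  buffer << fin.rdbuf();

  // ProgramDesc has a constructor taking the binary proto string.
  paddle::framework::ProgramDesc program(buffer.str());

  for (const auto& name : program.GetFeedVarNames()) {
    std::cout << "feed:  " << name << "\n";
  }
  for (const auto& name : program.GetFetchVarNames()) {
    std::cout << "fetch: " << name << "\n";
  }
  return 0;
}
```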
@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include <glog/logging.h>
@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
*op_field->Add() = input.blocks(block_id).ops(i);
}
}
  // remove the VarDescs in BlockDesc that are not referenced in
  // the pruned OpDescs
  std::unordered_map<std::string, proto::VarDesc> var_map;
  auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
  for (const auto& var : *var_field) {
    var_map[var.name()] = var;
  }

  var_field->Clear();
  for (const auto& op : *op_field) {
    // add VarDescs of all input arguments for each OpDesc
    auto& input_field = op.inputs();
    for (auto& input_var : input_field) {
      for (auto& arg : input_var.arguments()) {
        *var_field->Add() = var_map[arg];
      }
    }
    // add VarDescs of all output arguments for each OpDesc
    auto& output_field = op.outputs();
    for (auto& output_var : output_field) {
      for (auto& arg : output_var.arguments()) {
        *var_field->Add() = var_map[arg];
      }
    }
  }
}
// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
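The variable-pruning step boils down to: snapshot the block's `VarDesc`s by name, clear the field, then re-add one entry per argument that the surviving ops still reference. A stripped-down illustration of the same idea with plain STL containers (all names hypothetical, not from this commit):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Variables originally declared in the block, keyed by name.
  std::unordered_map<std::string, std::string> var_map = {
      {"x", "VarDesc(x)"},
      {"w", "VarDesc(w)"},
      {"out", "VarDesc(out)"},
      {"unused", "VarDesc(unused)"}};

  // Arguments referenced by the ops that survived pruning.
  std::vector<std::string> referenced = {"x", "w", "out"};

  // Rebuild the variable list from the references; "unused" is dropped.
  // Like the diff above, this adds one entry per reference, so a variable
  // referenced by several ops would appear more than once.
  std::vector<std::string> pruned_vars;
  for (const auto& arg : referenced) {
    pruned_vars.push_back(var_map[arg]);
  }

  for (const auto& v : pruned_vars) {
    std::cout << v << "\n";  // VarDesc(x), VarDesc(w), VarDesc(out)
  }
  return 0;
}
```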

@ -14,11 +14,82 @@ limitations under the License. */
#include "paddle/framework/threadpool.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
std::once_flag ThreadPool::init_flag;
std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
std::once_flag ThreadPool::init_flag_;
ThreadPool* ThreadPool::GetInstance() {
  std::call_once(init_flag_, &ThreadPool::Init);
  return threadpool_.get();
}

void ThreadPool::Init() {
  if (threadpool_.get() == nullptr) {
    // TODO(Yancey1989): specify the max threads number
    int num_threads = std::thread::hardware_concurrency();
    PADDLE_ENFORCE_GT(num_threads, 0);
    threadpool_.reset(new ThreadPool(num_threads));
  }
}

ThreadPool::ThreadPool(int num_threads)
    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
  threads_.resize(num_threads);
  for (auto& thread : threads_) {
    // TODO(Yancey1989): binding the thread on the specify CPU number
    thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
  }
}

ThreadPool::~ThreadPool() {
  {
    // notify all threads to stop running
    running_ = false;
    scheduled_.notify_all();
  }

  for (auto& t : threads_) {
    t->join();
    t.reset(nullptr);
  }
}

void ThreadPool::Wait() {
  std::unique_lock<std::mutex> lock(mutex_);
  completed_.wait(lock, [=] { return Done() == true; });
}

void ThreadPool::TaskLoop() {
  while (running_) {
    std::unique_lock<std::mutex> lock(mutex_);
    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });

    if (!running_) {
      break;
    }
    // pop a task from the task queue
    auto task = std::move(tasks_.front());
    tasks_.pop();
    --idle_threads_;
    lock.unlock();

    // run the task
    task();

    {
      std::unique_lock<std::mutex> lock(mutex_);
      ++idle_threads_;
      if (Done()) {
        completed_.notify_all();
      }
    }
  }
}
} // namespace framework
} // namespace paddle
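A usage sketch of the refactored singleton pool (not part of the commit): queue a batch of lambdas with `Run()`, which returns a `std::future<void>` per task, then block on `Wait()` until the queue drains:

```cpp
#include <atomic>
#include <iostream>

#include "paddle/framework/threadpool.h"

int main() {
  // The pool is a process-wide singleton sized to the hardware concurrency.
  auto* pool = paddle::framework::ThreadPool::GetInstance();

  std::atomic<int> counter(0);
  for (int i = 0; i < 16; ++i) {
    // Run() returns a std::future<void>; here we rely on Wait() instead of
    // keeping the futures around.
    pool->Run([&counter]() { counter.fetch_add(1); });
  }

  pool->Wait();  // blocks until all queued tasks have completed
  std::cout << "ran " << counter.load() << " tasks\n";  // ran 16 tasks
  return 0;
}
```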

@ -20,52 +20,36 @@ limitations under the License. */
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#include "paddle/platform/enforce.h"
#include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
// ThreadPool maintains a queue of tasks, and runs them using a fixed
// number of threads.
class ThreadPool {
public:
typedef std::packaged_task<void()> Task;
/**
* @brief Get a instance of threadpool, the thread number will
* be specified as the number of hardware thread contexts
*/
static ThreadPool* GetInstance() {
std::call_once(init_flag, &ThreadPool::Init);
return threadpool.get();
}
// Returns the singleton of ThreadPool.
static ThreadPool* GetInstance();
~ThreadPool() {
{
// notify all threads to stop running
running_ = false;
scheduled_.notify_all();
}
~ThreadPool();
for (auto& t : threads_) {
t->join();
t.reset(nullptr);
}
}
// Returns the number of threads created by the constructor.
size_t Threads() const { return total_threads_; }
int GetNumThreads() const { return num_threads_; }
int GetAvailable() {
// Returns the number of currently idle threads.
size_t IdleThreads() {
std::unique_lock<std::mutex> lock(mutex_);
return available_;
return idle_threads_;
}
/**
* @brief Push a function to the queue, and will be scheduled and
* executed if a thread is available.
* @param[in] Task, will be pushed to the task queue.
* @return std::future<void>, we could wait for the task finished by
* f.wait().
*/
// Run pushes a function to the task queue and returns a std::future
// object. To wait for the completion of the task, call
// std::future::wait().
template <typename Callback>
std::future<void> Run(Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
@ -77,84 +61,40 @@ class ThreadPool {
return f;
}
/**
* @brief Wait until all the tasks are completed.
*/
void Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
// Wait until all the tasks are completed.
void Wait();
private:
DISABLE_COPY_AND_ASSIGN(ThreadPool);
explicit ThreadPool(int num_threads)
: num_threads_(num_threads), available_(num_threads), running_(true) {
threads_.resize(num_threads);
for (auto& thread : threads_) {
// TODO(Yancey1989): binding the thread on the specify CPU number
thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
}
}
explicit ThreadPool(int num_threads);
/**
* @brief If the task queue is empty and avaialbe
* is equal to the number of threads, means that
* all tasks are completed.
*
* Note: this function is not thread-safe.
*
* @return true if all tasks are completed.
*/
bool Done() { return tasks_.empty() && available_ == num_threads_; }
void TaskLoop() {
while (running_) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
// If the task queue is empty and avaialbe is equal to the number of
// threads, means that all tasks are completed. Note: this function
// is not thread-safe. Returns true if all tasks are completed.
// Note: don't delete the data member total_threads_ and use
// threads_.size() instead; because you'd need to lock the mutex
// before accessing threads_.
bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
if (!running_) {
break;
}
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
// The constructor starts threads to run TaskLoop, which retrieves
// and runs tasks from the queue.
void TaskLoop();
--available_;
lock.unlock();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++available_;
if (Done()) {
completed_.notify_all();
}
}
}
}
static void Init() {
if (threadpool.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool.reset(new ThreadPool(num_threads));
}
}
// Init is called by GetInstance.
static void Init();
private:
static std::unique_ptr<ThreadPool> threadpool;
static std::once_flag init_flag;
static std::unique_ptr<ThreadPool> threadpool_;
static std::once_flag init_flag_;
int num_threads_;
int available_;
bool running_;
std::queue<Task> tasks_;
std::vector<std::unique_ptr<std::thread>> threads_;
const size_t total_threads_;
size_t idle_threads_;
std::queue<Task> tasks_;
std::mutex mutex_;
bool running_;
std::condition_variable scheduled_;
std::condition_variable completed_;
};

@ -22,11 +22,7 @@ namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) {
auto f = pool->Run([&sum]() { sum.fetch_add(1); });
fs.push_back(std::move(f));
}
for (auto& f : fs) {
f.wait();
fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
}
}

@ -1,14 +1,14 @@
set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
cc_library(paddle_fluid_api
SRCS inference.cc
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library
add_library(paddle_fluid_shared SHARED inference.cc)
add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START
@ -20,7 +20,7 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
# install library & headers
if(NOT WITH_C_API AND WITH_FLUID)
install(FILES inference.h DESTINATION include/paddle/inference)
install(FILES io.h DESTINATION include/paddle/inference)
install(TARGETS paddle_fluid_shared DESTINATION lib)
endif()

@ -1,56 +0,0 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/block_desc.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
namespace paddle {
class InferenceEngine {
public:
InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
~InferenceEngine() {
delete program_;
delete load_program_;
}
framework::ProgramDesc* LoadInferenceModel(framework::Executor& exe,
framework::Scope* scope,
const std::string& dirname);
const std::vector<std::string>& GetFeedVarNames() const {
return feed_var_names_;
}
const std::vector<std::string>& GetFetchVarNames() const {
return fetch_var_names_;
}
private:
bool IsParameter(const framework::VarDesc* var);
void GenerateLoadProgram(const std::string& dirname);
private:
framework::ProgramDesc* program_;
framework::ProgramDesc* load_program_;
std::vector<std::string> feed_var_names_;
std::vector<std::string> fetch_var_names_;
};
} // namespace paddle

@ -1,4 +1,4 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -12,51 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "inference.h"
#include "paddle/inference/io.h"
#include <fstream>
namespace paddle {
namespace inference {
framework::ProgramDesc* InferenceEngine::LoadInferenceModel(
framework::Executor& exe,
framework::Scope* scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
program_ = new framework::ProgramDesc(program_desc_str);
GenerateLoadProgram(dirname);
exe.Run(*load_program_, scope, 0, true, true);
framework::BlockDesc* global_block = program_->MutableBlock(0);
feed_var_names_.clear();
fetch_var_names_.clear();
for (auto* op : global_block->AllOps()) {
if (op->Type() == "feed") {
feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
} else if (op->Type() == "fetch") {
fetch_var_names_.push_back(op->Input("X")[0]);
}
}
return program_;
}
const std::string kFeedOpType = "feed";
bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc* main_program) {
if (var->Persistable()) {
// There are many unreachable variables in the program
for (size_t i = 0; i < program_->Size(); ++i) {
const framework::BlockDesc& block = program_->Block(i);
for (size_t i = 0; i < main_program->Size(); ++i) {
const framework::BlockDesc& block = main_program->Block(i);
for (auto* op : block.AllOps()) {
if (op->Type() == "feed") {
if (op->Type() == kFeedOpType) {
continue;
}
for (auto input_argument_name : op->InputArgumentNames()) {
@ -70,13 +41,16 @@ bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
return false;
}
void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
framework::BlockDesc* global_block = program_->MutableBlock(0);
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
framework::ProgramDesc* main_program) {
framework::BlockDesc* global_block = main_program->MutableBlock(0);
load_program_ = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program_->MutableBlock(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program->MutableBlock(0);
for (auto* var : global_block->AllVars()) {
if (IsParameter(var)) {
if (IsParameter(var, main_program)) {
VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
@ -94,5 +68,30 @@ void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
op->CheckAttrs();
}
}
executor.Run(*load_program, &scope, 0, true, true);
delete load_program;
}
framework::ProgramDesc* Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
framework::ProgramDesc* main_program =
new framework::ProgramDesc(program_desc_str);
LoadPersistables(executor, scope, dirname, main_program);
return main_program;
}
} // namespace inference
} // namespace paddle

@ -0,0 +1,41 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <string>
#include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/var_desc.h"

namespace paddle {
namespace inference {

bool IsParameter(const framework::VarDesc* var,
                 const framework::ProgramDesc* main_program);

void LoadPersistables(framework::Executor& executor,
                      framework::Scope& scope,
                      const std::string& dirname,
                      framework::ProgramDesc* main_program);

framework::ProgramDesc* Load(framework::Executor& executor,
                             framework::Scope& scope,
                             const std::string& dirname);

}  // namespace inference
}  // namespace paddle

@ -16,8 +16,8 @@ limitations under the License. */
#include <time.h>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/framework/init.h"
#include "paddle/inference/inference.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@ -31,14 +31,13 @@ void TestInference(const std::string& dirname,
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load all parameters from file
paddle::InferenceEngine* engine = new paddle::InferenceEngine();
paddle::framework::ProgramDesc* inference_program =
engine->LoadInferenceModel(executor, scope, dirname);
auto* inference_program = paddle::inference::Load(executor, *scope, dirname);
// 3. Get the feed_var_names and fetch_var_names
const std::vector<std::string>& feed_target_names = engine->GetFeedVarNames();
const std::vector<std::string>& feed_target_names =
inference_program->GetFeedVarNames();
const std::vector<std::string>& fetch_target_names =
engine->GetFetchVarNames();
inference_program->GetFetchVarNames();
// 4. Prepare inputs
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
@ -56,8 +55,8 @@ void TestInference(const std::string& dirname,
// 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
delete inference_program;
delete scope;
delete engine;
}
TEST(inference, recognize_digits) {

@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)

@ -323,7 +323,7 @@ template <typename T>
struct FloorFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.ceil();
out.device(d) = x.floor();
}
};
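The one-line change above makes the floor activation compute floor instead of ceil. A trivial standalone check of the behavioral difference, independent of Eigen and of the functor itself:

```cpp
#include <cmath>
#include <iostream>

int main() {
  std::cout << std::ceil(2.7) << "\n";    // 3  (what the old code computed)
  std::cout << std::floor(2.7) << "\n";   // 2  (what a floor op should return)
  std::cout << std::ceil(-1.3) << "\n";   // -1
  std::cout << std::floor(-1.3) << "\n";  // -2
  return 0;
}
```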

@ -97,12 +97,27 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
return true;
}
bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out);
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
req_count_++;
return true;
}
bool RPCClient::Wait() {
if (req_count_ <= 0) {
return true;
}
std::vector<bool> a(req_count_);
const size_t kReqCnt = req_count_;
bool a[kReqCnt];
std::vector<std::future<void>> waits(req_count_);
for (int i = 0; i < req_count_; i++) {
