Commit fcb48440a1 ("Fix img links errors") on branch fea/docker_cudnn7, by weixing02

@ -21,6 +21,9 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
#include <string>
#include <vector>
namespace paddle {
namespace framework {
namespace details {
@ -168,6 +171,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
*/
PolishGraphToSupportDataHazards(&result);
/*
* Only variables should be the leaves of graph.
*/
AddOutputToLeafOps(&result);
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);

@ -136,6 +136,17 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
sout << "}\n";
}
void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
for (auto &op : graph->ops_) {
if (!op->outputs_.empty()) {
continue;
}
auto *dummy_leaf = new DummyVarHandle();
graph->dep_vars_.emplace(dummy_leaf);
op->AddOutput(dummy_leaf);
}
}
} // namespace details
} // namespace framework
} // namespace paddle

@ -14,13 +14,13 @@
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
#include <memory>
#include <string>
namespace paddle {
namespace framework {
namespace details {
@ -52,6 +52,8 @@ class SSAGraphBuilder {
const std::string &each_var_name,
const platform::Place &place, size_t place_offset);
static void AddOutputToLeafOps(SSAGraph *graph);
static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
};
} // namespace details

@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Step 2. Insert FetchOps
std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
std::vector<DummyVarHandle> dummy_vars;
FeedFetchList fetch_data(fetch_tensors.size());
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
@ -101,13 +100,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto &vars = fetched_vars.at(var_name);
auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
fetch_ops.emplace_back(op);
// FIXME: Use new device context
for (auto &p : places_) {
op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
}
@ -115,6 +114,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto *var : vars) {
op->AddInput(var);
}
auto *fetch_dummy = new DummyVarHandle();
op->AddOutput(fetch_dummy);
fetch_dependencies.emplace(fetch_dummy);
InsertPendingVar(*fetch_dummy);
InsertPendingOp(*op);
}

@ -45,11 +45,10 @@ class Tensor {
friend struct EigenVector;
public:
Tensor() : offset_(0), is_pinned_(false) {}
Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place)
: offset_(0), is_pinned_(false) {
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
@ -70,12 +69,11 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place, bool is_pinned = false);
inline T* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, std::type_index type,
bool is_pinned = false);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place, bool is_pinned = false);
inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
@ -86,8 +84,7 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place,
bool is_pinned = false);
inline T* mutable_data(DDim dims, platform::Place place);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
@ -95,9 +92,6 @@ class Tensor {
/*! Return the numel of the memory block. */
inline int64_t numel() const;
/*! Return the numel of the memory block. */
inline bool isPinned() const;
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
@ -152,14 +146,12 @@ class Tensor {
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type,
bool is_pinned = false)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
type_(type),
is_pinned_(is_pinned) {
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
@ -182,9 +174,6 @@ class Tensor {
/* the current type of memory */
std::type_index type_;
/*! use pinned memory or not. */
bool is_pinned_;
};
/*! holds the memory block if allocated. */
@ -219,7 +208,6 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
bool is_pinned_;
};
inline void Tensor::switch_place(platform::Place new_place) {
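Not part of the diff: a minimal sketch of the simplified Tensor API after the tensor.h hunks above (the matching implementation changes appear in the tensor_impl.h hunks below). The function name is illustrative; it only uses the mutable_data signatures shown in this diff, which no longer take an is_pinned flag — pinned host memory is instead requested through the memory API with CUDAPinnedPlace (see the memory.cc hunks further down).
#include "paddle/fluid/framework/tensor.h"
void tensor_alloc_example() {
  paddle::framework::Tensor t;
  // Allocate 2 x 3 floats on the host; the former is_pinned argument is gone.
  float* cpu_data = t.mutable_data<float>(
      paddle::framework::make_ddim({2, 3}), paddle::platform::CPUPlace());
  (void)cpu_data;
}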

@ -101,21 +101,19 @@ inline T* Tensor::data() {
}
template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place,
bool is_pinned) {
inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>(place, is_pinned);
return mutable_data<T>(place);
}
template <typename T>
inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
bool is_pinned) {
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type, is_pinned));
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
boost::get<platform::CUDAPlace>(place), size, type));
}
#endif
offset_ = 0;
is_pinned_ = is_pinned;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type(), is_pinned);
return mutable_data(place, holder_->type());
}
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
inline bool Tensor::isPinned() const { return is_pinned_; }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);

@ -148,6 +148,11 @@ struct AnyVisitor : public boost::static_visitor<bool> {
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>

@ -4,13 +4,17 @@ cc_library(memory SRCS memory.cc DEPS place enforce)
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory
DEPS
memory
memcpy
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
DEPS
memory
memcpy
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
#if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory)
#endif()

@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
@ -134,21 +135,31 @@ bool GPUAllocator::UseGpu() const { return true; }
// memory. It's locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
if (size <= 0) return nullptr;
void* p;
// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
// of host pinned allocation. Allocating too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable =
paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
if (size > usable) return nullptr;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB pinned memory."
<< ", available " << usable / 1024.0 / 1024.0 << " MB";
return nullptr;
}
void* p;
// PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
return p;
} else {
LOG(WARNING) << "cudaMallocHost failed.";
return nullptr;
}
return nullptr;
@ -158,8 +169,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
PADDLE_ASSERT(index == 1);
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
cuda_pinnd_alloc_size_ -= size;
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
@ -172,7 +183,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
}
}
bool CUDAPinnedAllocator::UseGpu() const { return true; }
bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
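Not part of the diff: a minimal sketch of how the renamed accounting in CUDAPinnedAllocator is exercised, assuming a CUDA build. The function name is illustrative; only the Alloc/Free signatures visible above are used.
#include "paddle/fluid/memory/detail/system_allocator.h"
#ifdef PADDLE_WITH_CUDA
void pinned_roundtrip() {
  paddle::memory::detail::CUDAPinnedAllocator allocator;
  size_t index = 0;              // Alloc sets this to 1 to mark pinned host memory
  const size_t bytes = 1 << 20;  // 1 MB, well below CUDAPinnedMaxAllocSize()
  void* p = allocator.Alloc(index, bytes);  // cudaMallocHost under the hood
  if (p != nullptr) {
    // Free asserts index == 1 and decrements cuda_pinnd_alloc_size_.
    allocator.Free(p, bytes, index);
  }
}
#endif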

@ -21,8 +21,9 @@ namespace memory {
namespace detail {
/**
* \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
* A BuddyAllocator object uses a SystemAllocator* pointing to the
* \brief SystemAllocator is the parent class of CPUAllocator,
* CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator
* object uses a SystemAllocator* pointing to the
* underlying system allocator.
*/
class SystemAllocator {
@ -62,9 +63,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ =
0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
size_t fallback_alloc_size_ = 0;
size_t cuda_pinnd_alloc_size_ = 0;
};
#endif

@ -56,6 +56,45 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
}
}
template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
platform::CPUPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CPUPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}
#endif
} // namespace memory
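Not part of the diff: a short sketch of the new pinned-host copy path, assuming a CUDA build and an existing cudaStream_t. The function name and device id are illustrative; the call resolves to the Copy specializations added above.
#include <cuda_runtime.h>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"
void pinned_to_gpu_async(void* gpu_dst, const void* pinned_src, size_t bytes,
                         cudaStream_t stream) {
  paddle::platform::CUDAPlace gpu(0);  // device 0, chosen for illustration
  paddle::platform::CUDAPinnedPlace pinned;
  // Pinned host -> device copy; asynchronous on `stream` because the source is pinned.
  paddle::memory::Copy(gpu, gpu_dst, pinned, pinned_src, bytes, stream);
}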

@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
}
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
bool is_pinned) {
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p;
@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
bool is_pinned) {
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
@ -84,47 +82,15 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
return as[gpu_id];
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = nullptr;
}
}
platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
return as[gpu_id];
}
template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
bool is_pinned) {
void* ptr;
if (is_pinned) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
} else {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
}
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
@ -142,15 +108,42 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
}
template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
bool is_pinned) {
if (is_pinned) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
} else {
GetGPUBuddyAllocator(place.device)->Free(p);
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
static BuddyAllocator* ba = NULL;
if (ba == NULL) {
ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
}
return ba;
}
template <>
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
return GetCUDAPinnedBuddyAllocator()->Used();
}
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
size_t size) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
void* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
return ptr;
}
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
GetCUDAPinnedBuddyAllocator()->Free(p);
}
#endif
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
@ -165,6 +158,14 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#endif
}
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
return Used(cuda_pinned);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
size_t memory_usage(const platform::Place& p) {
return boost::apply_visitor(Usage(), p);
}
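Not part of the diff: a minimal usage sketch of the public memory API after this change, assuming a CUDA build. Pinned host memory is now requested by place type rather than by an is_pinned flag; the CUDAPinnedAllocator test added further below does essentially the same thing.
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/place.h"
void pinned_alloc_example() {
  paddle::platform::CUDAPinnedPlace pinned_place;
  void* p = paddle::memory::Alloc(pinned_place, 4096);  // served by the pinned buddy allocator
  size_t used = paddle::memory::Used(pinned_place);     // bytes tracked for CUDAPinnedPlace
  (void)used;
  paddle::memory::Free(pinned_place, p);
}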

@ -33,7 +33,7 @@ namespace memory {
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size, bool is_pinned = false);
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool is_pinned = false);
*
*/
template <typename Place>
void Free(Place place, void* ptr, bool is_pinned = false);
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
@ -57,6 +57,7 @@ size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
@ -74,13 +75,11 @@ class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place, bool is_pinned = false)
: place_(place), is_pinned_(is_pinned) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
bool is_pinned_;
};
/**

@ -141,4 +141,59 @@ TEST(BuddyAllocator, GPUMultAlloc) {
}
}
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, CUDAPinnedAllocator) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CUDAPinnedPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
paddle::platform::CUDAPinnedPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#endif

@ -0,0 +1,147 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <unordered_map>
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
// This unit test is an example comparing the performance between using pinned
// memory and not. In general, using pinned memory will be faster.
template <typename T>
__global__ void Kernel(T* output, int dim) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < dim) {
output[tid] = output[tid] * output[tid] / 100;
}
}
template <typename Place>
float test_pinned_memory() {
Place cpu_place;
paddle::platform::CUDAPlace cuda_place;
const int data_size = 4096;
const int iteration = 10;
// create event start and end
cudaEvent_t start_e, stop_e, copying_e;
float elapsedTime = 0;
cudaEventCreate(&start_e);
cudaEventCreate(&stop_e);
cudaEventCreate(&copying_e);
// create computation stream, data copying stream
cudaStream_t computation_stream, copying_stream;
cudaStreamCreate(&computation_stream);
cudaStreamCreate(&copying_stream);
// create record event, pinned memory, gpu memory
std::vector<cudaEvent_t> record_event(iteration);
std::vector<float*> input_pinned_mem(iteration);
std::vector<float*> gpu_mem(iteration);
std::vector<float*> output_pinned_mem(iteration);
// initial data
for (int j = 0; j < iteration; ++j) {
cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming);
cudaEventCreate(&(record_event[j]));
input_pinned_mem[j] = static_cast<float*>(
paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
output_pinned_mem[j] = static_cast<float*>(
paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
gpu_mem[j] = static_cast<float*>(
paddle::memory::Alloc(cuda_place, data_size * sizeof(float)));
for (int k = 0; k < data_size; ++k) {
input_pinned_mem[j][k] = k;
}
}
cudaEventRecord(start_e, computation_stream);
// computation
for (int m = 0; m < 30; ++m) {
for (int i = 0; i < iteration; ++i) {
// cpu -> GPU on computation stream.
// note: this operation is async for pinned memory.
paddle::memory::Copy(cuda_place, gpu_mem[i], cpu_place,
input_pinned_mem[i], data_size * sizeof(float),
computation_stream);
// call kernel on computation stream.
Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size);
// record event_computation on computation stream
cudaEventRecord(record_event[i], computation_stream);
// wait event_computation on copy stream.
// note: this operation is async.
cudaStreamWaitEvent(copying_stream, record_event[i], 0);
// copy data GPU->CPU, on copy stream.
// note: this operation is async for pinned memory.
paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place,
gpu_mem[i], data_size * sizeof(float),
copying_stream);
}
}
cudaEventRecord(copying_e, copying_stream);
cudaStreamWaitEvent(computation_stream, copying_e, 0);
cudaEventRecord(stop_e, computation_stream);
cudaEventSynchronize(start_e);
cudaEventSynchronize(stop_e);
cudaEventElapsedTime(&elapsedTime, start_e, stop_e);
// std::cout << cpu_place << " "
// << "time consume:" << elapsedTime / 30 << std::endl;
for (int l = 0; l < iteration; ++l) {
for (int k = 0; k < data_size; ++k) {
float temp = input_pinned_mem[l][k];
temp = temp * temp / 100;
EXPECT_FLOAT_EQ(temp, output_pinned_mem[l][k]);
}
}
// destroy resource
cudaEventDestroy(copying_e);
cudaEventDestroy(start_e);
cudaEventDestroy(stop_e);
for (int j = 0; j < 10; ++j) {
cudaEventDestroy((record_event[j]));
paddle::memory::Free(cpu_place, input_pinned_mem[j]);
paddle::memory::Free(cpu_place, output_pinned_mem[j]);
paddle::memory::Free(cuda_place, gpu_mem[j]);
}
return elapsedTime / 30;
}
TEST(CPUANDCUDAPinned, CPUAllocatorAndCUDAPinnedAllocator) {
// Generally speaking, operation on pinned_memory is faster than that on
// unpinned-memory, but if this unit test fails frequently, please close this
// test for the time being.
float time1 = test_pinned_memory<paddle::platform::CPUPlace>();
float time2 = test_pinned_memory<paddle::platform::CUDAPinnedPlace>();
EXPECT_GT(time1, time2);
}

@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif

@ -17,7 +17,7 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
namespace paddle {
namespace operators {

@ -19,9 +19,11 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <vector>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "grpc++/grpc++.h"
namespace grpc {
// A ZeroCopyInputStream that reads from grpc_byte_buffer
@ -56,7 +58,7 @@ class GrpcBufferReader final
*data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
backup_count_;
GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
*size = (int)backup_count_;
*size = static_cast<int>(backup_count_);
backup_count_ = 0;
return true;
}
@ -68,7 +70,7 @@ class GrpcBufferReader final
*data = GRPC_SLICE_START_PTR(slice_);
// On win x64, int is only 32bit
GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
byte_count_ += * size = static_cast<int>(GRPC_SLICE_LENGTH(slice_));
return true;
}

@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h"
#include <sys/time.h>
#include <limits>
#include "paddle/fluid/framework/threadpool.h"
@ -54,7 +56,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
});
req_count_++;
@ -66,7 +68,7 @@ void ProcGetResponse(const VarHandle& var_h,
// const sendrecv::VariableMessage& ret_msg) {
const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = NULL;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar);
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
}
template <typename T>
@ -110,7 +112,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
});
req_count_++;
@ -170,7 +172,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}
@ -182,7 +184,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}

@ -14,10 +14,9 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <grpc/support/log.h>
#include <time.h>
#include <chrono>
#include <chrono> // NOLINT
#include <ctime>
#include <functional>
#include <iostream>
@ -25,11 +24,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include <grpc++/generic/generic_stub.h>
#include <grpc++/grpc++.h>
#include <grpc++/support/byte_buffer.h>
#include <grpc++/support/slice.h>
#include "grpc++/generic/generic_stub.h"
#include "grpc++/grpc++.h"
#include "grpc++/support/byte_buffer.h"
#include "grpc++/support/slice.h"
#include "grpc/support/log.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"

@ -273,7 +273,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
// FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
std::string cq_name,
const std::string& cq_name,
std::function<void()> TryToRegisterNewOne) {
TryToRegisterNewOne();

@ -14,10 +14,11 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <string>
#include <thread> // NOLINT
#include <utility>
#include "grpc++/grpc++.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
@ -71,7 +72,8 @@ class AsyncGRPCServer final {
void ShutDown();
protected:
void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name,
void HandleRequest(::grpc::ServerCompletionQueue *cq,
const std::string &cq_name,
std::function<void()> TryToRegisterNewOne);
void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne();

@ -14,7 +14,7 @@ limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/operators/detail/grpc_client.h"

@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <string>
#include "grpc++/grpc++.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
@ -142,6 +144,6 @@ class ProtoEncodeHelper {
char* limit_; // Just for CHECKs
};
} // detail
} // operators
} // paddle
} // namespace detail
} // namespace operators
} // namespace paddle

@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include <sys/time.h>
#include <thread>
#include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
@ -42,7 +44,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void* buf = malloc(1024);
void* payload = nullptr;
size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024);
ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
@ -152,7 +154,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
framework::proto::VarType_Type_SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2((char*)buf, 128);
ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
// NOTE: rows is of type int64_t
size_t rows_memory_size =
slr->rows().size() * framework::SizeOfType(typeid(int64_t));
@ -181,10 +183,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable*& var) {
framework::Variable** var) {
operators::detail::VariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
var = resp.GetVar();
*var = resp.GetVar();
}
} // namespace detail

Some files were not shown because too many files have changed in this diff.
