Fix img links errors
fea/docker_cudnn7
weixing02 7 years ago
commit fcb48440a1

@@ -21,6 +21,9 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
+#include <string>
+#include <vector>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -168,6 +171,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
  */
 PolishGraphToSupportDataHazards(&result);
+/*
+ * Only variables should be the leaves of graph.
+ */
+AddOutputToLeafOps(&result);
 if (VLOG_IS_ON(10)) {
 std::ostringstream sout;
 PrintGraphviz(*graph, sout);

@@ -136,6 +136,17 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
 sout << "}\n";
 }
+
+void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
+  for (auto &op : graph->ops_) {
+    if (!op->outputs_.empty()) {
+      continue;
+    }
+    auto *dummy_leaf = new DummyVarHandle();
+    graph->dep_vars_.emplace(dummy_leaf);
+    op->AddOutput(dummy_leaf);
+  }
+}
 } // namespace details
 } // namespace framework
 } // namespace paddle

@@ -14,13 +14,13 @@
 #pragma once
+#include <memory>
+#include <string>
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"
-#include <memory>
-#include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -52,6 +52,8 @@ class SSAGraphBuilder {
 const std::string &each_var_name,
 const platform::Place &place, size_t place_offset);
+
+static void AddOutputToLeafOps(SSAGraph *graph);
 static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 } // namespace details

@@ -87,7 +87,6 @@
 // Step 2. Insert FetchOps
 std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
-std::vector<DummyVarHandle> dummy_vars;
 FeedFetchList fetch_data(fetch_tensors.size());
 std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
@@ -101,13 +100,13 @@
 }
 }
+std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
 for (size_t i = 0; i < fetch_tensors.size(); ++i) {
 auto &var_name = fetch_tensors[i];
 auto &vars = fetched_vars.at(var_name);
 auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
 fetch_ops.emplace_back(op);
-// FIXME: Use new device context
 for (auto &p : places_) {
 op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
 }
@@ -115,6 +114,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 for (auto *var : vars) {
 op->AddInput(var);
 }
+auto *fetch_dummy = new DummyVarHandle();
+op->AddOutput(fetch_dummy);
+fetch_dependencies.emplace(fetch_dummy);
+InsertPendingVar(*fetch_dummy);
 InsertPendingOp(*op);
 }

@@ -45,11 +45,10 @@ class Tensor {
 friend struct EigenVector;
 public:
-Tensor() : offset_(0), is_pinned_(false) {}
+Tensor() : offset_(0) {}
 /*! Constructor with place should only be used in pybind. */
-explicit Tensor(const platform::Place& place)
-: offset_(0), is_pinned_(false) {
+explicit Tensor(const platform::Place& place) : offset_(0) {
 holder_->set_place(place);
 }
@@ -70,12 +69,11 @@ class Tensor {
 * @note If not exist, then allocation.
 */
 template <typename T>
-inline T* mutable_data(platform::Place place, bool is_pinned = false);
+inline T* mutable_data(platform::Place place);
-inline void* mutable_data(platform::Place place, std::type_index type,
-bool is_pinned = false);
+inline void* mutable_data(platform::Place place, std::type_index type);
-inline void* mutable_data(platform::Place place, bool is_pinned = false);
+inline void* mutable_data(platform::Place place);
 /**
 * @brief Return a pointer to mutable memory block.
@@ -86,8 +84,7 @@ class Tensor {
 * @note If not exist, then allocation.
 */
 template <typename T>
-inline T* mutable_data(DDim dims, platform::Place place,
-bool is_pinned = false);
+inline T* mutable_data(DDim dims, platform::Place place);
 /*! Return the dimensions of the memory block. */
 inline const DDim& dims() const;
@@ -95,9 +92,6 @@
 /*! Return the numel of the memory block. */
 inline int64_t numel() const;
-/*! Return the numel of the memory block. */
-inline bool isPinned() const;
 /*! Resize the dimensions of the memory block. */
 inline Tensor& Resize(const DDim& dims);
@@ -152,14 +146,12 @@ class Tensor {
 template <typename Place>
 struct PlaceholderImpl : public Placeholder {
-PlaceholderImpl(Place place, size_t size, std::type_index type,
-bool is_pinned = false)
-: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
-memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
+PlaceholderImpl(Place place, size_t size, std::type_index type)
+: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+memory::PODDeleter<uint8_t, Place>(place)),
 place_(place),
 size_(size),
-type_(type),
-is_pinned_(is_pinned) {
+type_(type) {
 PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
 (is_cpu_place(place_) ? "CPU" : "GPU"));
 }
@@ -182,9 +174,6 @@ class Tensor {
 /* the current type of memory */
 std::type_index type_;
-/*! use pinned memory or not. */
-bool is_pinned_;
 };
 /*! holds the memory block if allocated. */
@@ -219,7 +208,6 @@
 * PlaceHolder::ptr_ and where the tensor data really begins.
 */
 size_t offset_;
-bool is_pinned_;
 };
 inline void Tensor::switch_place(platform::Place new_place) {

@@ -101,21 +101,19 @@ inline T* Tensor::data() {
 }
 template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
-bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
 static_assert(std::is_pod<T>::value, "T must be POD");
 Resize(dims);
-return mutable_data<T>(place, is_pinned);
+return mutable_data<T>(place);
 }
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
 static_assert(std::is_pod<T>::value, "T must be POD");
-return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
+return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
-bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
 if (holder_ != nullptr) {
 holder_->set_type(type);
 }
@@ -129,27 +127,26 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
 holder_->size() < size + offset_) {
 if (platform::is_cpu_place(place)) {
 holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-boost::get<platform::CPUPlace>(place), size, type, is_pinned));
+boost::get<platform::CPUPlace>(place), size, type));
 } else if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
 PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 }
 #else
 holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
+boost::get<platform::CUDAPlace>(place), size, type));
 }
 #endif
 offset_ = 0;
-is_pinned_ = is_pinned;
 }
 return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
 offset_);
 }
-inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place) {
 PADDLE_ENFORCE(this->holder_ != nullptr,
 "Cannot invoke mutable data if current hold nothing");
-return mutable_data(place, holder_->type(), is_pinned);
+return mutable_data(place, holder_->type());
 }
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -191,8 +188,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
 inline int64_t Tensor::numel() const { return product(dims_); }
-inline bool Tensor::isPinned() const { return is_pinned_; }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
 Tensor res;
 res.ShareDataWith(src);

@@ -148,6 +148,11 @@ struct AnyVisitor : public boost::static_visitor<bool> {
 const platform::CPUPlace& cpu) const {
 return *out.data<bool>();
 }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPinnedPlace& cpu) const {
+    return *out.data<bool>();
+  }
 };
 template <typename Predicate>

@@ -4,13 +4,17 @@ cc_library(memory SRCS memory.cc DEPS place enforce)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 cc_library(paddle_memory
 DEPS
 memory
 memcpy
 meta_data
 meta_cache
 memory_block
 buddy_allocator
 system_allocator)
 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
+
+#if (WITH_GPU)
+# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory)
+#endif()

@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -134,21 +135,31 @@ bool GPUAllocator::UseGpu() const { return true; }
 // memory. Its locked to a physical address.
 void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
 if (size <= 0) return nullptr;
-void* p;
-// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
 // of host pinned allocation. Allocates too much would reduce
 // the amount of memory available to the underlying system for paging.
-size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
-if (size > usable) return nullptr;
+size_t usable =
+paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
+if (size > usable) {
+LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+<< " MB pinned memory."
+<< ", available " << usable / 1024.0 / 1024.0 << " MB";
+return nullptr;
+}
+void* p;
 // PINNED memory is visible to all CUDA contexts.
 cudaError_t result = cudaMallocHost(&p, size);
 if (result == cudaSuccess) {
-index = 1;
-fallback_alloc_size_ += size;
+index = 1; // PINNED memory
+cuda_pinnd_alloc_size_ += size;
 return p;
+} else {
+LOG(WARNING) << "cudaMallocHost failed.";
+return nullptr;
 }
 return nullptr;
@@ -158,8 +169,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
 cudaError_t err;
 PADDLE_ASSERT(index == 1);
-PADDLE_ASSERT(fallback_alloc_size_ >= size);
-fallback_alloc_size_ -= size;
+PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
+cuda_pinnd_alloc_size_ -= size;
 err = cudaFreeHost(p);
 // Purposefully allow cudaErrorCudartUnloading, because
@@ -172,7 +183,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
 }
 }
-bool CUDAPinnedAllocator::UseGpu() const { return true; }
+bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif

@@ -21,8 +21,9 @@ namespace memory {
 namespace detail {
 /**
-* \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
-* A BuddyAllocator object uses a SystemAllocator* pointing to the
+* \brief SystemAllocator is the parent class of CPUAllocator,
+* CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator
+* object uses a SystemAllocator* pointing to the
 * underlying system allocator.
 */
 class SystemAllocator {
@@ -62,9 +63,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
 virtual bool UseGpu() const;
 private:
-size_t gpu_alloc_size_ =
-0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
-size_t fallback_alloc_size_ = 0;
+size_t cuda_pinnd_alloc_size_ = 0;
 };
 #endif

@@ -56,6 +56,45 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
 }
 }
+
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CPUPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
 #endif
 } // namespace memory
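Reviewer note (not part of the diff): the CUDAPinnedPlace/CUDAPlace specializations above issue GpuMemcpyAsync on a caller-supplied stream, so transfers staged through pinned host memory can overlap with computation on another stream. A minimal usage sketch, assuming a CUDA build and the allocator specializations added later in this diff; the function and buffer names are illustrative only:

    #include <cuda_runtime.h>
    #include "paddle/fluid/memory/memcpy.h"
    #include "paddle/fluid/memory/memory.h"
    #include "paddle/fluid/platform/place.h"

    // Illustrative sketch: copy a pinned host buffer to the GPU asynchronously.
    void PinnedCopySketch() {
      paddle::platform::CUDAPinnedPlace pinned_place;
      paddle::platform::CUDAPlace gpu_place;
      const size_t num_bytes = 4096 * sizeof(float);
      // Pinned host buffer and device buffer from the memory module.
      void* host_buf = paddle::memory::Alloc(pinned_place, num_bytes);
      void* dev_buf = paddle::memory::Alloc(gpu_place, num_bytes);
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      // Asynchronous host->device copy; returns before the transfer completes.
      paddle::memory::Copy(gpu_place, dev_buf, pinned_place, host_buf, num_bytes,
                           stream);
      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);
      paddle::memory::Free(gpu_place, dev_buf);
      paddle::memory::Free(pinned_place, host_buf);
    }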

@@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
 }
 template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
-bool is_pinned) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
 VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
 void* p = GetCPUBuddyAllocator()->Alloc(size);
 VLOG(10) << " pointer=" << p;
@@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
 }
 template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
-bool is_pinned) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
 VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
 GetCPUBuddyAllocator()->Free(p);
 }
@@ -84,47 +82,15 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 return as[gpu_id];
 }
-BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
-static BuddyAllocator** as = NULL;
-if (as == NULL) {
-int gpu_num = platform::GetCUDADeviceCount();
-as = new BuddyAllocator*[gpu_num];
-for (int gpu = 0; gpu < gpu_num; gpu++) {
-as[gpu] = nullptr;
-}
-}
-platform::SetDeviceId(gpu_id);
-if (!as[gpu_id]) {
-as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
-platform::GpuMinChunkSize(),
-platform::GpuMaxChunkSize());
-VLOG(10) << "\n\nNOTE: each GPU device use "
-<< FLAGS_fraction_of_gpu_memory_to_use * 100
-<< "% of GPU memory.\n"
-<< "You can set GFlags environment variable '"
-<< "FLAGS_fraction_of_gpu_memory_to_use"
-<< "' to change the fraction of GPU usage.\n\n";
-}
-return as[gpu_id];
-}
 template <>
 size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 return GetGPUBuddyAllocator(place.device)->Used();
 }
 template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
-bool is_pinned) {
-void* ptr;
-if (is_pinned) {
-auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
-ptr = buddy_allocator->Alloc(size);
-} else {
-auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-ptr = buddy_allocator->Alloc(size);
-}
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
+auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+auto* ptr = buddy_allocator->Alloc(size);
 if (ptr == nullptr) {
 int cur_dev = platform::GetCurrentDeviceId();
 platform::SetDeviceId(place.device);
@@ -142,15 +108,42 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
 }
 template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
-bool is_pinned) {
-if (is_pinned) {
-GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
-} else {
-GetGPUBuddyAllocator(place.device)->Free(p);
-}
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
+GetGPUBuddyAllocator(place.device)->Free(p);
 }
+
+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  static BuddyAllocator* ba = NULL;
+  if (ba == NULL) {
+    ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                            platform::CUDAPinnedMinChunkSize(),
+                            platform::CUDAPinnedMaxChunkSize());
+  }
+  return ba;
+}
+
+template <>
+size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
+  return GetCUDAPinnedBuddyAllocator()->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
+  void* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+                 << " bytes in CUDAPinnedPlace";
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
+  GetCUDAPinnedBuddyAllocator()->Free(p);
+}
 #endif
 size_t Usage::operator()(const platform::CPUPlace& cpu) const {
@@ -165,6 +158,14 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
 #endif
 }
+
+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(cuda_pinned);
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
+}
 size_t memory_usage(const platform::Place& p) {
 return boost::apply_visitor(Usage(), p);
 }
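Reviewer note (not part of the diff): with the is_pinned flag gone, pinned host memory is now requested by passing a CUDAPinnedPlace to the ordinary Alloc/Free/Used entry points, which dispatch to the dedicated pinned buddy allocator above. A small sketch of the resulting interface, mirroring the unit test added later in this diff and assuming a CUDA build; the function name is illustrative:

    #include "paddle/fluid/memory/memory.h"
    #include "paddle/fluid/platform/place.h"

    // Illustrative sketch: the place type alone selects the pinned allocator.
    void PinnedAllocSketch() {
      paddle::platform::CUDAPinnedPlace pinned_place;
      void* p = paddle::memory::Alloc(pinned_place, 4096);
      size_t used = paddle::memory::Used(pinned_place);  // bytes tracked for pinned memory
      paddle::memory::Free(pinned_place, p);
      (void)used;
    }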

@@ -33,7 +33,7 @@ namespace memory {
 * address is valid or not.
 */
 template <typename Place>
-void* Alloc(Place place, size_t size, bool is_pinned = false);
+void* Alloc(Place place, size_t size);
 /**
 * \brief Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool is_pinned = false);
 *
 */
 template <typename Place>
-void Free(Place place, void* ptr, bool is_pinned = false);
+void Free(Place place, void* ptr);
 /**
 * \brief Total size of used memory in one place.
@@ -57,6 +57,7 @@ size_t Used(Place place);
 struct Usage : public boost::static_visitor<size_t> {
 size_t operator()(const platform::CPUPlace& cpu) const;
 size_t operator()(const platform::CUDAPlace& gpu) const;
+size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
 };
 size_t memory_usage(const platform::Place& p);
@@ -74,13 +75,11 @@ class PODDeleter {
 static_assert(std::is_pod<T>::value, "T must be POD");
 public:
-explicit PODDeleter(Place place, bool is_pinned = false)
-: place_(place), is_pinned_(is_pinned) {}
-void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
+explicit PODDeleter(Place place) : place_(place) {}
+void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
 private:
 Place place_;
-bool is_pinned_;
 };
 /**

@@ -141,4 +141,59 @@ TEST(BuddyAllocator, GPUMultAlloc) {
 }
 }
+
+size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CUDAPinnedAllocator) {
+  void *p = nullptr;
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CUDAPinnedPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = cpu;
+  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
+  paddle::platform::CUDAPinnedPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
 #endif

@@ -0,0 +1,147 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/memory/detail/meta_data.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+
+// This unit test is an example comparing the performance between using pinned
+// memory and not. In general, using pinned memory will be faster.
+template <typename T>
+__global__ void Kernel(T* output, int dim) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < dim) {
+    output[tid] = output[tid] * output[tid] / 100;
+  }
+}
+
+template <typename Place>
+float test_pinned_memory() {
+  Place cpu_place;
+  paddle::platform::CUDAPlace cuda_place;
+
+  const int data_size = 4096;
+  const int iteration = 10;
+
+  // create event start and end
+  cudaEvent_t start_e, stop_e, copying_e;
+  float elapsedTime = 0;
+  cudaEventCreate(&start_e);
+  cudaEventCreate(&stop_e);
+  cudaEventCreate(&copying_e);
+
+  // create computation stream, data copying stream
+  cudaStream_t computation_stream, copying_stream;
+  cudaStreamCreate(&computation_stream);
+  cudaStreamCreate(&copying_stream);
+
+  // create record event, pinned memory, gpu memory
+  std::vector<cudaEvent_t> record_event(iteration);
+  std::vector<float*> input_pinned_mem(iteration);
+  std::vector<float*> gpu_mem(iteration);
+  std::vector<float*> output_pinned_mem(iteration);
+
+  // initial data
+  for (int j = 0; j < iteration; ++j) {
+    cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming);
+    cudaEventCreate(&(record_event[j]));
+    input_pinned_mem[j] = static_cast<float*>(
+        paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
+    output_pinned_mem[j] = static_cast<float*>(
+        paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
+    gpu_mem[j] = static_cast<float*>(
+        paddle::memory::Alloc(cuda_place, data_size * sizeof(float)));
+
+    for (int k = 0; k < data_size; ++k) {
+      input_pinned_mem[j][k] = k;
+    }
+  }
+
+  cudaEventRecord(start_e, computation_stream);
+
+  // computation
+  for (int m = 0; m < 30; ++m) {
+    for (int i = 0; i < iteration; ++i) {
+      // cpu -> GPU on computation stream.
+      // note: this operation is async for pinned memory.
+      paddle::memory::Copy(cuda_place, gpu_mem[i], cpu_place,
+                           input_pinned_mem[i], data_size * sizeof(float),
+                           computation_stream);
+
+      // call kernel on computation stream.
+      Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size);
+
+      // record event_computation on computation stream
+      cudaEventRecord(record_event[i], computation_stream);
+
+      // wait event_computation on copy stream.
+      // note: this operation is async.
+      cudaStreamWaitEvent(copying_stream, record_event[i], 0);
+
+      // copy data GPU->CPU, on copy stream.
+      // note: this operation is async for pinned memory.
+      paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place,
+                           gpu_mem[i], data_size * sizeof(float),
+                           copying_stream);
+    }
+  }
+
+  cudaEventRecord(copying_e, copying_stream);
+  cudaStreamWaitEvent(computation_stream, copying_e, 0);
+
+  cudaEventRecord(stop_e, computation_stream);
+
+  cudaEventSynchronize(start_e);
+  cudaEventSynchronize(stop_e);
+  cudaEventElapsedTime(&elapsedTime, start_e, stop_e);
+
+  // std::cout << cpu_place << " "
+  //           << "time consume:" << elapsedTime / 30 << std::endl;
+
+  for (int l = 0; l < iteration; ++l) {
+    for (int k = 0; k < data_size; ++k) {
+      float temp = input_pinned_mem[l][k];
+      temp = temp * temp / 100;
+      EXPECT_FLOAT_EQ(temp, output_pinned_mem[l][k]);
+    }
+  }
+
+  // destroy resource
+  cudaEventDestroy(copying_e);
+  cudaEventDestroy(start_e);
+  cudaEventDestroy(stop_e);
+  for (int j = 0; j < 10; ++j) {
+    cudaEventDestroy((record_event[j]));
+    paddle::memory::Free(cpu_place, input_pinned_mem[j]);
+    paddle::memory::Free(cpu_place, output_pinned_mem[j]);
+    paddle::memory::Free(cuda_place, gpu_mem[j]);
+  }
+  return elapsedTime / 30;
+}
+
+TEST(CPUANDCUDAPinned, CPUAllocatorAndCUDAPinnedAllocator) {
+  // Generally speaking, operation on pinned_memory is faster than that on
+  // unpinned-memory, but if this unit test fails frequently, please close this
+  // test for the time being.
+  float time1 = test_pinned_memory<paddle::platform::CPUPlace>();
+  float time2 = test_pinned_memory<paddle::platform::CUDAPinnedPlace>();
+  EXPECT_GT(time1, time2);
+}

@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
+
+#include <string>
+#include <vector>
+
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif

@@ -17,7 +17,7 @@ limitations under the License. */
 // file and did some modifications so that we can send gRPC
 // requests without too much copying of the tensor data.
-#include "bytebuffer_stream.h"
+#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
 namespace paddle {
 namespace operators {

@@ -19,9 +19,11 @@ limitations under the License. */
 #pragma once
-#include <grpc++/grpc++.h>
+#include <vector>
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
+#include "grpc++/grpc++.h"
 namespace grpc {
 // A ZeroCopyInputStream that reads from grpc_byte_buffer
@@ -56,7 +58,7 @@ class GrpcBufferReader final
 *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
 backup_count_;
 GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
-*size = (int)backup_count_;
+*size = static_cast<int>(backup_count_);
 backup_count_ = 0;
 return true;
 }
@@ -68,7 +70,7 @@ class GrpcBufferReader final
 *data = GRPC_SLICE_START_PTR(slice_);
 // On win x64, int is only 32bit
 GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-byte_count_ += *size = (int)GRPC_SLICE_LENGTH(slice_);
+byte_count_ += *size = static_cast<int>(GRPC_SLICE_LENGTH(slice_));
 return true;
 }

@@ -14,6 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/grpc_client.h"
+
+#include <sys/time.h>
 #include <limits>
 #include "paddle/fluid/framework/threadpool.h"
@@ -54,7 +56,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 auto call = s->stub_g_.PrepareUnaryCall(
 s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
 call->StartCall();
-call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
+call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 });
 req_count_++;
@@ -66,7 +68,7 @@ void ProcGetResponse(const VarHandle& var_h,
 // const sendrecv::VariableMessage& ret_msg) {
 const ::grpc::ByteBuffer& ret_msg) {
 framework::Variable* outvar = NULL;
-DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar);
+DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
 }
 template <typename T>
@@ -110,7 +112,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
 auto call = s->stub_g_.PrepareUnaryCall(
 s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
 call->StartCall();
-call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
+call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 });
 req_count_++;
@@ -170,7 +172,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
 sendrecv::VariableMessage req;
 req.set_varname(BATCH_BARRIER_MESSAGE);
 auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
+rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 req_count_++;
 }
@@ -182,7 +184,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
 sendrecv::VariableMessage req;
 req.set_varname(FETCH_BARRIER_MESSAGE);
 auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
+rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 req_count_++;
 }

@@ -14,10 +14,9 @@ limitations under the License. */
 #pragma once
-#include <grpc++/grpc++.h>
-#include <grpc/support/log.h>
 #include <time.h>
-#include <chrono>
+#include <chrono> // NOLINT
 #include <ctime>
 #include <functional>
 #include <iostream>
@@ -25,11 +24,11 @@
 #include <string>
 #include <vector>
-#include <grpc++/generic/generic_stub.h>
-#include <grpc++/grpc++.h>
-#include <grpc++/support/byte_buffer.h>
-#include <grpc++/support/slice.h>
+#include "grpc++/generic/generic_stub.h"
+#include "grpc++/grpc++.h"
+#include "grpc++/support/byte_buffer.h"
+#include "grpc++/support/slice.h"
+#include "grpc/support/log.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"

@@ -273,7 +273,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
 // FIXME(typhoonzero): change cq_name to enum.
 void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
-std::string cq_name,
+const std::string& cq_name,
 std::function<void()> TryToRegisterNewOne) {
 TryToRegisterNewOne();

@@ -14,10 +14,11 @@ limitations under the License. */
 #pragma once
-#include <grpc++/grpc++.h>
 #include <string>
+#include <thread> // NOLINT
 #include <utility>
+#include "grpc++/grpc++.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -71,7 +72,8 @@ class AsyncGRPCServer final {
 void ShutDown();
 protected:
-void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name,
+void HandleRequest(::grpc::ServerCompletionQueue *cq,
+const std::string &cq_name,
 std::function<void()> TryToRegisterNewOne);
 void TryToRegisterNewSendOne();
 void TryToRegisterNewGetOne();

@@ -14,7 +14,7 @@ limitations under the License. */
 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread> // NOLINT
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/detail/grpc_client.h"

@@ -19,7 +19,9 @@ limitations under the License. */
 #pragma once
-#include <grpc++/grpc++.h>
+#include <string>
+
+#include "grpc++/grpc++.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -142,6 +144,6 @@ class ProtoEncodeHelper {
 char* limit_; // Just for CHECKs
 };
-} // detail
-} // operators
-} // paddle
+} // namespace detail
+} // namespace operators
+} // namespace paddle

@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+
+#include <sys/time.h>
-#include <thread>
+#include <thread> // NOLINT
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
@@ -42,7 +44,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 void* buf = malloc(1024);
 void* payload = nullptr;
 size_t payload_size;
-ProtoEncodeHelper e((char*)buf, 1024);
+ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
 e.WriteString(VarMsg::kVarnameFieldNumber, name);
 if (var->IsType<framework::LoDTensor>()) {
 e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
@@ -152,7 +154,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 framework::proto::VarType_Type_SELECTED_ROWS) {
 auto* slr = var->GetMutable<framework::SelectedRows>();
-ProtoEncodeHelper e2((char*)buf, 128);
+ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
 // NOTE: rows is of type int64_t
 size_t rows_memory_size =
 slr->rows().size() * framework::SizeOfType(typeid(int64_t));
@@ -181,10 +183,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
 const platform::DeviceContext& ctx,
 const framework::Scope* scope,
-framework::Variable*& var) {
+framework::Variable** var) {
 operators::detail::VariableResponse resp(scope, &ctx);
 PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
-var = resp.GetVar();
+*var = resp.GetVar();
 }
 } // namespace detail

Some files were not shown because too many files have changed in this diff.
