From 028f3dc4e5fcb558041ff168e233a89b41aeaed9 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 13:13:19 +0800
Subject: [PATCH 1/7] Add memcpy

---
 paddle/memory/memory.cc | 14 ++++++++++++++
 paddle/memory/memory.h  |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 430ce98bfc..5be9bef3ac 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -47,6 +47,20 @@ size_t Used(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace, void* src,
+                                                  size_t size) {
+  memcpy(dst, src, size);
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace, void* src,
+                                                  size_t size) {
+  memcpy(dst, src, size);
+}
+
 #ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 2d6f4fd2a0..96c00cb106 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -28,5 +28,8 @@ void Free(Place, void*);
 template <class Place>
 size_t Used(Place);
 
+template <class Place1, class Place2>
+void Copy(Place1, void* dst, Place2, void* src, size_t size);
+
 }  // namespace memory
 }  // namespace paddle

From e53a48b46a143217a39b5f1c9125c4a7d507d2b5 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:27:41 +0800
Subject: [PATCH 2/7] Add memcpy

---
 paddle/memory/memory.cc | 46 +++++++++++++++++++++++++++++------------
 paddle/memory/memory.h  | 17 ++++++++++-----
 2 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5be9bef3ac..5c7b3bb15e 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -15,9 +15,6 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
-
-#include
 
 namespace paddle {
 namespace memory {
@@ -49,16 +46,9 @@ size_t Used(platform::CPUPlace place) {
 
 template <>
 void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::CPUPlace, void* src,
-                                                  size_t size) {
-  memcpy(dst, src, size);
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::CPUPlace, void* src,
-                                                  size_t size) {
-  memcpy(dst, src, size);
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  memcpy(dst, src, num);
 }
 
 #ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
@@ -93,6 +83,36 @@ size_t Used(platform::GPUPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::GPUPlace,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 96c00cb106..3ac359e174 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,22 +14,29 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-template <class Place>
+template <typename Place>
 void* Alloc(Place, size_t);
 
-template <class Place>
+template <typename Place>
 void Free(Place, void*);
 
-template <class Place>
+template <typename Place>
 size_t Used(Place);
 
-template <class Place1, class Place2>
-void Copy(Place1, void* dst, Place2, void* src, size_t size);
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
 }  // namespace paddle
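
(For context, not part of the patch series: a minimal usage sketch of the templated Copy API introduced above. Names are illustrative, and it assumes platform::GPUPlace can be constructed from a device id; the DstPlace/SrcPlace template arguments are deduced from the place parameters, so the call resolves to the <GPUPlace, CPUPlace> specialization defined in memory.cc.)

    #include <cuda_runtime.h>

    #include "paddle/memory/memory.h"
    #include "paddle/platform/place.h"

    void CopyHostBufferToGpu() {
      paddle::platform::CPUPlace cpu;
      paddle::platform::GPUPlace gpu(0);  // assumed: GPUPlace takes a device id

      const size_t num = 1024;
      void* src = paddle::memory::Alloc(cpu, num);
      void* dst = paddle::memory::Alloc(gpu, num);

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // DstPlace/SrcPlace are deduced from the place arguments, so this picks
      // the <GPUPlace, CPUPlace> specialization (CPU source -> GPU destination).
      paddle::memory::Copy(gpu, dst, cpu, src, num, stream);

      // The GPU path only enqueues the copy; wait before reusing the buffers.
      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);

      paddle::memory::Free(gpu, dst);
      paddle::memory::Free(cpu, src);
    }

Because the GPU specializations merely enqueue work on the given stream, callers remain responsible for synchronizing before the destination buffer is read.
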
From b05886414b4f7bdbc9a6228e6cb681c7f3ccb50c Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:28:06 +0800
Subject: [PATCH 3/7] Add cuda memcpy in gpu_info

---
 paddle/platform/gpu_info.cc | 20 +++++++++++++++++++-
 paddle/platform/gpu_info.h  | 15 ++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index a1383d3524..12dc01d1a1 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
 
-void GpuMemoryUsage(size_t& available, size_t& total) {
+void GpuMemoryUsage(size_t &available, size_t &total) {
   throw_on_error(cudaMemGetInfo(&available, &total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -82,5 +82,23 @@ size_t GpuMaxChunkSize() {
   return usable;
 }
 
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream));
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind));
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  //       block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+}
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 79e71956bd..d3a5f5f13f 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
+#include <cuda_runtime.h>
 #include <stddef.h>
 
 namespace paddle {
@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 
 //!Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t& available, size_t& total);
+void GpuMemoryUsage(size_t &available, size_t &total);
 
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
 
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
 }  // namespace platform
 }  // namespace paddle
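
(Background sketch, not part of the patches: the plain CUDA pattern that GpuMemcpyAsync and GpuMemcpySync wrap. An asynchronous copy is only safe to consume after its stream has been synchronized, which is why GpuMemcpySync follows cudaMemcpy with cudaStreamSynchronize(0). Sizes and names below are invented.)

    #include <cuda_runtime.h>

    #include <cstdio>
    #include <vector>

    int main() {
      const size_t count = 1 << 20;
      const size_t bytes = count * sizeof(float);
      std::vector<float> host(count, 1.0f);

      void* device = nullptr;
      cudaMalloc(&device, bytes);

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Enqueue the copy; the call returns before the transfer has finished.
      cudaMemcpyAsync(device, host.data(), bytes, cudaMemcpyHostToDevice, stream);

      // Block the host until everything queued on `stream` has completed.
      cudaStreamSynchronize(stream);

      cudaStreamDestroy(stream);
      cudaFree(device);
      std::printf("copy finished\n");
      return 0;
    }
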
From 527c85970bcc16fae2598f984baa6d648fde4052 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:51:27 +0800
Subject: [PATCH 4/7] Fix H2D and D2H order

---
 paddle/memory/memory.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5c7b3bb15e..67d2ae1bbd 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -88,7 +88,7 @@ void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::GPUPlace,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
-  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 
 template <>
@@ -96,7 +96,7 @@ void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
-  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }

From 00500eeb7dcf388261d3145e0ac521d0b1e10dc2 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 20 Jul 2017 09:40:27 +0800
Subject: [PATCH 5/7] Add stdlib.h for memcpy

---
 paddle/memory/memory.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 67d2ae1bbd..a7d7fa0bfe 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -16,6 +16,8 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
 
+#include <stdlib.h>  // for memcpy
+
 namespace paddle {
 namespace memory {

From 0897d18a0a66b7942d8cecc7c8652192f47df66c Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 20 Jul 2017 10:05:54 +0800
Subject: [PATCH 6/7] Fix string.h for memcpy

---
 paddle/memory/memory.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index a7d7fa0bfe..4056a54b4a 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
 
-#include <stdlib.h>  // for memcpy
+#include <cstring>  // for memcpy
 
 namespace paddle {
 namespace memory {
@@ -50,7 +50,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num) {
-  memcpy(dst, src, num);
+  std::memcpy(dst, src, num);
 }
 
 #ifndef PADDLE_ONLY_CPU
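
(The direction swap fixed in PATCH 4 is easy to misread: Copy() lists the destination place first, while cudaMemcpyKind names the source first. A hypothetical helper, not part of the patches, spelling out the corrected mapping:)

    #include <cuda_runtime.h>

    // dst_is_gpu / src_is_gpu describe where the destination and source buffers
    // live; the returned kind matches the specializations after the fix:
    //   Copy<CPUPlace, GPUPlace> -> cudaMemcpyDeviceToHost
    //   Copy<GPUPlace, CPUPlace> -> cudaMemcpyHostToDevice
    //   Copy<GPUPlace, GPUPlace> -> cudaMemcpyDeviceToDevice (same device)
    inline cudaMemcpyKind KindFor(bool dst_is_gpu, bool src_is_gpu) {
      if (dst_is_gpu && src_is_gpu) return cudaMemcpyDeviceToDevice;
      if (dst_is_gpu) return cudaMemcpyHostToDevice;  // CPU src -> GPU dst
      if (src_is_gpu) return cudaMemcpyDeviceToHost;  // GPU src -> CPU dst
      return cudaMemcpyHostToHost;
    }
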
From b3115fb01c007abea7e7ea7bf41363c5669e844a Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 20 Jul 2017 11:21:37 +0800
Subject: [PATCH 7/7] Add SetDeviceId in memcpy

---
 paddle/memory/memory.cc | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 4056a54b4a..78443cc35a 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -86,18 +86,22 @@ size_t Used(platform::GPUPlace place) {
 }
 
 template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::GPUPlace,
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 
 template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace, void* dst,
-                                                  platform::CPUPlace,
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
@@ -108,6 +112,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
   if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
   } else {
     platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
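
(Closing sketch, illustrative and not from the patches: what the GPU-to-GPU branch reduces to when the two places name different devices — cudaSetDevice scopes the allocations, and cudaMemcpyPeerAsync moves the data, staging through host memory if peer access is not enabled. Device ids and sizes are invented, and error checking is omitted.)

    #include <cuda_runtime.h>

    void PeerCopySketch(size_t bytes) {
      void* src = nullptr;
      void* dst = nullptr;

      cudaSetDevice(0);  // subsequent runtime calls target device 0
      cudaMalloc(&src, bytes);

      cudaSetDevice(1);  // switch the calling thread to device 1
      cudaMalloc(&dst, bytes);

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Copy across devices; dstDevice/srcDevice say where each pointer lives.
      cudaMemcpyPeerAsync(dst, /*dstDevice=*/1, src, /*srcDevice=*/0, bytes,
                          stream);
      cudaStreamSynchronize(stream);

      cudaStreamDestroy(stream);
      cudaFree(dst);
      cudaSetDevice(0);
      cudaFree(src);
    }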