From 028f3dc4e5fcb558041ff168e233a89b41aeaed9 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 13:13:19 +0800
Subject: [PATCH 1/7] Add memcpy

---
 paddle/memory/memory.cc | 14 ++++++++++++++
 paddle/memory/memory.h  |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 430ce98bfc..5be9bef3ac 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -47,6 +47,20 @@ size_t Used(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace, void* src,
+                                                  size_t size) {
+  memcpy(dst, src, size);
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace, void* src,
+                                                  size_t size) {
+  memcpy(dst, src, size);
+}
+
 #ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 2d6f4fd2a0..96c00cb106 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -28,5 +28,8 @@ void Free(Place, void*);
 template <class Place>
 size_t Used(Place);
 
+template <class Place1, class Place2>
+void Copy(Place1, void* dst, Place2, void* src, size_t size);
+
 }  // namespace memory
 }  // namespace paddle

From e53a48b46a143217a39b5f1c9125c4a7d507d2b5 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:27:41 +0800
Subject: [PATCH 2/7] Add memcpy

---
 paddle/memory/memory.cc | 46 +++++++++++++++++++++++++++++------------
 paddle/memory/memory.h  | 17 ++++++++++-----
 2 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5be9bef3ac..5c7b3bb15e 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -15,9 +15,6 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
-
-#include
 
 namespace paddle {
 namespace memory {
@@ -49,16 +46,9 @@ size_t Used(platform::CPUPlace place) {
 
 template <>
 void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::CPUPlace, void* src,
-                                                  size_t size) {
-  memcpy(dst, src, size);
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::CPUPlace, void* src,
-                                                  size_t size) {
-  memcpy(dst, src, size);
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  memcpy(dst, src, num);
 }
 
 #ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
@@ -93,6 +83,36 @@ size_t Used(platform::GPUPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::GPUPlace,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 96c00cb106..3ac359e174 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,22 +14,29 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-template <class Place>
+template <typename Place>
 void* Alloc(Place, size_t);
 
-template <class Place>
+template <typename Place>
 void Free(Place, void*);
 
-template <class Place>
+template <typename Place>
 size_t Used(Place);
 
-template <class Place1, class Place2>
-void Copy(Place1, void* dst, Place2, void* src, size_t size);
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
 }  // namespace paddle
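
(For context, not part of the patch series: a minimal usage sketch of the templated Copy API introduced above. Names are illustrative, and it assumes platform::GPUPlace can be constructed from a device id; the DstPlace/SrcPlace template arguments are deduced from the place parameters, so the call resolves to the <GPUPlace, CPUPlace> specialization defined in memory.cc.)

    #include <cuda_runtime.h>

    #include "paddle/memory/memory.h"
    #include "paddle/platform/place.h"

    void CopyHostBufferToGpu() {
      paddle::platform::CPUPlace cpu;
      paddle::platform::GPUPlace gpu(0);  // assumed: GPUPlace takes a device id

      const size_t num = 1024;
      void* src = paddle::memory::Alloc(cpu, num);
      void* dst = paddle::memory::Alloc(gpu, num);

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // DstPlace/SrcPlace are deduced from the place arguments, so this picks
      // the <GPUPlace, CPUPlace> specialization (CPU source -> GPU destination).
      paddle::memory::Copy(gpu, dst, cpu, src, num, stream);

      // The GPU path only enqueues the copy; wait before reusing the buffers.
      cudaStreamSynchronize(stream);
      cudaStreamDestroy(stream);

      paddle::memory::Free(gpu, dst);
      paddle::memory::Free(cpu, src);
    }

Because the GPU specializations merely enqueue work on the given stream, callers remain responsible for synchronizing before the destination buffer is read.
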
From b05886414b4f7bdbc9a6228e6cb681c7f3ccb50c Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:28:06 +0800
Subject: [PATCH 3/7] Add cuda memcpy in gpu_info

---
 paddle/platform/gpu_info.cc | 20 +++++++++++++++++++-
 paddle/platform/gpu_info.h  | 15 ++++++++++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index a1383d3524..12dc01d1a1 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
 
-void GpuMemoryUsage(size_t& available, size_t& total) {
+void GpuMemoryUsage(size_t &available, size_t &total) {
   throw_on_error(cudaMemGetInfo(&available, &total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -82,5 +82,23 @@ size_t GpuMaxChunkSize() {
   return usable;
 }
 
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream));
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind));
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  //       block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+}
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 79e71956bd..d3a5f5f13f 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
+#include <cuda_runtime.h>
 #include <stddef.h>
 
 namespace paddle {
@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 
 //!Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t& available, size_t& total);
+void GpuMemoryUsage(size_t &available, size_t &total);
 
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
 
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
 }  // namespace platform
 }  // namespace paddle
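
(Background sketch, not part of the patches: the plain CUDA pattern that GpuMemcpyAsync and GpuMemcpySync wrap. An asynchronous copy is only safe to consume after its stream has been synchronized, which is why GpuMemcpySync follows cudaMemcpy with cudaStreamSynchronize(0). Sizes and names below are invented.)

    #include <cuda_runtime.h>

    #include <cstdio>
    #include <vector>

    int main() {
      const size_t count = 1 << 20;
      const size_t bytes = count * sizeof(float);
      std::vector<float> host(count, 1.0f);

      void* device = nullptr;
      cudaMalloc(&device, bytes);

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Enqueue the copy; the call returns before the transfer has finished.
      cudaMemcpyAsync(device, host.data(), bytes, cudaMemcpyHostToDevice, stream);

      // Block the host until everything queued on `stream` has completed.
      cudaStreamSynchronize(stream);

      cudaStreamDestroy(stream);
      cudaFree(device);
      std::printf("copy finished\n");
      return 0;
    }
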
From 527c85970bcc16fae2598f984baa6d648fde4052 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Wed, 19 Jul 2017 22:51:27 +0800
Subject: [PATCH 4/7] Fix H2D and D2H order

---
 paddle/memory/memory.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5c7b3bb15e..67d2ae1bbd 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -88,7 +88,7 @@ void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::GPUPlace,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
-  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 
 template <>
@@ -96,7 +96,7 @@ void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
-  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }

From 00500eeb7dcf388261d3145e0ac521d0b1e10dc2 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 20 Jul 2017 09:40:27 +0800
Subject: [PATCH 5/7] Add stdlib.h for memcpy

---
 paddle/memory/memory.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 67d2ae1bbd..a7d7fa0bfe 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -16,6 +16,8 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
 
+#include <stdlib.h>  // for memcpy
+
 namespace paddle {
 namespace memory {

From 0897d18a0a66b7942d8cecc7c8652192f47df66c Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 20 Jul 2017 10:05:54 +0800
Subject: [PATCH 6/7] Fix string.h for memcpy

---
 paddle/memory/memory.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index a7d7fa0bfe..4056a54b4a 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
 
-#include <stdlib.h>  // for memcpy
+#include <cstring>  // for memcpy
 
 namespace paddle {
 namespace memory {
@@ -50,7 +50,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num) {
-  memcpy(dst, src, num);
+  std::memcpy(dst, src, num);
 }
 
 #ifndef PADDLE_ONLY_CPU
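
(The direction swap fixed in PATCH 4 is easy to misread: Copy() lists the destination place first, while cudaMemcpyKind names the source first. A hypothetical helper, not part of the patches, spelling out the corrected mapping:)

    #include <cuda_runtime.h>

    // dst_is_gpu / src_is_gpu describe where the destination and source buffers
    // live; the returned kind matches the specializations after the fix:
    //   Copy<CPUPlace, GPUPlace> -> cudaMemcpyDeviceToHost
    //   Copy<GPUPlace, CPUPlace> -> cudaMemcpyHostToDevice
    //   Copy<GPUPlace, GPUPlace> -> cudaMemcpyDeviceToDevice (same device)
    inline cudaMemcpyKind KindFor(bool dst_is_gpu, bool src_is_gpu) {
      if (dst_is_gpu && src_is_gpu) return cudaMemcpyDeviceToDevice;
      if (dst_is_gpu) return cudaMemcpyHostToDevice;  // CPU src -> GPU dst
      if (src_is_gpu) return cudaMemcpyDeviceToHost;  // GPU src -> CPU dst
      return cudaMemcpyHostToHost;
    }
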
From b3115fb01c007abea7e7ea7bf41363c5669e844a Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 20 Jul 2017 11:21:37 +0800
Subject: [PATCH 7/7] Add SetDeviceId in memcpy

---
 paddle/memory/memory.cc | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 4056a54b4a..78443cc35a 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -86,18 +86,22 @@ size_t Used(platform::GPUPlace place) {
 }
 
 template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace, void* dst,
-                                                  platform::GPUPlace,
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
 }
 
 template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace, void* dst,
-                                                  platform::CPUPlace,
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
   platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
 }
@@ -108,6 +112,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
                                                   const void* src, size_t num,
                                                   cudaStream_t stream) {
   if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
     platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
   } else {
     platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
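
(Closing sketch, illustrative and not from the patches: what the GPU-to-GPU branch reduces to when the two places name different devices — cudaSetDevice scopes the allocations, and cudaMemcpyPeerAsync moves the data, staging through host memory if peer access is not enabled. Device ids and sizes are invented, and error checking is omitted.)

    #include <cuda_runtime.h>

    void PeerCopySketch(size_t bytes) {
      void* src = nullptr;
      void* dst = nullptr;

      cudaSetDevice(0);  // subsequent runtime calls target device 0
      cudaMalloc(&src, bytes);

      cudaSetDevice(1);  // switch the calling thread to device 1
      cudaMalloc(&dst, bytes);

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Copy across devices; dstDevice/srcDevice say where each pointer lives.
      cudaMemcpyPeerAsync(dst, /*dstDevice=*/1, src, /*srcDevice=*/0, bytes,
                          stream);
      cudaStreamSynchronize(stream);

      cudaStreamDestroy(stream);
      cudaFree(dst);
      cudaSetDevice(0);
      cudaFree(src);
    }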