commit c110f56574

@@ -0,0 +1,72 @@
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

IF(NOT ${WITH_MKLDNN})
    return()
ENDIF(NOT ${WITH_MKLDNN})

INCLUDE(ExternalProject)

SET(MKLDNN_PROJECT      "extern_mkldnn")
SET(MKLDNN_SOURCES_DIR  ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_INSTALL_ROOT ${CMAKE_INSTALL_PREFIX})
# Install under the user's home directory unless building as root.
IF(NOT "$ENV{HOME}" STREQUAL "/root")
    SET(MKLDNN_INSTALL_ROOT "$ENV{HOME}")
ENDIF()

SET(MKLDNN_INSTALL_DIR "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)

IF(WIN32)
    MESSAGE(WARNING "Compiling Paddle with MKL-DNN is not supported on Windows yet. "
        "Forcing WITH_MKLDNN=OFF.")
    SET(WITH_MKLDNN OFF)
    return()
ELSE(WIN32)
    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
    MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
ENDIF(WIN32)

INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})

IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
    SET(MKLDNN_DEPENDS  ${MKLML_PROJECT})
    SET(MKLDNN_MKLROOT  ${MKLML_ROOT})
    SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
    SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
ENDIF()

ExternalProject_Add(
    ${MKLDNN_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
    GIT_TAG             "v0.9"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
    BUILD_COMMAND       cd <SOURCE_DIR>/build
                        && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
                        && $(MAKE)
    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
    UPDATE_COMMAND      ""
)

ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
LIST(APPEND external_project_dependencies mkldnn)
@@ -0,0 +1,64 @@

# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

IF(NOT ${WITH_MKLML})
    return()
ENDIF(NOT ${WITH_MKLML})

INCLUDE(ExternalProject)

SET(MKLML_PROJECT      "extern_mklml")
SET(MKLML_VER          "mklml_lnx_2018.0.20170425")
SET(MKLML_URL          "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR   "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR      "opt/paddle/third_party/mklml")
SET(MKLML_INSTALL_ROOT "${CMAKE_INSTALL_PREFIX}")
# Install under the user's home directory unless building as root.
IF(NOT "$ENV{HOME}" STREQUAL "/root")
    SET(MKLML_INSTALL_ROOT "$ENV{HOME}")
ENDIF()

SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT        ${MKLML_INSTALL_DIR}/${MKLML_VER})
SET(MKLML_INC_DIR     ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR     ${MKLML_ROOT}/lib)
SET(MKLML_LIB         ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB    ${MKLML_LIB_DIR}/libiomp5.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")

INCLUDE_DIRECTORIES(${MKLML_INC_DIR})

# Write a small CMakeLists.txt into the download directory so the extracted
# package can be installed through a plain install(DIRECTORY ...) rule.
SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
                              "cmake_minimum_required(VERSION 3.0)\n"
                              "install(DIRECTORY ${MKLML_VER}\n"
                              "        DESTINATION ${MKLML_DST_DIR})\n")

ExternalProject_Add(
    ${MKLML_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX               ${MKLML_SOURCE_DIR}
    DOWNLOAD_DIR         ${MKLML_DOWNLOAD_DIR}
    DOWNLOAD_COMMAND     wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
                         && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
    DOWNLOAD_NO_PROGRESS 1
    UPDATE_COMMAND       ""
    CMAKE_ARGS           -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
    CMAKE_CACHE_ARGS     -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
)

ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
[File diff suppressed because it is too large]
@@ -0,0 +1,159 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "TensorType.h"

namespace paddle {

/**
 * \brief Depthwise convolution forward. The outputData of depthwise
 *        convolution is the same as ExpandConvLayer's output when groups
 *        equals inputChannels in ExpandConvLayer.
 *
 * \param[in] inputData        input data.
 * \param[in] filterData       the parameters of the depthwise conv layer.
 * \param[in] batchSize        batch size of input data.
 * \param[in] outputChannels   channels of outputData.
 * \param[in] outputHeight     height of outputData.
 * \param[in] outputWidth      width of outputData.
 * \param[in] inputChannels    channels of inputData.
 * \param[in] inputHeight      height of inputData.
 * \param[in] inputWidth       width of inputData.
 * \param[in] filterMultiplier equals outputChannels / groups_.
 * \param[in] filterHeight     height of filter.
 * \param[in] filterWidth      width of filter.
 * \param[in] strideH          stride size in height direction.
 * \param[in] strideW          stride size in width direction.
 * \param[in] paddingH         padding size in height direction.
 * \param[in] paddingW         padding size in width direction.
 * \param[out] outputData      outputData.
 */
template <DeviceType Device, class T>
class DepthwiseConvFunctor {
public:
  void operator()(const T* inputData,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* outputData);
};
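
// Usage illustration (a hedged sketch, not part of this header): assuming the
// CPU specialization is instantiated elsewhere, the output sizes follow the
// usual convolution arithmetic,
//   outputHeight = (inputHeight + 2 * paddingH - filterHeight) / strideH + 1,
// so a 224x224 input with a 3x3 filter, stride 2, and padding 1 yields a
// 112x112 output, and outputChannels = inputChannels * filterMultiplier:
//
//   DepthwiseConvFunctor<DEVICE_TYPE_CPU, float> conv;
//   conv(inputData, filterData,
//        /*batchSize=*/1, /*outputChannels=*/64,
//        /*outputHeight=*/112, /*outputWidth=*/112,
//        /*inputChannels=*/32, /*inputHeight=*/224, /*inputWidth=*/224,
//        /*filterMultiplier=*/2, /*filterHeight=*/3, /*filterWidth=*/3,
//        /*strideH=*/2, /*strideW=*/2, /*paddingH=*/1, /*paddingW=*/1,
//        outputData);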

/**
 * \brief Functor to compute the depthwise convolution backprop w.r.t. input.
 *
 * \param[in] outputGrad       the grad data of output.
 * \param[in] filterData       the parameters of the depthwise conv layer.
 * \param[in] batchSize        batch size of input data.
 * \param[in] outputChannels   channels of outputData.
 * \param[in] outputHeight     height of outputData.
 * \param[in] outputWidth      width of outputData.
 * \param[in] inputChannels    channels of input data.
 * \param[in] inputHeight      height of inputData.
 * \param[in] inputWidth       width of inputData.
 * \param[in] filterMultiplier equals outputChannels / groups_.
 * \param[in] filterHeight     height of filter.
 * \param[in] filterWidth      width of filter.
 * \param[in] strideH          stride size in height direction.
 * \param[in] strideW          stride size in width direction.
 * \param[in] paddingH         padding size in height direction.
 * \param[in] paddingW         padding size in width direction.
 * \param[out] inputGrad       the grad data of input.
 */
template <DeviceType Device, class T>
class DepthwiseConvGradInputFunctor {
public:
  void operator()(const T* outputGrad,
                  const T* filterData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* inputGrad);
};

/**
 * \brief Functor to compute the depthwise convolution backprop w.r.t. filter.
 *
 * \param[in] outputGrad       the grad data of output.
 * \param[in] inputData        inputData.
 * \param[in] batchSize        batch size of input data.
 * \param[in] outputChannels   channels of outputData.
 * \param[in] outputHeight     height of outputData.
 * \param[in] outputWidth      width of outputData.
 * \param[in] inputChannels    channels of input data.
 * \param[in] inputHeight      height of inputData.
 * \param[in] inputWidth       width of inputData.
 * \param[in] filterMultiplier equals outputChannels / groups_.
 * \param[in] filterHeight     height of filter.
 * \param[in] filterWidth      width of filter.
 * \param[in] strideH          stride size in height direction.
 * \param[in] strideW          stride size in width direction.
 * \param[in] paddingH         padding size in height direction.
 * \param[in] paddingW         padding size in width direction.
 * \param[in] colData          auxiliary data used when computing filterGrad.
 * \param[out] filterGrad      the grad data of filter.
 */
template <DeviceType Device, class T>
class DepthwiseConvGradFilterFunctor {
public:
  void operator()(const T* outputGrad,
                  const T* inputData,
                  int batchSize,
                  int outputChannels,
                  int outputHeight,
                  int outputWidth,
                  int inputChannels,
                  int inputHeight,
                  int inputWidth,
                  int filterMultiplier,
                  int filterHeight,
                  int filterWidth,
                  int strideH,
                  int strideW,
                  int paddingH,
                  int paddingW,
                  T* colData,
                  T* filterGrad);
};

}  // namespace paddle
[File diff suppressed because it is too large]
@@ -1,11 +1,16 @@

add_subdirectory(detail)

cc_library(memory SRCS memory.cc)
cc_library(memcpy SRCS memcpy.cc DEPS device_context)

cc_library(paddle_memory
    DEPS
    memory
    memcpy
    meta_data
    meta_cache
    memory_block
    buddy_allocator
    system_allocator)

cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
@@ -1,140 +1,4 @@

# Region-based Heterogeneous Memory Management

## Design

### Usage

To allocate 4KB CPU memory:

```cpp
p = memory::Alloc(platform::CPUPlace(), 4*1024);
```

To allocate 4KB memory on the 3rd GPU:

```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
```

To free memory and check the so-far used amount of memory on a place:

```cpp
auto pl = platform::GPUPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
```

### API

In `paddle/memory/memory.h` we have:

```cpp
namespace memory {
template <typename Place> void* Alloc(Place, size_t);
template <typename Place> void Free(Place, void*);
template <typename Place> size_t Used(Place);
}  // namespace memory
```

These function templates have specializations for either `platform::CPUPlace` or `platform::GPUPlace`:

```cpp
template<>
void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
  return GetCPUBuddyAllocator()->Alloc(size);
}
```

and

```cpp
template<>
void* Alloc<GPUPlace>(GPUPlace p, size_t size) {
  return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```

Similar specializations exist for `Free` and `Used`.
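
For illustration, a minimal sketch of the `Used` specialization, assuming `BuddyAllocator` exposes a `Used()` accessor (an assumption of this sketch, not a committed interface):

```cpp
template<>
size_t Used<CPUPlace>(CPUPlace p) {
  // Delegate to the singleton allocator, which tracks every byte it
  // has handed out (see the Implementation section below).
  return GetCPUBuddyAllocator()->Used();
}
```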

### Implementation

`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.

```cpp
BuddyAllocator* GetCPUBuddyAllocator() {
  static BuddyAllocator* a = NULL;
  if (a == NULL) {
    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
  }
  return a;
}

BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  static BuddyAllocator** as = NULL;
  if (as == NULL) {
    as = new BuddyAllocator*[platform::NumGPUs()];
    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /*backup allocator*/, ...);
    }
  }
  return as[gpu_id];
}
```

#### `BuddyAllocator`

`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes only parameters related to the algorithm:

```cpp
BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
  ...
}
```

Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned to 32 bytes, which is enough to hold a `BuddyAllocator::Block` object:

```cpp
class BuddyAllocator {
 private:
  struct Block {
    size_t size;
    Block *left, *right;
    size_t index;  // allocator id
  };
  ...
};
```

Because `BuddyAllocator` keeps the metadata of each block, it can track used memory -- it records the amount returned by `Alloc` and freed by `Free`. In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do this tracking.
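
As a hedged sketch of that bookkeeping (the `total_used_` member and the `FindOrSplitFreeBlock` helper below are assumptions for illustration, not the actual implementation):

```cpp
void* BuddyAllocator::Alloc(size_t size) {
  // Reserve room for the Block header and round up to a 32-byte boundary.
  size_t aligned = (size + sizeof(Block) + 31) & ~size_t(31);
  Block* block = FindOrSplitFreeBlock(aligned);  // hypothetical helper
  block->size = aligned;
  total_used_ += aligned;  // the amount later reported by Used()
  return reinterpret_cast<char*>(block) + sizeof(Block);
}
```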

#### System Allocators

`GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
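
A sketch of that fallback path could look as follows (`RefillPool`, `min_pool_growth_`, `system_allocator_`, and `InsertFreeBlock` are names assumed for illustration):

```cpp
// When the buddy pool has no free block large enough, grow the pool
// through the system allocator, then retry the allocation.
Block* BuddyAllocator::RefillPool(size_t size) {
  size_t chunk = std::max(size, min_pool_growth_);
  void* p = system_allocator_->New(chunk);  // CPUAllocator or GPUAllocator
  return InsertFreeBlock(p, chunk);
}
```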

## Justification

I got inspiration from Majel and Caffe2, though the design above looks different from both.

### Caffe2

In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory. In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).

There are two implementations of `Context`:

1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.

1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.

### Majel

In Majel, there are basically two allocator types:

1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.

However, memory allocation does not go through these two allocators directly; they are defined in hidden namespaces.

In Majel there are hidden global variables like:

1. `cpu::SystemAllocator g_cpu_allocator`, and
1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.

Programs allocate memory via a `BuddyAllocator`, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*: if the `BuddyAllocator` cannot find a block in its memory pool, it extends the pool by calling the fallback allocator's `New(size_t)`.

Please check out the [design documentation](http://gangliao.me) to find out more details about the buddy memory allocator for both CPU and GPU.
@@ -0,0 +1,70 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/memory/memcpy.h"

#include <cstring>  // for memcpy

#include "paddle/platform/device_context.h"

namespace paddle {
namespace memory {

// CPU-to-CPU copy: a plain synchronous memcpy.
template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                  platform::CPUPlace,
                                                  const void* src, size_t num) {
  std::memcpy(dst, src, num);
}

#ifndef PADDLE_ONLY_CPU
// GPU-to-CPU copy, asynchronous on the given stream.
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                  void* dst,
                                                  platform::GPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  platform::GPUPlaceGuard g(src_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}

// CPU-to-GPU copy, asynchronous on the given stream.
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
                                                  void* dst,
                                                  platform::CPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  platform::GPUPlaceGuard g(dst_place.device);
  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}

// GPU-to-GPU copy, either within one device or across devices.
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
                                                  void* dst,
                                                  platform::GPUPlace src_place,
                                                  const void* src, size_t num,
                                                  cudaStream_t stream) {
  if (dst_place == src_place) {
    // Same device: an ordinary device-to-device async copy suffices.
    platform::GPUPlaceGuard g(src_place.device);
    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
  } else {
    // Different devices: use a peer-to-peer copy.
    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
                            stream);
  }
}

#endif  // PADDLE_ONLY_CPU

}  // namespace memory
}  // namespace paddle
@@ -0,0 +1,33 @@

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"

namespace paddle {
namespace memory {

// Synchronous copy between two places (CPU/CPU in the CPU-only build).
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
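
// Example usage (a hedged sketch; Alloc/Free are assumed to be the templates
// declared in paddle/memory/memory.h):
//
//   paddle::platform::CPUPlace cpu;
//   void* src = paddle::memory::Alloc(cpu, 1024);
//   void* dst = paddle::memory::Alloc(cpu, 1024);
//   paddle::memory::Copy(cpu, dst, cpu, src, 1024);  // plain memcpy underneath
//   paddle::memory::Free(cpu, src);
//   paddle::memory::Free(cpu, dst);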

#ifndef PADDLE_ONLY_CPU
// Asynchronous copy involving a GPU device, issued on the given CUDA stream.
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
          cudaStream_t stream);
#endif  // PADDLE_ONLY_CPU

}  // namespace memory
}  // namespace paddle