You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
247 lines
8.5 KiB
247 lines
8.5 KiB
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License. */
|
|
|
|
#pragma once
|
|
#include <Python.h>
|
|
#include <string>
|
|
#include <tuple>
|
|
#include <vector>
|
|
#include "paddle/fluid/framework/lod_tensor.h"
|
|
#include "paddle/fluid/memory/memcpy.h"
|
|
#include "paddle/fluid/platform/device_context.h"
|
|
#include "paddle/fluid/platform/float16.h"
|
|
#include "pybind11/numpy.h"
|
|
#include "pybind11/pybind11.h"
|
|
|
|
namespace paddle {
|
|
namespace pybind {
|
|
namespace details {
|
|
|
|
template <bool less, size_t I, typename... ARGS>
|
|
struct CastToPyBufferImpl;
|
|
|
|
template <size_t I, typename... ARGS>
|
|
struct CastToPyBufferImpl<false, I, ARGS...> {
|
|
pybind11::buffer_info operator()(const framework::Tensor &tensor) {
|
|
PADDLE_THROW("This type of tensor cannot be expose to Python");
|
|
return pybind11::buffer_info();
|
|
}
|
|
};
|
|
|
|
template <size_t I, typename... ARGS>
|
|
struct CastToPyBufferImpl<true, I, ARGS...> {
|
|
using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
|
|
pybind11::buffer_info operator()(const framework::Tensor &tensor) {
|
|
if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
|
|
auto dim_vec = framework::vectorize(tensor.dims());
|
|
std::vector<size_t> dims_outside;
|
|
std::vector<size_t> strides;
|
|
dims_outside.resize(dim_vec.size());
|
|
strides.resize(dim_vec.size());
|
|
|
|
size_t prod = 1;
|
|
for (size_t i = dim_vec.size(); i != 0; --i) {
|
|
dims_outside[i - 1] = (size_t)dim_vec[i - 1];
|
|
strides[i - 1] = sizeof(CUR_TYPE) * prod;
|
|
prod *= dims_outside[i - 1];
|
|
}
|
|
framework::Tensor dst_tensor;
|
|
if (paddle::platform::is_gpu_place(tensor.place())) {
|
|
#ifdef PADDLE_WITH_CUDA
|
|
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
|
|
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
|
|
tensor.dims(), platform::CPUPlace()));
|
|
|
|
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
|
|
sizeof(CUR_TYPE) * tensor.numel(),
|
|
cudaMemcpyDeviceToHost);
|
|
#else
|
|
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
|
|
#endif
|
|
} else if (paddle::platform::is_cpu_place(tensor.place())) {
|
|
dst_tensor = tensor;
|
|
}
|
|
|
|
if (std::type_index(typeid(CUR_TYPE)) ==
|
|
std::type_index(typeid(platform::float16))) {
|
|
return pybind11::buffer_info(
|
|
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
|
|
"e", /* np.dtype('e') == np.float16 */
|
|
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
|
|
} else {
|
|
return pybind11::buffer_info(
|
|
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
|
|
pybind11::format_descriptor<CUR_TYPE>::format(),
|
|
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
|
|
}
|
|
} else {
|
|
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
|
|
return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace details
|
|
|
|
// Builds a pybind11::buffer_info describing `tensor`.
// The tensor's element type is matched, in order, against float, int, double,
// int64_t, bool, uint8_t, int8_t and platform::float16; a tensor of any other
// type ends in the throwing terminal case of details::CastToPyBufferImpl.
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
  using Caster =
      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
                                  uint8_t, int8_t, platform::float16>;
  return Caster()(tensor);
}
|
|
|
|
template <typename T>
|
|
T TensorGetElement(const framework::Tensor &self, size_t offset) {
|
|
if (platform::is_cpu_place(self.place())) {
|
|
return self.data<T>()[offset];
|
|
} else {
|
|
std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
|
|
framework::TensorCopySync(self, platform::CPUPlace(), dst.get());
|
|
return dst->data<T>()[offset];
|
|
}
|
|
}
|
|
|
|
// TODO(dzhwinter) : fix the redundant Tensor allocation and free
|
|
template <typename T>
|
|
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
|
|
if (platform::is_gpu_place(self->place())) {
|
|
std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
|
|
framework::TensorCopySync(*self, platform::CPUPlace(), dst.get());
|
|
dst->data<T>()[offset] = elem;
|
|
framework::TensorCopySync(*dst.get(), self->place(), self);
|
|
|
|
} else if (platform::is_cpu_place(self->place())) {
|
|
self->data<T>()[offset] = elem;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void PyCPUTensorSetFromArray(
|
|
framework::Tensor *self,
|
|
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
|
|
array,
|
|
paddle::platform::CPUPlace place) {
|
|
std::vector<int64_t> dims;
|
|
dims.reserve(array.ndim());
|
|
for (size_t i = 0; i < array.ndim(); ++i) {
|
|
dims.push_back(static_cast<int>(array.shape()[i]));
|
|
}
|
|
|
|
self->Resize(framework::make_ddim(dims));
|
|
auto *dst = self->mutable_data<T>(place);
|
|
std::memcpy(dst, array.data(), sizeof(T) * array.size());
|
|
}
|
|
|
|
template <>
|
|
// This following specialization maps uint16_t in the parameter type to
|
|
// platform::float16.
|
|
inline void PyCPUTensorSetFromArray(
|
|
framework::Tensor *self,
|
|
pybind11::array_t<uint16_t,
|
|
pybind11::array::c_style | pybind11::array::forcecast>
|
|
array,
|
|
paddle::platform::CPUPlace place) {
|
|
std::vector<int64_t> dims;
|
|
dims.reserve(array.ndim());
|
|
for (size_t i = 0; i < array.ndim(); ++i) {
|
|
dims.push_back(static_cast<int>(array.shape()[i]));
|
|
}
|
|
|
|
self->Resize(framework::make_ddim(dims));
|
|
auto *dst = self->mutable_data<platform::float16>(place);
|
|
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
|
|
}
|
|
|
|
#ifdef PADDLE_WITH_CUDA
|
|
template <typename T>
|
|
void PyCUDATensorSetFromArray(
|
|
framework::Tensor *self,
|
|
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
|
|
array,
|
|
paddle::platform::CUDAPlace place) {
|
|
std::vector<int64_t> dims;
|
|
dims.reserve(array.ndim());
|
|
for (size_t i = 0; i < array.ndim(); ++i) {
|
|
dims.push_back(static_cast<int>(array.shape()[i]));
|
|
}
|
|
|
|
self->Resize(framework::make_ddim(dims));
|
|
auto *dst = self->mutable_data<T>(place);
|
|
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
|
|
cudaMemcpyHostToDevice);
|
|
}
|
|
|
|
template <>
|
|
// This following specialization maps uint16_t in the parameter type to
|
|
// platform::float16.
|
|
inline void PyCUDATensorSetFromArray(
|
|
framework::Tensor *self,
|
|
pybind11::array_t<uint16_t,
|
|
pybind11::array::c_style | pybind11::array::forcecast>
|
|
array,
|
|
paddle::platform::CUDAPlace place) {
|
|
std::vector<int64_t> dims;
|
|
dims.reserve(array.ndim());
|
|
for (size_t i = 0; i < array.ndim(); ++i) {
|
|
dims.push_back(static_cast<int>(array.shape()[i]));
|
|
}
|
|
|
|
self->Resize(framework::make_ddim(dims));
|
|
auto *dst = self->mutable_data<platform::float16>(place);
|
|
paddle::platform::GpuMemcpySync(dst, array.data(),
|
|
sizeof(uint16_t) * array.size(),
|
|
cudaMemcpyHostToDevice);
|
|
}
|
|
|
|
template <typename T>
|
|
void PyCUDAPinnedTensorSetFromArray(
|
|
framework::Tensor *self,
|
|
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
|
|
array,
|
|
const paddle::platform::CUDAPinnedPlace &place) {
|
|
std::vector<int64_t> dims;
|
|
dims.reserve(array.ndim());
|
|
for (size_t i = 0; i < array.ndim(); ++i) {
|
|
dims.push_back(static_cast<int>(array.shape()[i]));
|
|
}
|
|
|
|
self->Resize(framework::make_ddim(dims));
|
|
auto *dst = self->mutable_data<T>(place);
|
|
std::memcpy(dst, array.data(), sizeof(T) * array.size());
|
|
}
|
|
|
|
template <>
|
|
// This following specialization maps uint16_t in the parameter type to
|
|
// platform::float16.
|
|
inline void PyCUDAPinnedTensorSetFromArray(
|
|
framework::Tensor *self,
|
|
pybind11::array_t<uint16_t,
|
|
pybind11::array::c_style | pybind11::array::forcecast>
|
|
array,
|
|
const paddle::platform::CUDAPinnedPlace &place) {
|
|
std::vector<int64_t> dims;
|
|
dims.reserve(array.ndim());
|
|
for (size_t i = 0; i < array.ndim(); ++i) {
|
|
dims.push_back(static_cast<int>(array.shape()[i]));
|
|
}
|
|
|
|
self->Resize(framework::make_ddim(dims));
|
|
auto *dst = self->mutable_data<platform::float16>(place);
|
|
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
|
|
}
|
|
#endif
|
|
|
|
} // namespace pybind
|
|
} // namespace paddle
|