diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 7a48390440..f49d1a47a3 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( boost::get(place), size, type)); - } else if (platform::is_gpu_place(place)) { + } else if (platform::is_gpu_place(place) || + platform::is_cuda_pinned_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + PADDLE_THROW( + "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_cuda_pinned_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } } #endif offset_ = 0; @@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { inline void* Tensor::mutable_data(platform::Place place) { PADDLE_ENFORCE(this->holder_ != nullptr, - "Cannot invoke mutable data if current hold nothing"); + "Cannot invoke mutable data if current hold nothing."); return mutable_data(place, holder_->type()); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b0a3f06a88..046721970a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -11,11 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include #include "paddle/fluid/pybind/protobuf.h" -#include // for call_once -#include #include "paddle/fluid/framework/backward.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" @@ -32,7 +37,6 @@ limitations under the License. */ #include "paddle/fluid/operators/cond_op.h" #include "paddle/fluid/operators/net_op.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/const_value.h" @@ -100,6 +104,14 @@ PYBIND11_PLUGIN(core) { [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) + .def("alloc_int", + [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("alloc_float", + [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) @@ -113,6 +125,12 @@ PYBIND11_PLUGIN(core) { .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) .def("set", PyCUDATensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) + .def("set", PyCUDAPinnedTensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("set_float_element", TensorSetElement) @@ -317,7 +335,17 @@ All parameter, weight, gradient are variables in Paddle. #else return new paddle::platform::CUDADeviceContext(place); #endif - }); + }) + .def_static("create", + [](paddle::platform::CUDAPinnedPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW( + "CUDAPinnedPlace is not supported in CPU device."); +#else + return new paddle::platform::CUDAPinnedDeviceContext(place); +#endif + });; // clang-format on #ifdef PADDLE_WITH_CUDA py::class_(m, "Communicator").def(py::init<>()); @@ -330,6 +358,10 @@ All parameter, weight, gradient are variables in Paddle. .def(py::init<>()) .def("__str__", string::to_string); + py::class_(m, "CUDAPinnedPlace") + .def(py::init<>()) + .def("__str__", string::to_string); + py::class_(m, "Place") .def(py::init<>()) .def("set_place", @@ -339,7 +371,11 @@ All parameter, weight, gradient are variables in Paddle. .def("set_place", [](platform::Place &self, const platform::CUDAPlace &gpu_place) { self = gpu_place; - }); + }) + .def("set_place", [](platform::Place &self, + const platform::CUDAPinnedPlace &cuda_pinned_place) { + self = cuda_pinned_place; + }); py::class_(m, "Operator") .def_static("create", @@ -363,6 +399,11 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](OperatorBase &self, const Scope &scope, const platform::CUDAPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::CUDAPinnedPlace &place) { + self.Run(scope, place); + }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 6f8c597f8e..868966433e 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" @@ -141,7 +143,7 @@ void PyCPUTensorSetFromArray( std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { - dims.push_back((int)array.shape()[i]); + dims.push_back(static_cast(array.shape()[i])); } self.Resize(framework::make_ddim(dims)); @@ -150,6 +152,8 @@ void PyCPUTensorSetFromArray( } template <> +// This following specialization maps uint16_t in the parameter type to +// platform::float16. void PyCPUTensorSetFromArray( framework::Tensor &self, py::array_t array, @@ -157,7 +161,7 @@ void PyCPUTensorSetFromArray( std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { - dims.push_back((int)array.shape()[i]); + dims.push_back(static_cast(array.shape()[i])); } self.Resize(framework::make_ddim(dims)); @@ -174,7 +178,7 @@ void PyCUDATensorSetFromArray( std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { - dims.push_back((int)array.shape()[i]); + dims.push_back(static_cast(array.shape()[i])); } self.Resize(framework::make_ddim(dims)); @@ -188,6 +192,8 @@ void PyCUDATensorSetFromArray( } template <> +// This following specialization maps uint16_t in the parameter type to +// platform::float16. void PyCUDATensorSetFromArray( framework::Tensor &self, py::array_t array, @@ -195,7 +201,7 @@ void PyCUDATensorSetFromArray( std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { - dims.push_back((int)array.shape()[i]); + dims.push_back(static_cast(array.shape()[i])); } self.Resize(framework::make_ddim(dims)); @@ -208,6 +214,40 @@ void PyCUDATensorSetFromArray( sizeof(uint16_t) * array.size(), cudaMemcpyHostToDevice, dev_ctx->stream()); } + +template +void PyCUDAPinnedTensorSetFromArray( + framework::Tensor &self, + py::array_t array, + const paddle::platform::CUDAPinnedPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back(static_cast(array.shape()[i])); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + std::memcpy(dst, array.data(), sizeof(T) * array.size()); +} + +template <> +// This following specialization maps uint16_t in the parameter type to +// platform::float16. +void PyCUDAPinnedTensorSetFromArray( + framework::Tensor &self, + py::array_t array, + const paddle::platform::CUDAPinnedPlace &place) { + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) { + dims.push_back(static_cast(array.shape()[i])); + } + + self.Resize(framework::make_ddim(dims)); + auto *dst = self.mutable_data(place); + std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size()); +} #endif } // namespace pybind diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5ea4d977f4..f01d638efd 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -31,7 +31,7 @@ import regularizer import average from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, CUDAPlace +from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace from distribute_transpiler import DistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler from concurrency import (Go, make_channel, channel_send, channel_recv, @@ -57,6 +57,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ 'LoDTensor', 'CPUPlace', 'CUDAPlace', + 'CUDAPinnedPlace', 'Tensor', 'ParamAttr', 'WeightNormParamAttr',