diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 333c5a856b..ef9febe0aa 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -170,6 +170,18 @@ sequence_pool :noindex: +sequence_first_step +------------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_first_step + :noindex: + + +sequence_last_step +------------------ +.. autofunction:: paddle.v2.fluid.layers.sequence_last_step + :noindex: + + pool2d ------ .. autofunction:: paddle.v2.fluid.layers.pool2d diff --git a/doc/design/block.md b/doc/design/block.md index 4066122c0e..fab7f2dc48 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -291,10 +291,10 @@ public: } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& place) const override { PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); for (auto& op : runtime_table_.ops()) { - op->Run(scope, dev_ctx); + op->Run(scope, place); } } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 206e298eb2..25a0db2768 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -30,7 +30,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -59,5 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_library(init SRCS init.cc DEPS gflags executor place stringpiece) +cc_test(threadpool_test SRCS threadpool_test.cc) +cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) cc_test(init_test SRCS init_test.cc DEPS init) + +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context) diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h index 7429de7ee3..7d7a444cf0 100644 --- a/paddle/framework/data_layout.h +++ b/paddle/framework/data_layout.h @@ -14,6 +14,9 @@ limitations under the License. 
*/ #pragma once +#include +#include "paddle/platform/enforce.h" + namespace paddle { namespace framework { @@ -33,5 +36,23 @@ inline DataLayout StringToDataLayout(const std::string& str) { } } +inline std::string DataLayoutToString(const DataLayout& data_layout) { + switch (data_layout) { + case kNHWC: + return "NHWC"; + case kNCHW: + return "NCHW"; + case kAnyLayout: + return "ANY_LAYOUT"; + default: + PADDLE_THROW("unknown DataLayout %d", data_layout); + } +} + +inline std::ostream& operator<<(std::ostream& out, DataLayout l) { + out << DataLayoutToString(l); + return out; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 14ae37ec49..997773c168 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -33,13 +33,7 @@ namespace framework { const std::string kFeedOpType = "feed"; const std::string kFetchOpType = "fetch"; -DeviceContextPool* DeviceContextPool::pool = nullptr; - -Executor::Executor(const std::vector& places) { - DeviceContextPool& pool = DeviceContextPool::Get(); - auto borrowed_contexts = pool.Borrow(places); - device_contexts_.swap(borrowed_contexts); -} +Executor::Executor(const platform::Place& place) : place_(place) {} static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { if (var_type == proto::VarDesc::LOD_TENSOR) { @@ -71,7 +65,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, // - will change to use multiple blocks for RNN op and Cond Op PADDLE_ENFORCE_LT(static_cast(block_id), pdesc.Size()); auto& block = pdesc.Block(block_id); - auto& device = device_contexts_[0]; Scope* local_scope = scope; if (create_vars) { @@ -107,7 +100,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); VLOG(3) << op->DebugString(); - op->Run(*local_scope, *device); + op->Run(*local_scope, place_); } if (create_local_scope) { scope->DeleteScope(local_scope); diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index a3d1609293..d869e18901 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -14,9 +14,6 @@ limitations under the License. */ #pragma once -#include -#include - #include "paddle/framework/op_info.h" #include "paddle/framework/program_desc.h" #include "paddle/framework/scope.h" @@ -26,96 +23,13 @@ limitations under the License.
*/ namespace paddle { namespace framework { -class DeviceContextPool { - public: - static DeviceContextPool& Get() { - PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); - return *pool; - } - - static DeviceContextPool& Create(const std::vector& places) { - if (pool == nullptr) { - pool = new DeviceContextPool(places); - } - return *pool; - } - - const platform::DeviceContext* Borrow(const platform::Place& place) { - auto range = device_contexts_.equal_range(place); - if (range.first == range.second) { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - return range.first->second; - } - - std::vector Borrow( - const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); - std::vector borrowed_contexts; - for (auto& place : places) { - auto range = device_contexts_.equal_range(place); - if (range.first == range.second) { - PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); - } - // TODO(dzhwinter) : assign the first found device. Will enhanced later. - // device load balancer maybe useful here. - borrowed_contexts.emplace_back(range.first->second); - } - return borrowed_contexts; - } - - explicit DeviceContextPool(const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { - device_contexts_.emplace( - places[i], new platform::CPUDeviceContext( - boost::get(places[i]))); - } else if (platform::is_gpu_place(places[i])) { -#ifdef PADDLE_WITH_CUDA - device_contexts_.emplace( - places[i], new platform::CUDADeviceContext( - boost::get(places[i]))); -#else - PADDLE_THROW( - "'GPUPlace' is not supported, Please re-compile with WITH_GPU " - "option"); -#endif - } - } - } - - ~DeviceContextPool() {} - - private: - static DeviceContextPool* pool; - struct Hash { - std::hash hash_; - size_t operator()(const platform::Place& place) const { - return hash_(place.which()); - } - }; - std::unordered_multimap - device_contexts_; - DISABLE_COPY_AND_ASSIGN(DeviceContextPool); -}; - class Executor { public: // TODO(dzhwinter) : Do not rely on this function, it will be removed explicit Executor(const platform::DeviceContext& device) - : Executor(std::vector({device.GetPlace()})) {} - - explicit Executor(const platform::Place& place) - : Executor(std::vector({place})) {} + : Executor(device.GetPlace()) {} - explicit Executor(const std::vector& places); + explicit Executor(const platform::Place& place); /* @Brief * Runtime evaluation of the given ProgramDesc under certain Scope @@ -128,7 +42,7 @@ class Executor { bool create_vars = true); private: - std::vector device_contexts_; + const platform::Place place_; }; } // namespace framework diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 1c4476f4b3..4deb4fa903 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -14,8 +14,8 @@ #include #include -#include "paddle/framework/executor.h" #include "paddle/framework/init.h" +#include "paddle/platform/device_context.h" #include "paddle/platform/place.h" #include "paddle/string/piece.h" @@ -48,7 +48,7 @@ bool InitDevices(const std::vector &devices) { std::vector places; for (auto &device : devices) { auto p = string::Piece(device); - if (string::Find(p, ':', 0) == string::Piece::npos) { + if (string::HasPrefix(p, "CPU")) { places.emplace_back(platform::CPUPlace()); } else if (string::HasPrefix(p, "GPU")) { #ifdef 
PADDLE_WITH_CUDA @@ -69,10 +69,9 @@ bool InitDevices(const std::vector &devices) { return platform::is_cpu_place(place); }) == places.end()) { places.emplace_back(platform::CPUPlace()); - LOG(WARNING) << "Not specified any device, use CPU by Default."; + LOG(WARNING) << "Not specified CPU device, create CPU by Default."; } - DeviceContextPool::Create(places); - return true; + platform::DeviceContextPool::Create(places); return true; } diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc index f65e881a76..cb1ba7ce8f 100644 --- a/paddle/framework/init_test.cc +++ b/paddle/framework/init_test.cc @@ -23,5 +23,9 @@ TEST(Init, InitDevices) { #ifdef PADDLE_WITH_CUDA std::vector ds2 = {"CPU", "GPU:0", "GPU:1"}; ASSERT_EQ(InitDevices(ds2), true); + + // test re-init + std::vector ds3 = {"GPU:0", "GPU:1"}; + ASSERT_EQ(InitDevices(ds3), true); #endif } diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h index 68e9cabb66..aa66cf00f3 100644 --- a/paddle/framework/library_type.h +++ b/paddle/framework/library_type.h @@ -20,7 +20,25 @@ namespace framework { // For more details about the design of LibraryType, Please refer to // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library -enum LibraryType { kPlain = 0; kMKLDNN = 1; kCUDNN = 2; } +enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; + +inline std::string LibraryTypeToString(const LibraryType& library_type) { + switch (library_type) { + case kPlain: + return "PLAIN"; + case kMKLDNN: + return "MKLDNN"; + case kCUDNN: + return "CUDNN"; + default: + PADDLE_THROW("unknown LibraryType %d", library_type); + } +} + +inline std::ostream& operator<<(std::ostream& out, LibraryType l) { + out << LibraryTypeToString(l); + return out; +} } // namespace } // framework diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h new file mode 100644 index 0000000000..e9c45b958c --- /dev/null +++ b/paddle/framework/op_kernel_type.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/data_layout.h" +#include "paddle/framework/data_type.h" +#include "paddle/framework/library_type.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace framework { + +struct OpKernelType { + struct Hash { + size_t operator()(const OpKernelType& key) const { + int place = key.place_.which() + (1 << LEFT_SHIFT); + int data_type = + static_cast(key.data_type_) + (1 << (LEFT_SHIFT + 1)); + int data_layout = + static_cast(key.data_layout_) + (1 << (LEFT_SHIFT + 2)); + int library_type = + static_cast(key.library_type_) + (1 << (LEFT_SHIFT + 3)); + std::hash hasher; + return hasher(place + data_type + data_layout + library_type); + } + }; + + // place, data_type, library_type kinds less than 2^8 + constexpr static int LEFT_SHIFT = 8; + proto::DataType data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + + OpKernelType(proto::DataType data_type, platform::Place place, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(place), + library_type_(library_type) {} + + OpKernelType(proto::DataType data_type, + const platform::DeviceContext& dev_ctx, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain) + : data_type_(data_type), + data_layout_(data_layout), + place_(dev_ctx.GetPlace()), + library_type_(library_type) {} + + bool operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_; + } +}; + +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc new file mode 100644 index 0000000000..899676b5c1 --- /dev/null +++ b/paddle/framework/op_kernel_type_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + std::ostringstream stream; + stream << op_kernel_type; + ASSERT_EQ( + stream.str(), + "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using GPUPlace = paddle::platform::GPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} \ No newline at end of file diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 7f0155b61f..244c117465 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -61,17 +61,6 @@ struct OperatorRegistrar : public Registrar { class OpRegistry { public: - template - static void RegisterOp(const std::string& op_type, - const std::string& grad_op_type) { - OperatorRegistrar reg(op_type.c_str()); - reg.info.grad_op_type_ = grad_op_type; - // register gradient op - if (!grad_op_type.empty()) { - OperatorRegistrar grad_reg(grad_op_type.c_str()); - } - } - static std::unique_ptr CreateOp(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index 27713e5cbf..4cdf6e0865 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -8,8 +8,7 @@ namespace framework { class CosineOp : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} }; class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -28,8 +27,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class MyTestOp : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} }; class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { @@ -76,8 +74,8 @@ TEST(OpRegistry, CreateOp) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::framework::Scope scope; - paddle::platform::CPUDeviceContext dev_ctx; - op->Run(scope, dev_ctx); + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); float scale_get = op->Attr("scale"); ASSERT_EQ(scale_get, scale); } @@ -117,8 +115,8 @@ TEST(OpRegistry, DefaultValue) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::framework::Scope scope; - paddle::platform::CPUDeviceContext dev_ctx; - op->Run(scope, 
dev_ctx); + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); ASSERT_EQ(op->Attr("scale"), 1.0); } @@ -167,9 +165,9 @@ TEST(OpRegistry, CustomChecker) { attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_i(4); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - paddle::platform::CPUDeviceContext dev_ctx; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; - op->Run(scope, dev_ctx); + op->Run(scope, cpu_place); int test_attr = op->Attr("test_attr"); ASSERT_EQ(test_attr, 4); } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 0e58c0b570..f147cc5a6e 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/framework/operator.h" #include #include + +#include "paddle/framework/executor.h" #include "paddle/framework/lod_tensor_array.h" +#include "paddle/framework/operator.h" #include "paddle/framework/shape_inference.h" #include "paddle/framework/var_type.h" @@ -240,12 +242,6 @@ std::vector ExecutionContext::MultiOutput( return res; } -std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) { - os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_ - << "]"; - return os; -} - bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); @@ -388,11 +384,11 @@ class RuntimeInferShapeContext : public InferShapeContext { }; void OperatorWithKernel::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { + const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); this->InferShape(&infer_shape_ctx); - - ExecutionContext ctx(*this, scope, dev_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); + auto dev_ctx = pool.Borrow(place); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); @@ -404,6 +400,8 @@ void OperatorWithKernel::Run(const Scope& scope, // check if op[type] have kernel for kernel_key OpKernelMap& kernels = kernels_iter->second; + + ExecutionContext ctx(*this, scope, *dev_ctx); auto kernel_key = GetKernelType(ctx); auto kernel_iter = kernels.find(kernel_key); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 3207360cba..b592eea1b9 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -23,15 +23,14 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/framework/attribute.h" #include "paddle/framework/block_desc.h" -#include "paddle/framework/data_type.h" #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_info.h" +#include "paddle/framework/op_kernel_type.h" #include "paddle/framework/scope.h" #include "paddle/framework/selected_rows.h" #include "paddle/framework/tensor.h" #include "paddle/platform/device_context.h" -#include "paddle/platform/place.h" #include "paddle/platform/variant.h" #include "paddle/utils/Error.h" @@ -83,8 +82,7 @@ class OperatorBase { virtual std::string DebugString() const; /// Net will call this function to Run an op. 
- virtual void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const = 0; + virtual void Run(const Scope& scope, const platform::Place& place) const = 0; virtual bool IsNetOp() const { return false; } @@ -159,8 +157,7 @@ class OperatorBase { class NOP : public OperatorBase { public: using OperatorBase::OperatorBase; - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} + void Run(const Scope& scope, const platform::Place& place) const override {} std::unique_ptr Clone() const override { return std::unique_ptr(new NOP(*this)); } @@ -345,34 +342,6 @@ class OpKernel : public OpKernelBase { using ELEMENT_TYPE = T; }; -struct OpKernelType { - struct Hash { - std::hash hash_; - size_t operator()(const OpKernelType& key) const { - int place = key.place_.which(); - int data_type = static_cast(key.data_type_); - int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT | - (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1)); - return hash_(pre_hash); - } - }; - - platform::Place place_; - proto::DataType data_type_; - - OpKernelType(proto::DataType data_type, platform::Place place) - : place_(place), data_type_(data_type) {} - - OpKernelType(proto::DataType data_type, - const platform::DeviceContext& dev_ctx) - : place_(dev_ctx.GetPlace()), data_type_(data_type) {} - - bool operator==(const OpKernelType& o) const { - return platform::places_are_same_class(place_, o.place_) && - data_type_ == o.data_type_; - } -}; - class OperatorWithKernel : public OperatorBase { public: using OpKernelMap = @@ -383,8 +352,7 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const final; + void Run(const Scope& scope, const platform::Place& place) const final; static std::unordered_map& AllOpKernels() { @@ -413,8 +381,6 @@ class OperatorWithKernel : public OperatorBase { proto::DataType IndicateDataType(const ExecutionContext& ctx) const; }; -std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key); - extern bool OpSupportGPU(const std::string& op_type); } // namespace framework diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 05a4651522..fbca45b59d 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#include "paddle/framework/operator.h" #include "gtest/gtest.h" + +#include "paddle/framework/init.h" #include "paddle/framework/op_info.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { @@ -27,8 +28,7 @@ class OpWithoutKernelTest : public OperatorBase { OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs), x(1) {} - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, const platform::Place& place) const override { ++op_run_num; ASSERT_EQ(static_cast(inputs_.size()), 1); ASSERT_EQ(static_cast(outputs_.size()), 1); @@ -41,10 +41,9 @@ class OpWithoutKernelTest : public OperatorBase { int x{0}; }; -class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { public: - OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto, - OpAttrChecker* op_checker) + OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); @@ -65,11 +64,12 @@ static void BuildVar(const std::string& param_name, } } -REGISTER_OP_WITHOUT_GRADIENT( - test_operator, paddle::framework::OpWithoutKernelTest, - paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); TEST(OperatorBase, all) { + paddle::framework::InitDevices({"CPU"}); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); @@ -80,13 +80,13 @@ TEST(OperatorBase, all) { attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); scope.Var("OUT1"); ASSERT_EQ(paddle::framework::op_run_num, 0); - op->Run(scope, device_context); + op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::op_run_num, 1); } @@ -123,7 +123,6 @@ template class CPUKernelTest : public OpKernel { public: void Compute(const ExecutionContext& ctx) const { - std::cout << "this is cpu kernel" << std::endl; std::cout << ctx.op().DebugString() << std::endl; cpu_kernel_run_num++; ASSERT_EQ(ctx.op().Input("x"), "IN1"); @@ -195,6 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, // test with single input TEST(OpKernel, all) { + paddle::framework::InitDevices({"CPU"}); paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_kernel"); BuildVar("x", {"IN1"}, op_desc.add_inputs()); @@ -205,12 +205,12 @@ TEST(OpKernel, all) { attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); - op->Run(scope, cpu_device_context); + op->Run(scope, cpu_place); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); } @@ -224,7 +224,9 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, TEST(OpKernel, multi_inputs) { using namespace paddle::framework; 
+ paddle::framework::InitDevices({"CPU"}); proto::OpDesc op_desc; + op_desc.set_type("op_multi_inputs_with_kernel"); BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); BuildVar("k", {"k0"}, op_desc.add_inputs()); @@ -235,7 +237,7 @@ TEST(OpKernel, multi_inputs) { attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); - paddle::platform::CPUDeviceContext cpu_device_context; + paddle::platform::CPUPlace cpu_place; paddle::framework::Scope scope; scope.Var("x0")->GetMutable(); scope.Var("x1")->GetMutable(); @@ -245,7 +247,7 @@ TEST(OpKernel, multi_inputs) { scope.Var("y1")->GetMutable(); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - op->Run(scope, cpu_device_context); + op->Run(scope, cpu_place); } class OperatorClone : public paddle::framework::OperatorBase { @@ -257,10 +259,11 @@ class OperatorClone : public paddle::framework::OperatorBase { const paddle::framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const paddle::framework::Scope& scope, - const paddle::platform::DeviceContext& dev_ctx) const override {} + const paddle::platform::Place& place) const override {} }; TEST(Operator, Clone) { + paddle::framework::InitDevices({"CPU"}); OperatorClone a("ABC", paddle::framework::VariableNameMap{}, paddle::framework::VariableNameMap{}, paddle::framework::AttributeMap{}); diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h new file mode 100644 index 0000000000..9a1ece3ae8 --- /dev/null +++ b/paddle/framework/threadpool.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/platform/call_once.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace framework { + +typedef std::function Task; + +class ThreadPool { + public: + /** + * @brief Get a instance of threadpool, the thread number will + * be specified as the number of hardware thread contexts + */ + static ThreadPool* GetInstance() { + std::call_once(init_flag, &ThreadPool::Init); + return threadpool.get(); + } + + ~ThreadPool() { + { + // notify all threads to stop running + running_ = false; + scheduled_.notify_all(); + } + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } + } + + int GetNumThreads() const { return num_threads_; } + + int GetAvailable() { + std::unique_lock lock(mutex_); + return available_; + } + + /** + * @brief Push a function to the queue, and will be scheduled and + * executed if a thread is available. + * @param[in] Task will be pushed to the task queue. + */ + void Run(const Task& fn) { + std::unique_lock lock(mutex_); + tasks_.push(fn); + lock.unlock(); + scheduled_.notify_one(); + } + + /** + * @brief Wait until all the tasks are completed. 
+ */ + void Wait() { + std::unique_lock lock(mutex_); + completed_.wait(lock, [=] { return Done() == true; }); + } + + private: + ThreadPool& operator=(const ThreadPool&) = delete; + ThreadPool(const ThreadPool&) = delete; + + ThreadPool(int num_threads) + : num_threads_(num_threads), available_(num_threads), running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): bind each thread to a specific CPU core + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } + } + + /** + * @brief If the task queue is empty and available_ + * is equal to the number of threads, it means that + * all tasks are completed. + * + * Note: this function is not thread-safe. + * + * @return true if all tasks are completed. + */ + bool Done() { return tasks_.empty() && available_ == num_threads_; } + + void TaskLoop() { + while (running_) { + std::unique_lock lock(mutex_); + scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); + + if (!running_) { + break; + } + // pop a task from the task queue + auto task = tasks_.front(); + tasks_.pop(); + + --available_; + lock.unlock(); + + // run the task + task(); + + { + std::unique_lock lock(mutex_); + ++available_; + if (Done()) { + completed_.notify_all(); + } + } + } + } + + static void Init() { + if (threadpool.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool.reset(new ThreadPool(num_threads)); + } + } + + private: + static std::unique_ptr threadpool; + static std::once_flag init_flag; + + int num_threads_; + int available_; + bool running_; + std::queue tasks_; + std::vector> threads_; + std::mutex mutex_; + std::condition_variable scheduled_; + std::condition_variable completed_; +}; + +std::unique_ptr ThreadPool::threadpool(nullptr); +std::once_flag ThreadPool::init_flag; +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc new file mode 100644 index 0000000000..78c762608e --- /dev/null +++ b/paddle/framework/threadpool_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "threadpool.h" +#include +#include +#include +#include +#include + +namespace framework = paddle::framework; + +void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { + for (int i = 0; i < cnt; ++i) { + pool->Run([&sum]() { sum.fetch_add(1); }); + } +} + +TEST(ThreadPool, ConcurrentInit) { + framework::ThreadPool* pool; + int concurrent_cnt = 50; + std::vector threads; + for (int i = 0; i < concurrent_cnt; ++i) { + std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } +} + +TEST(ThreadPool, ConcurrentStart) { + framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); + std::atomic sum(0); + std::vector threads; + int concurrent_cnt = 50; + // sum = (n * (n + 1)) / 2 + for (int i = 1; i <= concurrent_cnt; ++i) { + std::thread t(do_sum, pool, std::ref(sum), i); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } + pool->Wait(); + EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); +} diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h index 1f2b4fdb4b..d641918c56 100644 --- a/paddle/operators/array_operator.h +++ b/paddle/operators/array_operator.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -27,11 +28,16 @@ class ArrayOp : public framework::OperatorBase { protected: size_t GetOffset(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const { + const platform::Place &place) const { auto *i = scope.FindVar(Input("I")); PADDLE_ENFORCE(i != nullptr, "I must be set"); auto &i_tensor = i->Get(); PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + size_t offset; if (platform::is_gpu_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index b6ca3cad94..73796229bc 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -12,10 +12,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include + #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -30,7 +32,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &rank_table = scope.FindVar(Input("RankTable"))->Get(); @@ -103,6 +105,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { continue; } auto slice = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx, &slice); out_offset += len; diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index a914ff4ba9..60a913947f 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -15,6 +15,7 @@ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/var_type.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -71,7 +72,7 @@ class AssignOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); if (x == nullptr) { return; @@ -80,6 +81,10 @@ class AssignOp : public framework::OperatorBase { PADDLE_ENFORCE( out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } }; diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index 32756faac5..52c28e7f53 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/beam_search_decode_op.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -55,7 +56,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase { const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); + auto& dev_ctx = *pool.Borrow(dev_place); + framework::ExecutionContext ctx(*this, scope, dev_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h index cc556bfe42..08b551ef9b 100644 --- a/paddle/operators/beam_search_op.h +++ b/paddle/operators/beam_search_op.h @@ -189,7 +189,7 @@ class BeamSearchOp : public framework::OperatorBase { } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& dev_place) const override { LOG(INFO) << "run beam search op"; auto ids_var = scope.FindVar(Input("ids")); auto scores_var = scope.FindVar(Input("scores")); diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index 8c860676e0..455fbd8ca3 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/cond_op.h" - #include "paddle/operators/gather.h" #include "paddle/operators/scatter.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -193,12 +193,15 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope, } } -void CondOp::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { +void CondOp::Run(const Scope& scope, const platform::Place& place) const { + // get device context from pool + platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); + auto& dev_ctx = *pool.Borrow(place); + PrepareDataForSubnet(scope, dev_ctx); std::vector& sub_scopes = GetSubScopes(scope); for (int i = 0; i < BRANCH_NUM; ++i) { - sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx); + sub_net_op_[i]->Run(*sub_scopes[i], place); } MergeDataFromSubnet(scope, dev_ctx); } diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h index 93121fb31b..7dcdc47e0b 100644 --- a/paddle/operators/cond_op.h +++ b/paddle/operators/cond_op.h @@ -78,7 +78,7 @@ class CondOp : public framework::OperatorBase { } void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; + const platform::Place& place) const override; private: const int TRUE_BRANCH = 0; diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index 204be7d1e5..d8fd6420da 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -51,7 +51,7 @@ class ConditionalBlockOp : public ConditionalOp { const framework::AttributeMap &attrs) : ConditionalOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto xs = InputTensors(scope); bool need_run = std::all_of( xs.begin(), xs.end(), @@ -65,8 +65,8 @@ class ConditionalBlockOp : public ConditionalOp { scopes->front() = &scope.NewScope(); auto &cur_scope = 
*scopes->front(); + framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); } } @@ -104,7 +104,7 @@ class ConditionalBlockGradOp : public ConditionalOp { const framework::AttributeMap &attrs) : ConditionalOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto xs = this->InputTensors(scope); bool need_run = std::all_of( xs.begin(), xs.end(), @@ -116,21 +116,21 @@ class ConditionalBlockGradOp : public ConditionalOp { auto &scopes = scope_var->Get>(); framework::Scope &cur_scope = *scopes[0]; + framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); - AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"), + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"), Outputs(framework::GradVarName("Params"))); - AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"), + AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"), Outputs(framework::GradVarName("X"))); } } private: void AssignLocalGradientToGlobal( - const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope, + const platform::Place &place, const framework::Scope &cur_scope, const std::vector &p_names, const std::vector &pg_names) const { for (size_t i = 0; i < p_names.size(); ++i) { @@ -144,7 +144,7 @@ class ConditionalBlockGradOp : public ConditionalOp { auto assign = framework::OpRegistry::CreateOp( "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, framework::AttributeMap{}); - assign->Run(cur_scope, dev_ctx); + assign->Run(cur_scope, place); cur_scope.Rename(new_in_grad_name, in_grad_name); } } diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 66b8080c26..65c98a219b 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -25,7 +25,7 @@ class FeedOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); @@ -47,7 +47,12 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + + framework::CopyFrom(feed_item, place, dev_ctx, out_item); out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 616590f200..21c34512bf 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -14,6 +14,7 @@ #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -26,7 +27,7 @@ class FetchOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const 
override { + const platform::Place &place) const override { auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, @@ -51,6 +52,9 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); dst_item.set_lod(src_item.lod()); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index 3489079eaa..fe0706c4a9 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/math/math_function.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto data_type = static_cast(Attr("dtype")); auto value = Attr("value"); @@ -45,8 +46,11 @@ class FillConstantOp : public framework::OperatorBase { auto cpu = platform::CPUPlace(); out.mutable_data(cpu, framework::ToTypeIndex(data_type)); } else { - out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type)); + out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(dev_place); math::set_constant(dev_ctx, &out, value); } }; diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index f0c6cff8e3..9a2d8aafca 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -15,6 +15,7 @@ #include "paddle/framework/data_type.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -42,7 +43,7 @@ class FillOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &out = detail::Ref(detail::Ref(scope.FindVar(Output("Out")), "Cannot find variable %s", Output("Out")) @@ -51,12 +52,11 @@ class FillOp : public framework::OperatorBase { auto dtype = static_cast(Attr("dtype")); platform::CPUPlace cpu; auto force_cpu = Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), - framework::ToTypeIndex(dtype)); + out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype)); framework::LoDTensor tensor; - if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) { + if (force_cpu || platform::is_cpu_place(place)) { tensor.ShareDataWith(out); } else { // Always make tensor in CPU memory. 
@@ -67,9 +67,11 @@ class FillOp : public framework::OperatorBase { framework::VisitDataType( dtype, FillOpVisitor(&tensor, Attr>("value"))); - if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) { + if (!force_cpu && platform::is_gpu_place(place)) { // Copy tensor to out - framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(tensor, place, dev_ctx, &out); } } }; diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 789c92102d..3988ac12c7 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -52,7 +52,7 @@ class IncrementOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &out = *scope.FindVar(Output("Out"))->GetMutable(); diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc index 3616a0414f..545f87d4ed 100644 --- a/paddle/operators/is_empty_op.cc +++ b/paddle/operators/is_empty_op.cc @@ -29,7 +29,7 @@ class IsEmptyOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { // get input auto *var = scope.FindVar(Input(kInput)); PADDLE_ENFORCE_NOT_NULL(var); diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 6c51dad27a..ae6515bb12 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -11,10 +11,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/framework/op_registry.h" - -#include +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -26,7 +26,7 @@ class LoadOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto filename = Attr("file_path"); std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", @@ -40,7 +40,9 @@ class LoadOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); framework::DeserializeFromStream(fin, tensor); - auto place = dev_ctx.GetPlace(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + if (platform::is_gpu_place(place)) { // copy CPU to GPU framework::LoDTensor cpu_tensor; diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc index cc8593810b..d71cb028bc 100644 --- a/paddle/operators/lod_array_length_op.cc +++ b/paddle/operators/lod_array_length_op.cc @@ -26,7 +26,7 @@ class LoDArrayLengthOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &out = *scope.FindVar(Output("Out"))->GetMutable(); diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc index 2d67046bfe..c351ad8fef 100644 --- a/paddle/operators/lod_rank_table_op.cc +++ b/paddle/operators/lod_rank_table_op.cc @@ -24,7 +24,7 @@ class LoDRankTableOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto x = scope.FindVar(Input("X"))->Get(); auto *out = scope.FindVar(Output("Out"))->GetMutable(); diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 643f8859f3..c7b9057f8d 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -15,6 +15,7 @@ #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -32,7 +33,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", Input("X")) .Get(); @@ -86,6 +87,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase { // out[i][offset: offset+len] = x[each_range.begin: each_range.end] auto slice = out[i].Slice(static_cast(offset), static_cast(offset + len)); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x.Slice(static_cast(each_range.begin), static_cast(each_range.end)), x.place(), 
dev_ctx, &slice); diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index aced2690bc..ddd798dace 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -94,8 +94,8 @@ class ColwiseSum { T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); - for (size_t i = 0; i < height; ++i) { - for (size_t j = 0; j < size; ++j) { + for (size_t i = 0; i < static_cast(height); ++i) { + for (size_t j = 0; j < static_cast(size); ++j) { if (i == 0) { out_buf[j] = in_buf[i * size + j]; } else { diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc index dec2874a1f..8d629fe735 100644 --- a/paddle/operators/max_sequence_len_op.cc +++ b/paddle/operators/max_sequence_len_op.cc @@ -28,7 +28,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &rank_table = scope.FindVar(Input("RankTable"))->Get(); auto *out = diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 5edf29c3af..2287f34791 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -28,7 +28,11 @@ class MergeLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(dev_place); + auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); auto &in_true = scope.FindVar(Input("InTrue"))->Get(); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 599df9c3df..c923e988a5 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -113,7 +113,7 @@ This operator is used to perform matrix multiplication for input $X$ and $Y$. The equation is: - $$Out = X * Y$$ +$$Out = X * Y$$ Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, or not. But the output only shares the LoD information with input $X$. 
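For reference, a minimal sketch (not part of the patch) of the Run(scope, place) pattern this change applies across operators: an operator now receives a bare platform::Place and borrows the matching DeviceContext from the pool created by InitDevices() only when it actually needs one. The CopyOp name and its X/Out variables below are hypothetical, used purely for illustration.

#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"

namespace paddle {
namespace operators {

// Hypothetical operator illustrating the pattern introduced by this patch.
class CopyOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;

  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto &src = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
    auto *dst =
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

    // Borrow the device context that matches `place` from the global pool
    // instead of receiving it as a Run() argument.
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
    auto &dev_ctx = *pool.Borrow(place);

    framework::CopyFrom(src, place, dev_ctx, dst);
    dst->set_lod(src.lod());
  }
};

}  // namespace operators
}  // namespace paddle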
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index e19f534f8a..368d2bfaa1 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -24,7 +24,7 @@ class NCCLInitOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { const auto &name = Output("Communicator"); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name), "Can not find variable '%s' in the scope.", name); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index c1046aadaf..b6e4ccb73f 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -22,6 +22,7 @@ #include #include "paddle/framework/block_desc.h" +#include "paddle/framework/init.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/program_desc.h" @@ -49,7 +50,7 @@ const f::DDim kDims = {100, 100}; class NCCLTester : public ::testing::Test { public: virtual void SetUp() override { - cpu_ctx = new p::CPUDeviceContext(p::CPUPlace()); + paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list.size(); ++i) { p::GPUPlace place(i); dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); @@ -65,6 +66,7 @@ class NCCLTester : public ::testing::Test { } void NCCLInitOp() { + paddle::platform::CPUPlace cpu_place; std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); @@ -76,7 +78,7 @@ class NCCLTester : public ::testing::Test { auto op = f::OpRegistry::CreateOp(*op1); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *cpu_ctx); + op->Run(g_scope, cpu_place); VLOG(1) << "NCCLInitOp finished."; } @@ -111,13 +113,12 @@ class NCCLTester : public ::testing::Test { VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); VLOG(1) << " send_tensor : " << send_tensor->numel() << " recv_tensor : " << recv_tensor->numel(); - op->Run(*scope, *ctx); + op->Run(*scope, place); VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); } public: std::vector dev_ctxs; - p::DeviceContext *cpu_ctx; f::Scope g_scope; std::mutex mu; }; @@ -131,14 +132,14 @@ TEST(NCCL, ncclInitOp) { op_desc->SetAttr("gpus", {gpu_list}); f::Scope g_scope; - std::unique_ptr ctx(new p::CPUDeviceContext(p::CPUPlace())); + paddle::platform::CPUPlace cpu_place; auto *var = g_scope.Var("x1"); var->GetMutable(); auto op = f::OpRegistry::CreateOp(*op_desc); VLOG(1) << "invoke NCCLInitOp."; - op->Run(g_scope, *ctx.get()); + op->Run(g_scope, cpu_place); VLOG(1) << "NCCLInitOp finished."; } @@ -294,9 +295,18 @@ int main(int argc, char **argv) { return 0; } - for (int i = 0; i < dev_count; ++i) { + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::GPUPlace(i)); gpu_list.emplace_back(i); } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Create(places); + testing::InitGoogleTest(&argc, argv); // device context should be release before scope. diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index 8935751f15..85d0153b32 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -65,9 +65,9 @@ class NetOp : public framework::OperatorBase { * will be used. 
*/ void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + const platform::Place& place) const override { for (auto& op : ops_) { - op->Run(scope, dev_ctx); + op->Run(scope, place); } } diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc index 22fba9568d..dfd86546e8 100644 --- a/paddle/operators/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -13,8 +13,7 @@ class TestOp : public framework::OperatorBase { public: using framework::OperatorBase::OperatorBase; DEFINE_OP_CLONE_METHOD(TestOp); - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override { + void Run(const Scope& scope, const platform::Place& place) const override { ++run_cnt; } }; diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc index ab9f67bfe6..c607c93a15 100644 --- a/paddle/operators/positive_negative_pair_op.cc +++ b/paddle/operators/positive_negative_pair_op.cc @@ -154,13 +154,14 @@ class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { "Noting that reducing on the first dim will make the LoD info lost.") .SetDefault(0); AddComment(R"DOC( - PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) - model performance. - Within some context, e.g. the "query", a LTR model generates scores - for a list of items, which gives a partial order of the items. - PositiveNegativePairOp takes a list of reference rank order - (Input("Label")) and the model generated scores (Input(Score)) as - inputs and counts the pairs that ranked correctly and incorrectly. +PositiveNegativePairOp can be used to evaluate Learning To Rank(LTR) model's +performance. + +Within some context, e.g. the "query", a LTR model generates scores for a list +of items, which gives a partial order of the items. PositiveNegativePairOp +takes a list of reference rank order (Input("Label")) and the model generated +scores (Input(Score)) as inputs and counts the pairs that ranked correctly +and incorrectly. 
)DOC"); } }; diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 5981d5745d..77f3a40b76 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -227,14 +227,15 @@ class RecurrentOp : public RecurrentBase { : RecurrentBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto seq_len = static_cast(this->GetSequenceLength(scope)); VLOG(3) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); - framework::Executor executor(dev_ctx); + framework::Executor executor(place); auto *block = Attr(kStepBlock); + auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -270,6 +271,10 @@ class RecurrentOp : public RecurrentBase { executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output this->LinkTensorWithCallback( @@ -278,14 +283,13 @@ class RecurrentOp : public RecurrentBase { framework::LoDTensor *dst_tensor) { if (i == 0) { // create output tensor at begin dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims())); - dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type()); + dst_tensor->mutable_data(place, src_tensor.type()); } auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); // Explicit copy output since the local RNN scope can be destroyed // early. - framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx, - &dst_out); + framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out); }); scopes.Next(); @@ -311,15 +315,20 @@ class RecurrentGradOp : public RecurrentBase { : RecurrentBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto seq_len = static_cast(GetSequenceLength(scope)); StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); - framework::Executor executor(dev_ctx); + framework::Executor executor(place); auto *block = Attr(kStepBlock); + auto *program = block->Program(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? 
step_id : seq_len - step_id - 1; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; @@ -366,8 +375,7 @@ class RecurrentGradOp : public RecurrentBase { auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); - framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx, - cur_grad_tensor); + framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor); } } @@ -410,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase { auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", framework::VariableNameMap{}, {{"Out", {pg_names[param_id]}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, place); } auto new_inside_name = cur_scope.Rename(inside_grad_name); @@ -419,7 +427,7 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); - sum_op->Run(cur_scope, dev_ctx); + sum_op->Run(cur_scope, place); cur_scope.Rename(new_inside_name, inside_grad_name); } @@ -437,11 +445,11 @@ class RecurrentGradOp : public RecurrentBase { } if (step_id == 0) { // alloc memory outside->Resize(PrependDims(seq_len, inside.dims())); - outside->mutable_data(dev_ctx.GetPlace(), inside.type()); + outside->mutable_data(place, inside.type()); } auto dst = outside->Slice(seq_offset, seq_offset + 1); - framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst); + framework::CopyFrom(inside, place, dev_ctx, &dst); }); VLOG(5) << "Link outside gradient finished "; @@ -453,8 +461,8 @@ class RecurrentGradOp : public RecurrentBase { [&](const framework::LoDTensor &inside, framework::LoDTensor *outside) { outside->Resize(inside.dims()); - outside->mutable_data(dev_ctx.GetPlace(), inside.type()); - framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside); + outside->mutable_data(place, inside.type()); + framework::CopyFrom(inside, place, dev_ctx, outside); }); VLOG(5) << "Link initialize state gradient finished "; } diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index ed4206dc4c..7712d98df5 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -73,7 +73,7 @@ class RecvOp : public framework::OperatorBase { } void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { // FIXME(typhoonzero): no new scopes for every run. 
framework::Scope &recv_scope = scope.NewScope(); rpc_service_->SetScope(&recv_scope); @@ -113,7 +113,9 @@ class RecvOp : public framework::OperatorBase { auto *var = recv_scope.Var(grad_var_name); auto *tensor = var->GetMutable(); // FIXME(typhoonzero): do not copy - framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(v.second, place, dev_ctx, tensor); } rpc_service_->Reset(); @@ -121,7 +123,7 @@ class RecvOp : public framework::OperatorBase { framework::proto::ProgramDesc program_desc; program_desc.ParseFromString(program_str); framework::ProgramDesc program(program_desc); - framework::Executor executor(dev_ctx); + framework::Executor executor(place); // Run sub graph to get optimized tensor try { executor.Run(program, &recv_scope, 0, /*global_block*/ diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc new file mode 100644 index 0000000000..09d3ccc356 --- /dev/null +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/lod_rank_table.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { + +class ReorderLoDTensorByRankTableOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) the input lod tensor need to be reordered."); + AddInput("RankTable", + "(LoDRankTable) the rank table that input need follow"); + AddOutput("Out", "(LoDTensor) reordered lod tensor"); + AddComment(R"DOC(ReorderLoDTensorByRankTable + +Reorder the input X by the rank of `RankTable`. If `RankTable` is ordered by +index [3, 0, 2, 1]. Input X will reorder its sequence, the third sequence of +X will be the first sequence of Output. + +NOTE: The RankTable does not need to be calculated by X. + +For example: +The X = [Seq0, Seq1, Seq2, Seq3]. The indices of RankTable are [3, 0, 2, 1]. + +The Out = [Seq3, Seq0, Seq2, Seq1] with correct LoD information. 
+)DOC"); + } +}; + +class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { + public: + ReorderLoDTensorByRankTableBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::Place &place) const override { + auto &x = + detail::Ref(scope.FindVar(Input("X")), + "Cannot find input lod tensor variable %s", Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")), + "Cannot find input rank table variable %s", + Input("RankTable")) + .Get(); + auto &out = + *detail::Ref(scope.FindVar(Output("Out")), + "Cannot find output lod tensor variable %s", Output("Out")) + .GetMutable(); + + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + this->process(place, x, rank_table, &out); + } + + protected: + virtual void process(const platform::Place &place, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const = 0; + + struct AbsoluteRankTableItem { + size_t offset; // the absolute/accumulated offset. + size_t length; // the length + framework::LoD lod; + }; + + std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( + const framework::LoDTensor &x) const { + std::vector absolute_table; + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + + auto &offset = lod_offset.second; + + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } + return absolute_table; + } + + size_t CopyTensorAndLod(const platform::Place &place, + const AbsoluteRankTableItem &item, + const framework::LoDTensor &x, + framework::LoDTensor *out, size_t out_offset) const { + auto &out_lod = *out->mutable_lod(); + auto len = item.length; + auto x_offset = item.offset; + + if (out_lod.empty()) { + for (size_t i = 0; i < item.lod.size(); ++i) { + out_lod.push_back(std::vector({0})); + } + } + + for (size_t i = 0; i < out_lod.size(); ++i) { + auto &out_v = out_lod[i]; + auto &new_lod_v = item.lod[i]; + + for (auto &detail : new_lod_v) { + out_v.push_back(out_v.back() + detail); + } + } + + auto x_sliced = x.Slice(x_offset, x_offset + len); + auto out_sliced = out->Slice(out_offset, out_offset + len); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + out_offset += len; + return out_offset; + } +}; + +class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + size_t out_offset = 0; + out->mutable_lod()->clear(); + for (auto &item : rank_table.items()) { + PADDLE_ENFORCE_LT(item.index, 
absolute_table.size()); + out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out, + out_offset); + } + } +}; + +class IdentityInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ReorderLodTensorByRankGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reorder_lod_tensor_by_rank_grad"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetInput("RankTable", Input("RankTable")); + return std::unique_ptr(grad_op); + } +}; + +class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::Place &place, const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + + // offsets = enumerate([item.index for item in rank_table.items()]) + std::vector> offsets; + offsets.reserve(rank_table.items().size()); + for (size_t i = 0; i < rank_table.items().size(); ++i) { + offsets.push_back({i, rank_table.items()[i].index}); + } + + // offsets.sort(key=lambda x: x[1]) + std::sort( + offsets.begin(), offsets.end(), + [](const std::pair &a, + const std::pair &b) { return a.second < b.second; }); + + // Copy TensorAndLod + size_t out_offset = 0; + for (auto &offset : offsets) { + out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first], + x, out, out_offset); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(reorder_lod_tensor_by_rank, + ops::ReorderLoDTensorByRankTableOp, + ops::ReorderLodTensorByRankGradOpMaker, + ops::ReorderLoDTensorByRankTableOpProtoMaker, + ops::IdentityInferShape); +REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad, + ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape); diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc index 795bdf3e51..edd475ec39 100644 --- a/paddle/operators/rnn_memory_helper_op.cc +++ b/paddle/operators/rnn_memory_helper_op.cc @@ -25,7 +25,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto mem_var_name = Input("X"); auto *mem_var = scope.FindVar(mem_var_name); PADDLE_ENFORCE(mem_var != nullptr, @@ -77,7 +77,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto out_grad_var_name = Input(framework::GradVarName("Out")); auto *out_grad_var = 
scope.FindVar(out_grad_var_name); @@ -100,7 +100,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, dev_place); } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc index a57466a48d..6606613d73 100644 --- a/paddle/operators/save_load_op_test.cc +++ b/paddle/operators/save_load_op_test.cc @@ -21,7 +21,7 @@ USE_NO_KERNEL_OP(load); TEST(SaveLoadOp, CPU) { paddle::framework::Scope scope; paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext ctx(place); + auto var = scope.Var("test_var"); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); @@ -42,13 +42,13 @@ TEST(SaveLoadOp, CPU) { auto save_op = paddle::framework::OpRegistry::CreateOp( "save", {{"X", {"test_var"}}}, {}, attrs); - save_op->Run(scope, ctx); + save_op->Run(scope, place); auto load_var = scope.Var("out_var"); auto target = load_var->GetMutable(); auto load_op = paddle::framework::OpRegistry::CreateOp( "load", {}, {{"Out", {"out_var"}}}, attrs); - load_op->Run(scope, ctx); + load_op->Run(scope, place); int* actual = target->data(); for (int64_t i = 0; i < tensor->numel(); ++i) { EXPECT_EQ(expect[i], actual[i]); diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index eae1146d6c..f763b8d6bf 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -21,6 +21,7 @@ #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -62,7 +63,7 @@ class SaveOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); @@ -88,6 +89,11 @@ class SaveOp : public framework::OperatorBase { "SaveOp only support LoDTensor, %s has wrong type", iname); auto &tensor = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::SerializeToStream(fout, tensor, dev_ctx); } }; diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 48194a547b..3ee6bd190d 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -27,11 +27,11 @@ class ShrinkRNNMemoryOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x_var = scope.FindVar(Input("X")); PADDLE_ENFORCE(x_var != nullptr, "Input X must be set"); auto &x_tensor = x_var->Get(); - size_t offset = this->GetOffset(scope, dev_ctx); + size_t offset = this->GetOffset(scope, place); auto *rank_table_var = scope.FindVar(Input("RankTable")); PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set"); auto &rank_table = rank_table_var->Get(); @@ -93,7 +93,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) 
{} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out"))); auto *dx_var = scope.FindVar(Output(framework::GradVarName("X"))); PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr"); @@ -105,6 +105,10 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { dx_tensor.Resize(x_tensor.dims()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + if (dout_var == nullptr) { // dx_tensor fill zero math::set_constant(dev_ctx, &dx_tensor, 0.0f); } else { diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index 3542d8624f..89826ca6ee 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { auto &x = scope.FindVar(Input("X"))->Get(); auto &mask = scope.FindVar(Input("Mask"))->Get(); auto *out_true = @@ -44,6 +45,9 @@ class SplitLoDTensorOp : public framework::OperatorBase { auto &x_lod = x.lod(); auto &mask_dim = mask.dims(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(dev_place); + std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 90cbc19d1b..2ee9bf700c 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -25,11 +25,11 @@ class WriteToArrayOp : public ArrayOp { : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override { auto *x = scope.FindVar(Input("X")); if (x == nullptr) return; auto &x_tensor = x->Get(); - size_t offset = GetOffset(scope, dev_ctx); + size_t offset = GetOffset(scope, place); auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { @@ -39,7 +39,11 @@ class WriteToArrayOp : public ArrayOp { } if (x_tensor.memory_size() > 0) { auto *out_tensor = &out->at(offset); - CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + + CopyFrom(x_tensor, place, dev_ctx, out_tensor); out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " @@ -119,17 +123,18 @@ class ReadFromArrayOp : public ArrayOp { const framework::AttributeMap &attrs) : ArrayOp(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &place) const override 
{ auto *x = scope.FindVar(Input("X")); PADDLE_ENFORCE(x != nullptr, "X must be set"); auto &x_array = x->Get(); auto *out = scope.FindVar(Output("Out")); PADDLE_ENFORCE(out != nullptr, "Out must be set"); auto *out_tensor = out->GetMutable(); - size_t offset = GetOffset(scope, dev_ctx); + size_t offset = GetOffset(scope, place); if (offset < x_array.size()) { - framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, - out_tensor); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); + auto &dev_ctx = *pool.Borrow(place); + framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { VLOG(10) << "offset " << offset << " >= " << x_array.size(); diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 0109b8bc5c..f18be38434 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -70,18 +70,19 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { Transpose Operator. The input tensor will be permuted according to the axis values given. -The op functions similar to how numpy.transpose works in python. -For example: - >> input = numpy.arange(6).reshape((2,3)) - >> input - array([[0, 1, 2], - [3, 4, 5]]) - >> axis = [1, 0] - >> output = input.transpose(axis) - >> output - array([[0, 3], - [1, 4], - [2, 5]]) +The op functions is similar to how numpy.transpose works in python. + +For example: input = numpy.arange(6).reshape((2,3)) +the input is: +array([[0, 1, 2], + [3, 4, 5]]) +given axis is: [1, 0] + +output = input.transpose(axis) +then the output is: +array([[0, 3], + [1, 4], + [2, 5]]) So, given a input tensor of shape(N, C, H, W) and the axis is {0, 2, 3, 1}, the output tensor shape will be (N, H, W, C) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 7c035c0b48..1b682d5c72 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -53,16 +53,14 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Output shape: $(N, C_{out}, H_{out}, W_{out})$ - Where - $$ - H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ - W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] - $$ - Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 - /07/iccv2011.pdf - )DOC"); +Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, H_{out}, W_{out})$, where +$$ +H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ +W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] +$$ +Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf +)DOC"); } }; diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 324c8b98c4..11ee96faad 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -40,13 +40,14 @@ class WhileOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { + const platform::Place &dev_place) const override { PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); auto &cond = scope.FindVar(Input(kCondition))->Get(); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); - framework::Executor executor(dev_ctx); + framework::Executor 
executor(dev_place); auto *block = Attr(kStepBlock); + auto *program = block->Program(); auto step_scopes = @@ -97,8 +98,8 @@ class WhileGradOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, - const platform::DeviceContext &dev_ctx) const override { - framework::Executor executor(dev_ctx); + const platform::Place &dev_place) const override { + framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); @@ -189,7 +190,7 @@ class WhileGradOp : public framework::OperatorBase { auto zero_op = framework::OpRegistry::CreateOp( "fill_constant", framework::VariableNameMap{}, {{"Out", {pg_names[param_id]}}}, attrs); - zero_op->Run(scope, dev_ctx); + zero_op->Run(scope, dev_place); } } @@ -197,7 +198,7 @@ class WhileGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); - sum_op->Run(cur_scope, dev_ctx); + sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } } diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 88df28a966..f0a0ea70a0 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -25,7 +25,7 @@ ENDIF() # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) -nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) +nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index dacee74fff..a28e9de716 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,6 +15,59 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +DeviceContextPool* DeviceContextPool::pool = nullptr; + +const platform::DeviceContext* DeviceContextPool::Borrow( + const platform::Place& place) { + auto it = device_contexts_.find(place); + if (it == device_contexts_.end()) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return it->second; +} + +std::vector DeviceContextPool::Borrow( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + PADDLE_ENFORCE_LE(places.size(), device_contexts_.size()); + std::vector borrowed_contexts; + for (auto& place : places) { + auto it = device_contexts_.find(place); + if (it != device_contexts_.end()) { + borrowed_contexts.emplace_back(it->second); + } else { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + } + return borrowed_contexts; +} + +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_.emplace(places[i], + new platform::CPUDeviceContext( + boost::get(places[i]))); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_.emplace(places[i], + new platform::CUDADeviceContext( + boost::get(places[i]))); +#else + PADDLE_THROW( + "'GPUPlace' is not supported, Please re-compile with WITH_GPU " + "option"); +#endif + } + } +} + CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 6cc0508522..9b958f7c92 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -11,8 +11,8 @@ limitations under the License. */ #pragma once -#include "paddle/platform/enforce.h" -#include "paddle/platform/place.h" +#include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/platform/dynload/cublas.h" @@ -20,10 +20,13 @@ limitations under the License. */ #include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif -#include + +#include "paddle/platform/enforce.h" #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" +#include "glog/logging.h" + namespace paddle { namespace platform { @@ -105,5 +108,51 @@ class CUDNNDeviceContext : public CUDADeviceContext { #endif +/*! \brief device context pool singleton */ +class DeviceContextPool { + public: + explicit DeviceContextPool(const std::vector& places); + + static DeviceContextPool& Get() { + PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); + return *pool; + } + + /*! \brief Create should only called by Init function */ + static DeviceContextPool& Create(const std::vector& places) { + if (pool == nullptr) { + pool = new DeviceContextPool(places); + } + return *pool; + } + + /*! \brief Return handle of single device context. */ + const platform::DeviceContext* Borrow(const platform::Place& place); + + /*! \brief Return handle of multi-device context. 
*/ + std::vector Borrow( + const std::vector& places); + + ~DeviceContextPool() {} + + private: + static DeviceContextPool* pool; + constexpr static int LEFT_SHIFT = 8; + struct Hash { + std::hash hash_; + size_t operator()(const platform::Place& place) const { + int pre_hash = place.which() + (1 << LEFT_SHIFT); + if (platform::is_gpu_place(place)) { + pre_hash += boost::get(place).GetDeviceId(); + } + return hash_(pre_hash); + } + }; + std::unordered_map + device_contexts_; + DISABLE_COPY_AND_ASSIGN(DeviceContextPool); +}; + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cu similarity index 58% rename from paddle/platform/device_context_test.cc rename to paddle/platform/device_context_test.cu index 109c13a881..f046c79e0a 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cu @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/platform/device_context.h" #include "gtest/gtest.h" +#include "paddle/platform/device_context.h" + +#include "glog/logging.h" TEST(Device, Init) { using paddle::platform::DeviceContext; @@ -62,3 +64,54 @@ TEST(Device, CUDNNDeviceContext) { } } } + +TEST(Device, DeviceContextPool) { + using paddle::platform::DeviceContextPool; + using paddle::platform::CUDADeviceContext; + using paddle::platform::Place; + using paddle::platform::CPUPlace; + using paddle::platform::GPUPlace; + + DeviceContextPool& pool = DeviceContextPool::Get(); + auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); + auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); + EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); + + std::vector gpu_places; + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + gpu_places.emplace_back(GPUPlace(i)); + } + auto dev_ctxs = pool.Borrow(gpu_places); + for (size_t i = 0; i < dev_ctxs.size(); ++i) { + auto* dev_ctx = static_cast(dev_ctxs[i]); + + // check same as GPUPlace(i) + GPUPlace place = boost::get(dev_ctx->GetPlace()); + EXPECT_EQ(place.GetDeviceId(), static_cast(i)); + } +} + +int main(int argc, char** argv) { + int dev_count = paddle::platform::GetCUDADeviceCount(); + if (dev_count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA " + "device count is " + << dev_count; + return 0; + } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::GPUPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Create(places); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h index 11007c1031..cb31e00b8e 100644 --- a/paddle/platform/dynload/nccl.h +++ b/paddle/platform/dynload/nccl.h @@ -63,6 +63,8 @@ extern void LoadNCCLDSO(); __macro(ncclAllReduce); \ __macro(ncclBcast); \ __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclGetErrorString); diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 5abd4d4a34..d1c7be0790 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/platform/macros.h" #include "paddle/string/printf.h" #include "paddle/string/to_string.h" diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 94ab360a19..6750c8da7d 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -12,17 +12,19 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include +#include +#include + #include "glog/logging.h" #include "gtest/gtest.h" + +#include "paddle/framework/init.h" #include "paddle/platform/device_context.h" #include "paddle/platform/dynload/nccl.h" #include "paddle/platform/enforce.h" #include "paddle/platform/gpu_info.h" -#include -#include -#include - static int dev_count = 0; namespace paddle { @@ -31,7 +33,8 @@ namespace platform { TEST(NCCL, init) { std::vector comms; comms.resize(dev_count); - dynload::ncclCommInitAll(comms.data(), dev_count, nullptr); + PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr)); + for (int i = 0; i < dev_count; ++i) { dynload::ncclCommDestroy(comms[i]); } @@ -131,6 +134,18 @@ int main(int argc, char** argv) { << dev_count; return 0; } + + std::vector places; + + places.emplace_back(paddle::platform::CPUPlace()); + int count = paddle::platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + places.emplace_back(paddle::platform::GPUPlace(i)); + } + + VLOG(0) << " DeviceCount " << count; + paddle::platform::DeviceContextPool::Create(places); + testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/paddle/platform/place.h b/paddle/platform/place.h index ca98920d41..daeafbbcd7 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -60,26 +60,18 @@ struct IsGPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const MKLDNNPlace &) const { return false; } bool operator()(const GPUPlace &gpu) const { return true; } + bool operator()(const CUDNNPlace &) const { return true; } }; struct IsMKLDNNPlace : public boost::static_visitor { bool operator()(const MKLDNNPlace &) const { return true; } bool operator()(const CPUPlace &) const { return false; } bool operator()(const GPUPlace &) const { return false; } + bool operator()(const CUDNNPlace &) const { return false; } }; -// Define the max number of Place in bit length. i.e., the max number of places -// should be less equal than 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT) -#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4 - typedef boost::variant Place; -// static check number of place types is less equal than -// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT) -BOOST_MPL_ASSERT((boost::mpl::less_equal< - Place::types::size, - boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>)); - void set_place(const Place &); const Place &get_place(); diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 2d7fe25141..de6b24f70d 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -360,10 +360,10 @@ All parameter, weight, gradient are variables in Paddle. 
}) .def("run", [](OperatorBase &self, const Scope &scope, - const platform::DeviceContext &dev_ctx) { - self.Run(scope, dev_ctx); - dev_ctx.Wait(); - }) + const platform::CPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::GPUPlace &place) { self.Run(scope, place); }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -417,7 +417,7 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "Executor") - .def(py::init &>()) + .def(py::init()) .def("run", &Executor::Run); m.def("unique_integer", UniqueIntegerGenerator); diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 268a0f2fa3..413fd9b046 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -14,9 +14,9 @@ #pragma once #include -#include "paddle/framework/executor.h" #include "paddle/framework/tensor.h" #include "paddle/memory/memcpy.h" +#include "paddle/platform/device_context.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" @@ -63,8 +63,7 @@ struct CastToPyBufferImpl { auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - framework::DeviceContextPool &pool = - framework::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); auto dev_ctx = static_cast( pool.Borrow(tensor.place())); @@ -138,7 +137,7 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - framework::DeviceContextPool &pool = framework::DeviceContextPool::Get(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); auto dev_ctx = static_cast(pool.Borrow(place)); paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index a52f06fe49..68cb5a19f9 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -5,11 +5,3 @@ configure_file(submit_local.sh.in install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) - -configure_file(tools/usage_stat/usage.sh - paddle_usage - @ONLY) - -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle_usage DESTINATION opt/paddle/bin - PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ - GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 43d2d1b410..a94bc01b35 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -165,9 +165,6 @@ case "$1" in "make_diagram") python -m paddle.utils.make_model_diagram ${@:2} ;; - "usage") - $PADDLE_BIN_PATH/paddle_usage ${@:2} - ;; "version") version ;; diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh deleted file mode 100755 index 7dbd1f5884..0000000000 --- a/paddle/scripts/tools/usage_stat/usage.sh +++ /dev/null @@ -1,168 +0,0 @@ -#!/bin/bash - -ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code: -- "$@"` -KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US" -# paddle config home dir, same as paddle -PADDLE_CONF_HOME="$HOME/.config/paddle" -# api url, mirror url(s) will be append later -PD_URLS="http://api.paddlepaddle.org/version" - -usage() -{ - echo "Usage: `basename $0` [options]" - echo "Options:" - echo " -e, --exit-code=EXIT_CODE The 
train/predict process's exit code" - echo " -l, --log-file=LOG_FILE_PATH Read which log file to get the duration of process" - echo " -n, --task-name=TASK_NAME The name of demo or example" - echo " -u, --git-user=GITHUB_USER provide contact info, like username or email" - echo " -v, -i Verbose output and interact with user when necessary" - echo " --help display this help message" -} - -eval set -- "${ARGPARSE}" -while true; do - case "$1" in - -l|--log-file) - log_file=$2 - shift 2 - ;; - -e|--exit-code) - exit_code=$2 - shift 2 - ;; - -u|--git-user) - github_user=$2 - shift 2 - ;; - -n|--task-name) - task=$2 - shift 2 - ;; - -v|-i) - v=1 - shift - ;; - --dry-run) - dry_run=1 - shift - ;; - --) - shift - break - ;; - --help) - usage - exit 0 - ;; - *) - echo "Invalid option $1" - usage - exit 1 - ;; - esac -done - -# parse the log_file to get the time costs -if [ -s "${log_file}" ]; then - duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;} - {if(index($2,":")==3){ - t=substr($2,1,8); - sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2); - if(secsec){min_sec=sec;} - if(max_sec==0 || max_sec/dev/null` - git_url=`git config --get remote.origin.url 2>/dev/null` - if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then - # under a git url, like https://github.com/user_xxx/proj_yyy.git - if [ "${v}" = "1" ]; then echo " from github url..."; fi - github_user=`echo ${git_url} | cut -d "/" -f 4` - if [ "${github_user}" = "PaddlePaddle" ]; then - github_user= - fi - fi - if [ -n "${git_username}" -a -z "${github_user}" ]; then - if [ "${v}" = "1" ]; then echo " from global git username..."; fi - github_user=${git_username} - fi - fi -fi -# allow user to set the user name, if it's not found -if [ -z "${github_user}" -a "${v}" = "1" ]; then - read -p "Please input your github username or email, or just return to keep this feedback anonymous:" - github_user=${REPLY} - if [ -z "${github_user}" ]; then - # empty input, consider as one anonymous user - github_user="${KEEP_ANONYMOUS}" - fi -fi -if [ -n "${github_user}" -a -z "${dry_run}" ]; then - # valid user and not in dry-run mode, then save to cache - mkdir -p ${PADDLE_CONF_HOME} - echo "${github_user}" >${PADDLE_CONF_HOME}/github_user -fi -if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi -if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then - # anonymous user should keep the var empty. - github_user= -fi - -# read local paddle version -paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1` -if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi - -# read local system time -system_time=`date "+%Y%m%d%H%M%S"` -if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi - -# make empty job_name as default value. 
-if [ -z "${task}" ]; then - task="(unknown_task)" -fi -if [ "${v}" = "1" ]; then echo "task: ${task}"; fi - -# concat the curl command -params="content={\"data_type\":\"usage\",\ -\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\ -\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\ -\"duration\":${duration},\"exit_code\":\"${exit_code}\"\ -}&type=1" -curl_cmd_prefix="curl -m 5 -X POST -d ${params}\ - -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie " - -if [ "${dry_run}" = "1" ]; then - first_url=`echo ${PD_URLS} | cut -d " " -f 1` - echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}" - exit 0 -else - for u in ${PD_URLS}; do - curl_cmd="${curl_cmd_prefix} ${u}" - if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi - ${curl_cmd} >/dev/null 2>&1 - if [ $? -eq 0 ]; then - if [ "${v}" = "1" ]; then echo "upload OK!"; fi - exit 0 - else - if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi - fi - done - if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi - exit 1 -fi diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 8132742749..77f84cd43b 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -6,7 +6,6 @@ if(WITH_TESTING) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) if(NOT MOBILE_INFERENCE) - add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) - add_dependencies(paddle_gtest_main paddle_memory gtest gflags) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags) endif() endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index a491322b7e..7ba1bf095a 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "gflags/gflags.h" #include "gtest/gtest.h" +#include "paddle/framework/init.h" #include "paddle/memory/memory.h" int main(int argc, char** argv) { @@ -32,8 +34,11 @@ int main(int argc, char** argv) { google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); testing::InitGoogleTest(&argc, argv); paddle::memory::Used(paddle::platform::CPUPlace()); + std::vector devs = {"CPU"}; #ifdef PADDLE_WITH_CUDA paddle::memory::Used(paddle::platform::GPUPlace(0)); + devs.push_back("GPU:0"); #endif + paddle::framework::InitDevices(devs); return RUN_ALL_TESTS(); } diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 7e118b24a4..19e2ab1b7d 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -270,7 +270,7 @@ class LayerType(object): @staticmethod def is_layer_type(type_name): """ - If type_name is a layer type. + Whether type_name is a layer type. :param type_name: layer type name. Because layer type enumerations are strings. @@ -441,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None): with mixed_layer(size=100) as m: m += full_matrix_projection(input=layer) - 2. When used as an independant object like this, you must set the size: + 2. When used as an independent object like this, you must set the size: .. code-block:: python @@ -451,11 +451,11 @@ def full_matrix_projection(input, size=0, param_attr=None): :param input: The input of this layer. 
:type input: LayerOutput - :param size: The parameter size. Means the width of parameter. + :param size: The dimension of this layer. :type size: int - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A FullMatrixProjection Object. + :return: FullMatrixProjection Object. :rtype: FullMatrixProjection """ proj = FullMatrixProjection( @@ -468,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None): def trans_full_matrix_projection(input, size=0, param_attr=None): """ Different from full_matrix_projection, this projection performs matrix - multiplication, using transpose of weight. + multiplication, using the transpose of weight. .. math:: out.row[i] += in.row[i] * w^\mathrm{T} - :math:`w^\mathrm{T}` means transpose of weight. + :math:`w^\mathrm{T}` means the transpose of weight. The simply usage is: .. code-block:: python @@ -489,9 +489,9 @@ def trans_full_matrix_projection(input, size=0, param_attr=None): :type input: LayerOutput :param size: The parameter size. Means the width of parameter. :type size: int - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A TransposedFullMatrixProjection Object. + :return: TransposedFullMatrixProjection Object. :rtype: TransposedFullMatrixProjection """ proj = TransposedFullMatrixProjection( @@ -521,7 +521,7 @@ def table_projection(input, size=0, param_attr=None): with mixed_layer(size=100) as m: m += table_projection(input=layer) - 2. When used as an independant object like this, you must set the size: + 2. When used as an independent object like this, you must set the size: .. code-block:: python @@ -532,11 +532,11 @@ def table_projection(input, size=0, param_attr=None): :param input: The input of this layer, which must contains id fields. :type input: LayerOutput - :param size: The parameter size. Means the width of parameter. + :param size: The dimension of the output. :type size: int - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A TableProjection Object. + :return: TableProjection Object. :rtype: TableProjection """ proj = TableProjection( @@ -547,7 +547,7 @@ def table_projection(input, size=0, param_attr=None): def identity_projection(input, offset=None, size=None): """ - 1. IdentityProjection if offset=None. It performs: + 1. If offset=None, it performs IdentityProjection as follows: .. math:: out.row[i] += in.row[i] @@ -559,9 +559,8 @@ def identity_projection(input, offset=None, size=None): proj = identity_projection(input=layer) - 2. IdentityOffsetProjection if offset!=None. It likes IdentityProjection, - but layer size may be smaller than input size. - It select dimesions [offset, offset+layer_size) from input: + 2. If offset!=None, It executes IdentityOffsetProjection and takes the + elements of the input in the range [offset, offset+size) as output. .. math:: out.row[i] += in.row[i + \\textrm{offset}] @@ -573,14 +572,20 @@ def identity_projection(input, offset=None, size=None): proj = identity_projection(input=layer, offset=10) - Note that both of two projections should not have any parameter. + Note that neither of the projections have trainable parameter. :param input: The input of this layer. 
:type input: LayerOutput - :param offset: Offset, None if use default. + :param offset: The offset from the start of the input. The input's + elements in the range [offset, offset+size) will be + taken as output. If this parameter is not set or set + to None, the output will be the same as the input. :type offset: int - :return: A IdentityProjection or IdentityOffsetProjection object - :rtype: IdentityProjection or IdentityOffsetProjection + :param size: The dimension of this layer. It will be neglected + when offset is None or not set. + :type size: int + :return: IdentityProjection or IdentityOffsetProjection object + :rtype: IdentityProjection | IdentityOffsetProjection """ if offset is None: proj = IdentityProjection(input_layer_name=input.name) @@ -596,8 +601,8 @@ def identity_projection(input, offset=None, size=None): def slice_projection(input, slices): """ - slice_projection can slice the input value into multiple parts, - and then select some of them to merge into a new output. + slice_projection slices the input value into multiple parts, + then selects and merges some of them into a new output. .. math:: output = [input.slices()] @@ -608,15 +613,13 @@ def slice_projection(input, slices): proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)]) - Note that slice_projection should not have any parameter. + Note that slice_projection has no trainable parameter. :param input: The input of this layer. :type input: LayerOutput - :param slices: An array of slice parameters. - Each slice contains the start and end offsets based - on the input. - :type slices: pair of int - :return: A SliceProjection object + :param slices: A list of start and end offsets of each slice. + :type slices: list of tuple + :return: SliceProjection object. :rtype: SliceProjection """ assert len(slices) >= 1 @@ -636,8 +639,7 @@ def slice_projection(input, slices): @wrap_param_attr_default() def scaling_projection(input, param_attr=None): """ - scaling_projection multiplies the input with a scalar parameter and add to - the output. + scaling_projection multiplies the input with a scalar parameter. .. math:: out += w * in @@ -650,9 +652,9 @@ def scaling_projection(input, param_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A ScalingProjection object + :return: ScalingProjection object. :rtype: ScalingProjection """ proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr) @@ -663,8 +665,8 @@ def scaling_projection(input, param_attr=None): @wrap_param_attr_default() def dotmul_projection(input, param_attr=None): """ - DotMulProjection with a layer as input. - It performs element-wise multiplication with weight. + DotMulProjection takes a layer as input and performs + element-wise multiplication with weight. .. math:: out.row[i] += in.row[i] .* weight @@ -679,9 +681,9 @@ def dotmul_projection(input, param_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param param_attr: Parameter config, None if use default. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: A DotMulProjection Object. + :return: DotMulProjection object. 
:rtype: DotMulProjection """ proj = DotMulProjection( @@ -698,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): out.row[i] += scale * (a.row[i] .* b.row[i]) where :math:`.*` means element-wise multiplication, and - scale is a config scalar, its default value is one. + scale is a config scalar, its default value is 1. The example usage is: @@ -706,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): op = dotmul_operator(a=layer1, b=layer2, scale=0.5) - :param a: Input layer1 + :param a: The first input of this layer. :type a: LayerOutput - :param b: Input layer2 + :param b: The second input of this layer. :type b: LayerOutput - :param scale: config scalar, default value is one. + :param scale: A scalar to scale the product. Its default value is 1. :type scale: float - :return: A DotMulOperator Object. + :return: DotMulOperator object. :rtype: DotMulOperator """ if 'x' in kwargs or 'y' in kwargs: @@ -738,28 +740,29 @@ def context_projection(input, """ Context Projection. - It just simply reorganizes input sequence, combines "context_len" sequence - to one context from context_start. "context_start" will be set to - -(context_len - 1) / 2 by default. If context position out of sequence + It just reorganizes input sequence, combines "context_len" elements of the + sequence to one context from context_start. "context_start" will be set to + -(context_len - 1) / 2 by default. When context position is out of sequence length, padding will be filled as zero if padding_attr = False, otherwise it is trainable. - For example, origin sequence is [A B C D E F G], context len is 3, then - after context projection and not set padding_attr, sequence will + For example, origin sequence is [A B C D E F G], context len is 3, padding_attr + is not set, then after context projection, sequence will be [ 0AB ABC BCD CDE DEF EFG FG0 ]. :param input: The input of this layer, which should be a sequence. :type input: LayerOutput - :param context_len: context length. + :param context_len: The length of the context. :type context_len: int - :param context_start: context start position. Default is + :param context_start: The start position of the context. The default value is -(context_len - 1)/2 :type context_start: int - :param padding_attr: Padding Parameter Attribute. If false, it means padding - always be zero. Otherwise Padding is learnable, and - parameter attribute is set by this parameter. + :param padding_attr: Parameter attribute of the padding. If the parameter is + set to False, padding will be zero. In other cases, the + padding is trainable, and its parameter attribute is set + by this parameter. :type padding_attr: bool | ParameterAttribute - :return: Projection + :return: Projection object. :rtype: Projection """ context_start = -( @@ -791,10 +794,9 @@ class MixedLayerType(LayerOutput): def __init__(self, name, size, act, bias_attr, layer_attr, parents=None): """ - Ctor. - :param name: layer name. + :param name: The name of this layer. :type name: basestring - :param size: layer size. + :param size: The dimension of this layer. :type size: int :param act: Activation type. :type act: BaseActivation @@ -802,8 +804,9 @@ class MixedLayerType(LayerOutput): whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: Extra Layer Attribute. - :type layer_attr: ExtraLayerAttribute or None + :param layer_attr: The extra layer attribute. 
See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute | None """ LayerOutput.__init__( self, @@ -868,12 +871,12 @@ def mixed_layer(size=0, bias_attr=False, layer_attr=None): """ - Mixed Layer. A mixed layer will add all inputs together, then activate. - Each inputs is a projection or operator. + Mixed Layer. A mixed layer will add all inputs together, then activate the sum. + Each input is a projection or operator. There are two styles of usages. - 1. When not set inputs parameter, use mixed_layer like this: + 1. When the parameter input is not set, use mixed_layer like this: .. code-block:: python @@ -889,21 +892,21 @@ def mixed_layer(size=0, input=[full_matrix_projection(input=layer1), full_matrix_projection(input=layer2)]) - :param name: mixed layer name. Can be referenced by other layer. + :param name: The name of this layer. It is optional. :type name: basestring - :param size: layer size. + :param size: The dimension of this layer. :type size: int - :param input: The input of this layer. It is an optional parameter. If set, - then this function will just return layer's name. + :param input: The input of this layer. It is an optional parameter. :param act: Activation Type. LinearActivation is the default activation. :type act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: The extra layer config. Default is None. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute - :return: MixedLayerType object can add inputs or layer name. + :return: MixedLayerType object. :rtype: MixedLayerType """ @@ -938,14 +941,15 @@ def data_layer(name, size, depth=None, height=None, width=None, :param name: The name of this layer. :type name: basestring - :param size: Size of this data layer. + :param size: The dimension of this data layer. :type size: int - :param height: Height of this data layer, used for image + :param height: The height of the input image data. :type height: int | None - :param width: Width of this data layer, used for image + :param width: The width of the input image data. :type width: int | None - :param layer_attr: Extra Layer Attribute. - :type layer_attr: ExtraLayerAttribute. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput """ @@ -978,14 +982,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): :param name: The name of this layer. It is optional. :type name: basestring - :param input: The input of this layer, which must be Index Data. + :param input: The input of this layer, whose type must be Index Data. :type input: LayerOutput - :param size: The embedding dimension. + :param size: The dimension of the embedding vector. :type size: int :param param_attr: The embedding parameter attribute. See ParameterAttribute for details. - :type param_attr: ParameterAttribute | None - :param layer_attr: Extra layer Config. Default is None. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. 
:rtype: LayerOutput @@ -1013,7 +1018,7 @@ def fc_layer(input, bias_attr=None, layer_attr=None): """ - Helper for declare fully connected layer. + The fully connected layer. The example usage is: @@ -1035,17 +1040,18 @@ def fc_layer(input, :type name: basestring :param input: The input of this layer. :type input: LayerOutput | list | tuple - :param size: The layer dimension. + :param size: The dimension of this layer. :type size: int :param act: Activation Type. TanhActivation is the default activation. :type act: BaseActivation - :param param_attr: The Parameter Attribute|list. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: Extra Layer config. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1086,13 +1092,15 @@ def fc_layer(input, @wrap_name_default("print") def printer_layer(input, format=None, name=None): """ - Print the output value of input layers. This layer is useful for debugging. + Print the output value of the layers specified by the parameter input. + This layer is useful for debugging. :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput | list | tuple - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if isinstance(input, LayerOutput): input = [input] @@ -1135,11 +1143,12 @@ def priorbox_layer(input, :param aspect_ratio: The aspect ratio. :type aspect_ratio: list :param variance: The bounding box variance. - :type min_size: The min size of the priorbox width/height. + :type min_size: The minimum size of the priorbox width/height. :param min_size: list - :type max_size: The max size of the priorbox width/height. Could be NULL. + :type max_size: The maximum size of the priorbox width/height. It could be NULL. :param max_size: list - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ # plus one for ratio 1. num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4 @@ -1177,7 +1186,7 @@ def multibox_loss_layer(input_loc, :param name: The name of this layer. It is optional. :type name: basestring - :param input_loc: The input predict locations. + :param input_loc: The input predicted locations. :type input_loc: LayerOutput | List of LayerOutput :param input_conf: The input priorbox confidence. :type input_conf: LayerOutput | List of LayerOutput @@ -1189,13 +1198,15 @@ def multibox_loss_layer(input_loc, :type num_classes: int :param overlap_threshold: The threshold of the overlap. :type overlap_threshold: float - :param neg_pos_ratio: The ratio of the negative bbox to the positive bbox. + :param neg_pos_ratio: The ratio of the negative bounding box to + the positive bounding box. :type neg_pos_ratio: float - :param neg_overlap: The negative bbox overlap threshold. + :param neg_overlap: The negative bounding box overlap threshold. :type neg_overlap: float :param background_id: The background class index. :type background_id: int - :return: LayerOutput + :return: LayerOutput object. 
+ :rtype: LayerOutput """ if isinstance(input_loc, LayerOutput): input_loc = [input_loc] @@ -1258,19 +1269,20 @@ def detection_output_layer(input_loc, :type input_conf: LayerOutput | List of LayerOutput. :param priorbox: The input priorbox location and the variance. :type priorbox: LayerOutput - :param num_classes: The number of the classification. + :param num_classes: The number of the classes. :type num_classes: int :param nms_threshold: The Non-maximum suppression threshold. :type nms_threshold: float - :param nms_top_k: The bbox number kept of the NMS's output + :param nms_top_k: The bounding boxes number kept of the NMS's output. :type nms_top_k: int - :param keep_top_k: The bbox number kept of the layer's output + :param keep_top_k: The bounding boxes number kept of the layer's output. :type keep_top_k: int - :param confidence_threshold: The classification confidence threshold + :param confidence_threshold: The classification confidence threshold. :type confidence_threshold: float :param background_id: The background class index. :type background_id: int - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if isinstance(input_loc, LayerOutput): input_loc = [input_loc] @@ -1326,7 +1338,7 @@ def roi_pool_layer(input, A layer used by Fast R-CNN to extract feature maps of ROIs from the last feature map. - :param name: The Layer Name. + :param name: The name of this layer. It is optional. :type name: basestring :param input: The input layer. :type input: LayerOutput. @@ -1338,9 +1350,10 @@ def roi_pool_layer(input, :type pooled_height: int :param spatial_scale: The spatial scale between the image and feature map. :type spatial_scale: float - :param num_channels: number of input channel. + :param num_channels: The number of the input channels. :type num_channels: int - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ if num_channels is None: assert input.num_filters is not None @@ -1361,18 +1374,19 @@ def roi_pool_layer(input, @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """ - Normalize a layer's output. This layer is necessary for ssd. - This layer applys normalize across the channels of each sample to - a conv layer's output and scale the output by a group of trainable - factors which dimensions equal to the channel's number. + Normalize a layer's output. This layer is necessary for ssd. This + layer applys normalization across the channels of each sample to + a convolutional layer's output and scales the output by a group of + trainable factors whose dimensions equal to the channel's number. :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput - :param param_attr: The Parameter Attribute|list. + :param param_attr: The parameter attribute. See ParameterAttribute for details. :type param_attr: ParameterAttribute - :return: LayerOutput + :return: LayerOutput object. + :rtype: LayerOutput """ assert input.num_filters is not None Layer( @@ -1413,12 +1427,9 @@ def pooling_layer(input, Pooling layer for sequence inputs, not used for Image. If stride > 0, this layer slides a window whose size is determined by stride, - and return the pooling value of the window as the output. Thus, a long sequence - will be shorten. - - The parameter stride specifies the intervals at which to apply the pooling - operation. Note that for sequence with sub-sequence, the default value - of stride is -1. 
+ and returns the pooling value of the sequence in the window as the output. Thus, + a long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The example usage is: @@ -1435,16 +1446,16 @@ def pooling_layer(input, :type name: basestring :param input: The input of this layer. :type input: LayerOutput - :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling, - SumPooling, SquareRootNPooling. + :param pooling_type: Type of pooling. MaxPooling is the default pooling. :type pooling_type: BasePoolingType | None :param stride: The step size between successive pooling regions. - :type stride: Int + :type stride: int :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param layer_attr: The Extra Attributes for layer, such as dropout. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -6618,7 +6629,7 @@ def row_conv_layer(input, .. math:: r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}} - \quad \text{for} \quad (1 \leq i \leq d) + \quad \\text{for} \quad (1 \leq i \leq d) Note: The `context_len` is `k + 1`. That is to say, the lookahead step @@ -6767,7 +6778,7 @@ def gated_unit_layer(input, The gated unit layer implements a simple gating mechanism over the input. The input :math:`X` is first projected into a new space :math:`X'`, and it is also used to produce a gate weight :math:`\sigma`. Element-wise - product between :match:`X'` and :math:`\sigma` is finally returned. + product between :math:`X'` and :math:`\sigma` is finally returned. Reference: `Language Modeling with Gated Convolutional Networks @@ -7463,7 +7474,7 @@ def factorization_machine(input, Factorization Machine with the formula: .. math:: - y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j Note: X is the input vector with size n. V is the factor matrix. Each row of V diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 471255ef50..051b9094aa 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -42,5 +42,10 @@ def __read_gflags_from_env__(): core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) + if core.is_compile_gpu(): + core.init_devices(["CPU", "GPU:0"]) + else: + core.init_devices(["CPU"]) + __read_gflags_from_env__() diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 4b4a0820ab..cdd576294f 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -47,13 +47,14 @@ class Executor(object): act_places.append(p) # TODO(dzhwinter) : consider that our fluid tests all written in - # GPUPlace(gpu_id), this will be changed in next PR. 
+ # GPUPlace(gpu_id), this will be changed in the future if core.is_compile_gpu(): core.init_devices(["CPU", "GPU:0"]) else: core.init_devices(["CPU"]) - self.executor = core.Executor(act_places) + # TODO(dzhwinter) : only use the first place + self.executor = core.Executor(act_places[0]) self.places = places def aslodtensor(self, data): diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 920a49b69c..7b65fe80ae 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -393,7 +393,10 @@ class Operator(object): % (in_proto.name, len(in_args))) in_arg_names = [] for arg in in_args: - in_arg_names.append(arg.name) + if isinstance(arg, basestring): + in_arg_names.append(arg) + else: + in_arg_names.append(arg.name) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 8df30ad76b..a076f26f7f 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -194,3 +194,9 @@ class LayerHelper(object): else: # For integer and boolean types, initialize with all zeros return Constant() + + def is_instance(self, param_name, cls): + param = self.kwargs.get(param_name, None) + if not isinstance(param, cls): + raise TypeError("The input {0} parameter of method {1} must be {2}".format( + param_name, self.layer_type, cls.__name__)) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 5b7979f39f..22a37c22c3 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -3,6 +3,7 @@ from ..framework import Program, Variable, Operator from ..
import core from tensor import assign, fill_constant import contextlib +from ..registry import autodoc __all__ = [ 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard', @@ -10,7 +11,7 @@ __all__ = [ 'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', 'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock', - 'StaticRNN' + 'StaticRNN', 'reorder_lod_tensor_by_rank' ] @@ -1082,3 +1083,18 @@ class DynamicRNN(object): if self.status != DynamicRNN.IN_RNN: raise ValueError("{0} can only be invoked inside rnn block.".format( method)) + + +@autodoc +def reorder_lod_tensor_by_rank(x, rank_table): + helper = LayerHelper('reorder_lod_tensor_by_rank', **locals()) + helper.is_instance('x', Variable) + helper.is_instance('rank_table', Variable) + + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='reorder_lod_tensor_by_rank', + inputs={'X': [x], + 'RankTable': [rank_table]}, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 20d44f0ea2..2adce99d05 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -13,7 +13,8 @@ __all__ = [ 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy', 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', - 'lstm_unit', 'reduce_sum', 'reduce_mean' + 'lstm_unit', 'reduce_sum', 'reduce_mean', 'sequence_first_step', + 'sequence_last_step' ] @@ -574,9 +575,53 @@ def conv2d(input, def sequence_pool(input, pool_type, **kwargs): """ - This function add the operator for sequence pooling. - This is applied on top of the input using pool_type mentioned - in the parameters. + This function add the operator for sequence pooling. + It pools features of all time-steps of each instance, and is applied + on top of the input using pool_type mentioned in the parameters. + + It supports four pool_type: + + - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}` + - sum: :math:`Out[i] = \sum_jX_{ij}` + - sqrt: :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}` + - max: :math:`Out[i] = max(X_i)` + + .. code-block:: text + + x is a 1-level LoDTensor: + x.lod = [[0, 2, 5, 7]] + x.data = [1, 3, 2, 4, 6, 5, 1] + x.dims = [7, 1] + + then output is a Tensor: + out.dim = [3, 1] + with condition len(x.lod[-1]) - 1 == out.dims[0] + + for different pool_type: + average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 + sum : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 + sqrt : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), + 6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2) + max : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) + + Args: + input(variable): The input variable which is a LoDTensor. + pool_type (string): The pooling type of sequence_pool. + It supports average, sum, sqrt and max. + + Returns: + The sequence pooling variable which is a Tensor. + + Examples: + + .. 
code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') + sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') + sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt') + max_x = fluid.layers.sequence_pool(input=x, pool_type='max') """ helper = LayerHelper('sequence_pool', input=input, **kwargs) dtype = helper.input_dtype() @@ -593,6 +638,72 @@ def sequence_pool(input, pool_type, **kwargs): return pool_out +def sequence_first_step(input, **kwargs): + """ + This function gets the first step of each sequence. + + .. code-block:: text + + x is a 1-level LoDTensor: + x.lod = [[0, 2, 5, 7]] + x.data = [1, 3, 2, 4, 6, 5, 1] + x.dims = [7, 1] + + then output is a Tensor: + out.dim = [3, 1] + with condition len(x.lod[-1]) - 1 == out.dims[0] + out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) + + Args: + input(variable): The input variable which is a LoDTensor. + + Returns: + The sequence's first step variable which is a Tensor. + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + x_first_step = fluid.layers.sequence_first_step(input=x) + """ + return sequence_pool(input=input, pool_type="first") + + +def sequence_last_step(input, **kwargs): + """ + This function gets the last step of each sequence. + + .. code-block:: text + + x is a 1-level LoDTensor: + x.lod = [[0, 2, 5, 7]] + x.data = [1, 3, 2, 4, 6, 5, 1] + x.dims = [7, 1] + + then output is a Tensor: + out.dim = [3, 1] + with condition len(x.lod[-1]) - 1 == out.dims[0] + out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) + + Args: + input(variable): The input variable which is a LoDTensor. + + Returns: + The sequence's last step variable which is a Tensor. + + Examples: + + ..
code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + x_last_step = fluid.layers.sequence_last_step(input=x) + """ + return sequence_pool(input=input, pool_type="last") + + def pool2d(input, pool_size, pool_type, diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py index 6f5dd365de..7aa8290611 100644 --- a/python/paddle/v2/fluid/registry.py +++ b/python/paddle/v2/fluid/registry.py @@ -8,7 +8,7 @@ import proto.framework_pb2 as framework_pb2 from framework import OpProtoHolder, Variable, Program, Operator from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -__all__ = ['deprecated', 'register_layer'] +__all__ = ['deprecated', 'register_layer', 'autodoc'] def _convert_(name): @@ -175,12 +175,18 @@ def deprecated(func_or_class): """ Wrap func with deprecated warning """ - warnings.simplefilter('always', DeprecationWarning) #turn off filter + warnings.simplefilter('always', DeprecationWarning) # turn off filter warnings.warn( "Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2) - warnings.simplefilter('default', DeprecationWarning) #reset filter + warnings.simplefilter('default', DeprecationWarning) # reset filter return func(*args, **kwargs) return func_wrapper + + +def autodoc(func): + func.__doc__ = _generate_doc_string_(OpProtoHolder.instance().get_op_proto( + func.__name__)) + return func diff --git a/python/paddle/v2/fluid/tests/__init__.py b/python/paddle/v2/fluid/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 80ffc5a544..e79864b397 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -33,7 +33,7 @@ def encoder_decoder(): fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last") + encoder_out = layers.sequence_last_step(input=lstm_hidden0) # decoder trg_language_word = layers.data( diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index db91ca4f9c..b0c11ba341 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -125,10 +125,11 @@ def model(): # need cos sim inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) + scale_infer = layers.scale(x=inference, scale=5.0) label = layers.data(name='score', shape=[1], dtype='float32') - square_cost = layers.square_error_cost(input=inference, label=label) + square_cost = layers.square_error_cost(input=scale_infer, label=label) avg_cost = layers.mean(x=square_cost) diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index e83c4a0622..087283bfde 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -90,12 +90,10 @@ def get_numeric_gradient(scope, def product(dim): return reduce(lambda a, b: a * b, dim, 1) - ctx = core.DeviceContext.create(core.CPUPlace()) - def get_output(): sum = [] for output_name in output_names: - op.run(scope, ctx) + op.run(scope, core.CPUPlace()) sum.append( 
np.array(scope.find_var(output_name).get_tensor()).mean()) return np.array(sum).mean() diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 903e84c328..1ff3932164 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -113,8 +113,7 @@ class TestSparseAdagradOp(unittest.TestCase): LearningRate='LearningRate', epsilon=2.0) - ctx = core.DeviceContext.create(place) - adagrad_op.run(scope, ctx) + adagrad_op.run(scope, place) # get and compare moment result moment_result_array = np.array(moment) diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index a9c0b1cfd3..dfc047e1f0 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -296,8 +296,7 @@ class TestBatchNormOp(OpTest): momentum=momentum, epsilon=epsilon) - ctx = core.DeviceContext.create(place) - batch_norm_op.run(scope, ctx) + batch_norm_op.run(scope, place) # check forward result self.__assert_close(y_tensor, y_out, "y_out") @@ -320,7 +319,7 @@ class TestBatchNormOp(OpTest): ["y_out", "mean", "variance", "saved_mean", "saved_variance"], place, feed_dict={"y_out": y_grad}) - batch_norm_op_grad.run(scope, ctx) + batch_norm_op_grad.run(scope, place) x_grad_tensor = create_or_get_tensor(scope, grad_var_name("x_val"), None, diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py index 5fad7d8cce..f329214dce 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py @@ -57,8 +57,7 @@ class TestBeamSearchDecodeOp(unittest.TestCase): SentenceIds="sentence_ids", SentenceScores="sentence_scores") - ctx = core.DeviceContext.create(self.cpu_place) - beam_search_decode_op.run(self.scope, ctx) + beam_search_decode_op.run(self.scope, self.cpu_place) expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] self.assertEqual(sentence_ids.lod(), expected_lod) diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py index cc7c09bb59..595f132fa8 100644 --- a/python/paddle/v2/fluid/tests/test_beam_search_op.py +++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py @@ -14,7 +14,6 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): def setUp(self): self.scope = core.Scope() - self.ctx = core.DeviceContext.create(core.CPUPlace()) self._create_ids() self._create_scores() self._create_pre_ids() @@ -32,7 +31,7 @@ class BeamSearchOpTester(unittest.TestCase): level=0, beam_size=2, end_id=0, ) - op.run(self.scope, self.ctx) + op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() print 'selected_ids', np.array(selected_ids) print 'lod', selected_ids.lod() diff --git a/python/paddle/v2/fluid/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py index 9d1df44b90..32e54084e4 100644 --- a/python/paddle/v2/fluid/tests/test_cond_op.py +++ b/python/paddle/v2/fluid/tests/test_cond_op.py @@ -65,8 +65,7 @@ class TestCondOp(unittest.TestCase): self.create_global_variables() self.create_cond_op() self.create_sub_net() - ctx = core.DeviceContext.create(core.CPUPlace()) - self.condop.run(self.scope, ctx) + self.condop.run(self.scope, core.CPUPlace()) return np.array(self.scope.find_var("Out").get_tensor()) def 
create_global_variables(self): diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py index 034266c26f..8090c5f478 100644 --- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py +++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py @@ -63,8 +63,7 @@ class TestDynRNN(unittest.TestCase): all_timesteps = fluid.layers.array_to_lod_tensor( x=out, table=rank_table) - last = fluid.layers.sequence_pool( - input=all_timesteps, pool_type='last') + last = fluid.layers.sequence_last_step(input=all_timesteps) logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits( x=logits, label=label) @@ -101,7 +100,7 @@ class TestDynRNN(unittest.TestCase): rnn.update_memory(mem, out_) rnn.output(out_) - last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') + last = fluid.layers.sequence_last_step(input=rnn()) logits = fluid.layers.fc(input=last, size=1, act=None) label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits( diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index a9d943b8b7..4afe0c6a6d 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -24,7 +24,6 @@ class TestGaussianRandomOp(unittest.TestCase): def gaussian_random_test(self, place): - context = core.DeviceContext.create(place) program = fluid.Program() block = program.global_block() vout = block.create_var(name="Out") diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py index ed6e3fe24f..0a4dd0f4fa 100644 --- a/python/paddle/v2/fluid/tests/test_is_empty_op.py +++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py @@ -33,8 +33,7 @@ class TestIsEmptyOp(unittest.TestCase): def one_case(self, input, target): op = Operator(type="is_empty", X=input, Out="out") - ctx = core.DeviceContext.create(core.CPUPlace()) - op.run(self.scope, ctx) + op.run(self.scope, core.CPUPlace()) out = self.scope.var("out").get_tensor() self.assertEqual(np.array(out)[0], target) diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py new file mode 100644 index 0000000000..8f5774835e --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -0,0 +1,47 @@ +import unittest +import paddle.v2.fluid as fluid +import numpy + + +class TestReorderLoDTensor(unittest.TestCase): + def test_reorder(self): + dat = fluid.layers.data(name='input', shape=[1], lod_level=2) + dat.stop_gradient = False + rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1) + table = fluid.layers.lod_rank_table(rank_dat) + new_dat = fluid.layers.reorder_lod_tensor_by_rank( + x=dat, rank_table=table) + loss = fluid.layers.mean(x=new_dat) + fluid.backward.append_backward_ops(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_startup_program()) + + ref = fluid.Tensor() + ref_lod = [0, 3, 4, 7, 8, 14] + ref.set_lod([ref_lod]) + + ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu) + input = fluid.Tensor() + lod_level_0 = numpy.random.randint(low=1, high=5, size=5) + lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist() + lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1]) + lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist() + + input.set_lod([lod_level_0, 
lod_level_1]) + input.set( + numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'), + cpu) + + ig = exe.run(fluid.default_main_program(), + feed={'input': input, + 'ref': ref}, + fetch_list=['input@GRAD'], + return_numpy=False)[0] + self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001) + self.assertEqual(input.lod(), ig.lod()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py index ca05a381f0..9c345792be 100644 --- a/python/paddle/v2/fluid/tests/test_sgd_op.py +++ b/python/paddle/v2/fluid/tests/test_sgd_op.py @@ -55,8 +55,7 @@ class TestSparseSGDOp(unittest.TestCase): Grad='Grad', ParamOut='Param', LearningRate='LearningRate') - ctx = core.DeviceContext.create(place) - sgd_op.run(scope, ctx) + sgd_op.run(scope, place) # get and compare result result_array = np.array(param) diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index 00b4f19620..d6872c8ba3 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -26,7 +26,6 @@ class TestUniformRandomOp(unittest.TestCase): self.uniform_random_test(place=core.GPUPlace(0)) def uniform_random_test(self, place): - context = core.DeviceContext.create(place) program = fluid.Program() block = program.global_block() vout = block.create_var(name="Out") diff --git a/python/setup.py.in b/python/setup.py.in index 8396fb44cf..66ccfe8087 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -79,8 +79,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: # the prefix is sys.prefix which should always be usr paddle_bin_dir = 'opt/paddle/bin' -paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage', - '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', +paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
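A side note on the identity_projection docstring revised in this patch: its two documented behaviors (pass-through when offset is None, selection of the dimensions [offset, offset+size) otherwise) are easy to mis-read, so here is a minimal NumPy sketch of them. It is an illustration only, not the actual IdentityProjection/IdentityOffsetProjection code, and the helper name identity_projection_ref is made up for this note.

.. code-block:: python

    import numpy as np

    def identity_projection_ref(x, offset=None, size=None):
        # x: [N, D] input rows.
        if offset is None:
            # IdentityProjection: the output equals the input.
            return x.copy()
        # IdentityOffsetProjection: keep dimensions [offset, offset + size).
        return x[:, offset:offset + size].copy()

    x = np.arange(12, dtype='float32').reshape(3, 4)
    print(identity_projection_ref(x).shape)                    # (3, 4)
    print(identity_projection_ref(x, offset=1, size=2).shape)  # (3, 2)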
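The context_projection docstring revised in this patch states that, with context_len = 3 and non-trainable padding, the sequence [A B C D E F G] becomes [ 0AB ABC BCD CDE DEF EFG FG0 ]. The following NumPy sketch reproduces that reorganization under the zero-padding assumption; context_projection_ref is a hypothetical helper written for this note, not the Paddle operator.

.. code-block:: python

    import numpy as np

    def context_projection_ref(seq, context_len, context_start):
        # seq: [T, D] array holding one sequence; returns [T, context_len * D].
        T, D = seq.shape
        rows = []
        for t in range(T):
            ctx = []
            for k in range(context_len):
                pos = t + context_start + k
                if 0 <= pos < T:
                    ctx.append(seq[pos])
                else:
                    ctx.append(np.zeros(D, dtype=seq.dtype))  # zero padding
            rows.append(np.concatenate(ctx))
        return np.stack(rows)

    seq = np.arange(7, dtype='float32').reshape(7, 1)  # stands for [A B C D E F G]
    out = context_projection_ref(seq, context_len=3, context_start=-1)
    print(out.shape)  # (7, 3): rows read [0 A B], [A B C], ..., [F G 0]

The context_start used above is -(context_len - 1) / 2 = -1, matching the default described in the docstring.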
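The sequence_pool, sequence_first_step and sequence_last_step docstrings added by this patch all describe per-sequence reductions over the segments of a 1-level LoDTensor. The sketch below reproduces the documented arithmetic on the example lod = [0, 2, 5, 7], data = [1, 3, 2, 4, 6, 5, 1] in plain NumPy; sequence_pool_ref is a made-up reference helper, not the sequence_pool operator itself.

.. code-block:: python

    import numpy as np

    def sequence_pool_ref(data, lod, pool_type):
        # Pool each LoD segment [lod[i], lod[i + 1]) of `data` (shape [N, 1]).
        out = []
        for start, end in zip(lod[:-1], lod[1:]):
            seq = data[start:end]
            if pool_type == 'average':
                out.append(seq.mean(axis=0))
            elif pool_type == 'sum':
                out.append(seq.sum(axis=0))
            elif pool_type == 'sqrt':
                out.append(seq.sum(axis=0) / np.sqrt(len(seq)))
            elif pool_type == 'max':
                out.append(seq.max(axis=0))
            elif pool_type == 'first':
                out.append(seq[0])
            elif pool_type == 'last':
                out.append(seq[-1])
            else:
                raise ValueError('unsupported pool_type: %s' % pool_type)
        return np.stack(out)

    x = np.array([1, 3, 2, 4, 6, 5, 1], dtype='float32').reshape(7, 1)
    lod = [0, 2, 5, 7]
    print(sequence_pool_ref(x, lod, 'average').ravel())  # [2. 4. 3.]
    print(sequence_pool_ref(x, lod, 'sum').ravel())      # [ 4. 12.  6.]
    print(sequence_pool_ref(x, lod, 'max').ravel())      # [3. 6. 5.]
    print(sequence_pool_ref(x, lod, 'first').ravel())    # [1. 2. 5.]
    print(sequence_pool_ref(x, lod, 'last').ravel())     # [3. 6. 1.]

Feeding the same LoD data through the fluid layers should produce matching values, which is a convenient sanity check when turning the docstring examples into tests.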