From 540cc2c1c1a203758346cd2ce226d7564c0dad88 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 29 Sep 2017 22:11:48 -0700 Subject: [PATCH 01/61] add executor class and interface --- paddle/framework/CMakeLists.txt | 2 + paddle/framework/executor.cc | 108 ++++++++++++++++++++++++++++++ paddle/framework/executor.h | 32 +++++++++ paddle/framework/executor_test.cc | 18 +++++ 4 files changed, 160 insertions(+) create mode 100644 paddle/framework/executor.cc create mode 100644 paddle/framework/executor.h create mode 100644 paddle/framework/executor_test.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 8a5d8532bb..3ee721ac93 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -43,3 +43,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) + +cc_library(executor SRCS executor.cc DEPS device_context framework_proto) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc new file mode 100644 index 0000000000..ccf6716949 --- /dev/null +++ b/paddle/framework/executor.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/framework/executor.h" + +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { + +class LinearListView; +class GraphView; + +// Immutable view of a ProgramDesc organized for efficient execution. +class ProgramDescView { + public: + virtual ~ProgramDescView() {} + virtual void Initialize(const ProgramDesc*) = 0; + static ProgramDescView* Create(bool is_linear); +}; + +class LinearListView : public ProgramDescView { + public: + void Initialize(const ProgramDesc*) override; +}; + +class GraphView : public ProgramDescView { + public: + void Initialize(const ProgramDesc*) override; +}; + +static ProgramDescView* Create(bool is_linear) { + if (is_linear) { + return new LinearListView(); + } else { + return new GraphView(); + } +} + +void LinearListView::Initialize(const ProgramDesc*) { + // get a LinearView of ProgramDesc +} + +void GraphView::Initialize(const ProgramDesc*) { + // get a GraphView of ProgramDesc +} + +class ExecutorImpl : public Executor { + public: + ExecutorImpl(const platform::DeviceContext* ctx, const ProgramDesc* pdesc, + bool is_linear) + : device_context_(ctx), + program_desc_(pdesc), + view_(ProgramDescView::Create(is_linear)) {} + + virtual ~ExecutorImpl() { + if (view_) delete view_; + } + + void Run() override; + + void Initialize(); + + private: + const platform::DeviceContext* device_context_; + const ProgramDesc* program_desc_; + ProgramDescView* view_; +}; + +static Executor* NewLocalExecutor(const platform::Place& place, + const ProgramDesc& pdesc, bool is_linear) { + platform::DeviceContext* device_context = nullptr; + if (platform::is_cpu_place(place)) { + device_context = + new platform::CPUDeviceContext(boost::get(place)); + } +#ifndef PADDLE_ONLY_CPU + else if { + device_context = + new platform::CUDADeviceContext(boost::get(place)); + } +#endif + return new ExecutorImpl(device_context, &pdesc, is_linear); +} + +void ExecutorImpl::Run() { + // operators running + 
device_context_->Wait(); +} + +void ExecutorImpl::Initialize() { + // Initialize the ProgramDescView + view_->Initialize(program_desc_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h new file mode 100644 index 0000000000..69f0e3f18f --- /dev/null +++ b/paddle/framework/executor.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/framework.pb.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace framework { + +class Executor { + public: + virtual ~Executor() {} + virtual void Run() = 0; +}; + +static Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&); + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc new file mode 100644 index 0000000000..f8a41b12ad --- /dev/null +++ b/paddle/framework/executor_test.cc @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/executor.h" +#include "gtest/gtest.h" + +TEST(Executor, Init) {} \ No newline at end of file From 3481bdc865571d2cfac1576d0913ab3f827b5955 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 29 Sep 2017 22:32:41 -0700 Subject: [PATCH 02/61] add global device context --- paddle/framework/executor.cc | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index ccf6716949..8534e70f48 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/executor.h" - +#include #include "paddle/platform/device_context.h" namespace paddle { @@ -78,17 +78,28 @@ class ExecutorImpl : public Executor { ProgramDescView* view_; }; +template +std::unique_ptr make_unique(Args&&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +static std::unique_ptr g_cpu_device_context = + make_unique(platform::CPUPlace()); + +#ifndef PADDLE_ONLY_CPU +static std::unique_ptr g_cuda_device_context = + make_unique(platform::GPUPlace(0)); +#endif + static Executor* NewLocalExecutor(const platform::Place& place, const ProgramDesc& pdesc, bool is_linear) { platform::DeviceContext* device_context = nullptr; if (platform::is_cpu_place(place)) { - device_context = - new platform::CPUDeviceContext(boost::get(place)); + device_context = g_cpu_device_context.get(); } #ifndef PADDLE_ONLY_CPU else if { - device_context = - new platform::CUDADeviceContext(boost::get(place)); + device_context = g_cuda_device_context.get(); } #endif return new ExecutorImpl(device_context, &pdesc, is_linear); From e42cafb24f3868713958213777d798cd54140b40 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 29 Sep 2017 22:50:40 -0700 Subject: [PATCH 03/61] add executor unittest --- paddle/framework/CMakeLists.txt | 1 + paddle/framework/executor.cc | 6 +++--- paddle/framework/executor.h | 2 +- paddle/framework/executor_test.cc | 10 +++++++++- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 3ee721ac93..2cad2e54fa 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -45,3 +45,4 @@ cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) cc_library(executor SRCS executor.cc DEPS device_context framework_proto) +cc_test(executor_test SRCS executor_test.cc DEPS executor) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 8534e70f48..7fda2332b8 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -40,7 +40,7 @@ class GraphView : public ProgramDescView { void Initialize(const ProgramDesc*) override; }; -static ProgramDescView* 
Create(bool is_linear) { +ProgramDescView* ProgramDescView::Create(bool is_linear) { if (is_linear) { return new LinearListView(); } else { @@ -91,8 +91,8 @@ static std::unique_ptr g_cuda_device_context = make_unique(platform::GPUPlace(0)); #endif -static Executor* NewLocalExecutor(const platform::Place& place, - const ProgramDesc& pdesc, bool is_linear) { +Executor* NewLocalExecutor(const platform::Place& place, + const ProgramDesc& pdesc, bool is_linear) { platform::DeviceContext* device_context = nullptr; if (platform::is_cpu_place(place)) { device_context = g_cpu_device_context.get(); diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 69f0e3f18f..25ef2d4d48 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -26,7 +26,7 @@ class Executor { virtual void Run() = 0; }; -static Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&); +Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&, bool); } // namespace framework } // namespace paddle diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index f8a41b12ad..c046ae3158 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -15,4 +15,12 @@ limitations under the License. 
*/ #include "paddle/framework/executor.h" #include "gtest/gtest.h" -TEST(Executor, Init) {} \ No newline at end of file +using namespace paddle::platform; +using namespace paddle::framework; + +TEST(Executor, Init) { + ProgramDesc pdesc; + CPUPlace cpu_place; + Executor* executor = NewLocalExecutor(cpu_place, pdesc, true); + executor->Run(); +} \ No newline at end of file From d4be9730fced2a8effaf06412fa48e2aa0a8c325 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 29 Sep 2017 23:44:52 -0700 Subject: [PATCH 04/61] fix gpu build error --- paddle/framework/executor.cc | 26 +++++++++++++++++--------- paddle/framework/executor_test.cc | 1 + 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 7fda2332b8..b38d6be16f 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -80,26 +80,34 @@ class ExecutorImpl : public Executor { template std::unique_ptr make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); + return std::unique_ptr(new T(std::forward(args)...)); } -static std::unique_ptr g_cpu_device_context = - make_unique(platform::CPUPlace()); +platform::CPUDeviceContext* GetCPUDeviceContext() { + static std::unique_ptr g_cpu_device_context = + make_unique(platform::CPUPlace()); + return g_cpu_device_context.get(); +} #ifndef PADDLE_ONLY_CPU -static std::unique_ptr g_cuda_device_context = - make_unique(platform::GPUPlace(0)); +platform::CUDADeviceContext* GetCUDADeviceContext() { + static std::unique_ptr g_cuda_device_context = + make_unique(platform::GPUPlace(0)); + return g_cuda_device_context.get(); +} #endif Executor* NewLocalExecutor(const platform::Place& place, const ProgramDesc& pdesc, bool is_linear) { platform::DeviceContext* device_context = nullptr; if (platform::is_cpu_place(place)) { - device_context = g_cpu_device_context.get(); - } + device_context = GetCPUDeviceContext(); + } else if (platform::is_gpu_place(place)) { #ifndef 
PADDLE_ONLY_CPU - else if { - device_context = g_cuda_device_context.get(); + device_context = GetCUDADeviceContext(); + } +#else + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); } #endif return new ExecutorImpl(device_context, &pdesc, is_linear); diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index c046ae3158..6f8ca38768 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -23,4 +23,5 @@ TEST(Executor, Init) { CPUPlace cpu_place; Executor* executor = NewLocalExecutor(cpu_place, pdesc, true); executor->Run(); + delete executor; } \ No newline at end of file From b630d4019a0bad74d694633930180912ec19a67c Mon Sep 17 00:00:00 2001 From: qijun Date: Sat, 30 Sep 2017 15:52:05 -0700 Subject: [PATCH 05/61] add scope --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/executor.cc | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 2cad2e54fa..df79bc0e8f 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS device_context framework_proto) +cc_library(executor SRCS executor.cc DEPS device_context scope framework_proto) cc_test(executor_test SRCS executor_test.cc DEPS executor) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index b38d6be16f..52963d20f0 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/framework/executor.h" #include +#include "paddle/framework/scope.h" #include "paddle/platform/device_context.h" namespace paddle { @@ -58,9 +59,10 @@ void GraphView::Initialize(const ProgramDesc*) { class ExecutorImpl : public Executor { public: - ExecutorImpl(const platform::DeviceContext* ctx, const ProgramDesc* pdesc, - bool is_linear) - : device_context_(ctx), + ExecutorImpl(Scope* scope, const platform::DeviceContext* ctx, + const ProgramDesc* pdesc, bool is_linear) + : scope_(scope), + device_context_(ctx), program_desc_(pdesc), view_(ProgramDescView::Create(is_linear)) {} @@ -73,6 +75,7 @@ class ExecutorImpl : public Executor { void Initialize(); private: + Scope* scope_; const platform::DeviceContext* device_context_; const ProgramDesc* program_desc_; ProgramDescView* view_; @@ -80,23 +83,29 @@ class ExecutorImpl : public Executor { template std::unique_ptr make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); + return std::unique_ptr(new T(std::forward(args)...)); } platform::CPUDeviceContext* GetCPUDeviceContext() { static std::unique_ptr g_cpu_device_context = - make_unique(platform::CPUPlace()); + make_unique(platform::CPUPlace()); return g_cpu_device_context.get(); } #ifndef PADDLE_ONLY_CPU platform::CUDADeviceContext* GetCUDADeviceContext() { static std::unique_ptr g_cuda_device_context = - make_unique(platform::GPUPlace(0)); + make_unique(platform::GPUPlace(0)); return g_cuda_device_context.get(); } #endif +framework::Scope* GetScope() { + static std::unique_ptr g_scope = + make_unique(); + return g_scope.get(); +} + Executor* NewLocalExecutor(const platform::Place& place, const ProgramDesc& pdesc, bool is_linear) { platform::DeviceContext* device_context = nullptr; @@ -110,11 +119,12 @@ Executor* NewLocalExecutor(const platform::Place& place, PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); } #endif - return new ExecutorImpl(device_context, &pdesc, is_linear); + return new 
ExecutorImpl(GetScope(), device_context, &pdesc, is_linear); } void ExecutorImpl::Run() { // operators running + scope_->NewVar(); device_context_->Wait(); } From 09500917eee2f3f991b1f92acbb4738d3ea5dba2 Mon Sep 17 00:00:00 2001 From: qijun Date: Sat, 30 Sep 2017 16:44:55 -0700 Subject: [PATCH 06/61] pass place to GetCUDADeviceContext --- paddle/framework/executor.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 52963d20f0..74153f2449 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -86,16 +86,16 @@ std::unique_ptr make_unique(Args&&... args) { return std::unique_ptr(new T(std::forward(args)...)); } -platform::CPUDeviceContext* GetCPUDeviceContext() { +platform::CPUDeviceContext* GetCPUDeviceContext(platform::CPUPlace& place) { static std::unique_ptr g_cpu_device_context = - make_unique(platform::CPUPlace()); + make_unique(place); return g_cpu_device_context.get(); } #ifndef PADDLE_ONLY_CPU -platform::CUDADeviceContext* GetCUDADeviceContext() { +platform::CUDADeviceContext* GetCUDADeviceContext(platform::GPUPlace& place) { static std::unique_ptr g_cuda_device_context = - make_unique(platform::GPUPlace(0)); + make_unique(place); return g_cuda_device_context.get(); } #endif @@ -110,10 +110,12 @@ Executor* NewLocalExecutor(const platform::Place& place, const ProgramDesc& pdesc, bool is_linear) { platform::DeviceContext* device_context = nullptr; if (platform::is_cpu_place(place)) { - device_context = GetCPUDeviceContext(); + auto cpu_place = boost::get(place); + device_context = GetCPUDeviceContext(cpu_place); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_ONLY_CPU - device_context = GetCUDADeviceContext(); + auto gpu_place = boost::get(place); + device_context = GetCUDADeviceContext(gpu_place); } #else PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); From ce4d14b4ed5384dc5fb9eb4e2c6d7f1c6b9bc6dd Mon Sep 17 
00:00:00 2001 From: qijun Date: Sun, 1 Oct 2017 15:08:20 -0700 Subject: [PATCH 07/61] add struct Device --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/executor.cc | 73 ++++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 1168fc38af..129a0eb707 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS device_context scope framework_proto) +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto) cc_test(executor_test SRCS executor_test.cc DEPS executor) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 74153f2449..559cbe125f 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/framework/executor.h" #include +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" #include "paddle/framework/scope.h" #include "paddle/platform/device_context.h" @@ -34,6 +36,9 @@ class ProgramDescView { class LinearListView : public ProgramDescView { public: void Initialize(const ProgramDesc*) override; + + private: + std::vector> ops_; }; class GraphView : public ProgramDescView { @@ -49,20 +54,36 @@ ProgramDescView* ProgramDescView::Create(bool is_linear) { } } -void LinearListView::Initialize(const ProgramDesc*) { +void LinearListView::Initialize(const ProgramDesc* pdesc) { // get a LinearView of ProgramDesc + for (auto& block_desc : pdesc->blocks()) { + for (auto& op_desc : block_desc.ops()) { + ops_.emplace_back(OpRegistry::CreateOp(op_desc)); + } + } } -void GraphView::Initialize(const ProgramDesc*) { +void GraphView::Initialize(const ProgramDesc* pdesc) { // get a GraphView of ProgramDesc } +struct Device { + platform::CPUDeviceContext* cpu_device_context; +#ifndef PADDLE_ONLY_CPU + Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu) + : cpu_device_context(cpu), cuda_device_context(gpu) {} + platform::CDUADeviceContext* cuda_device_context; +#else + explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {} +#endif +}; + class ExecutorImpl : public Executor { public: - ExecutorImpl(Scope* scope, const platform::DeviceContext* ctx, - const ProgramDesc* pdesc, bool is_linear) + ExecutorImpl(Scope* scope, const Device* device, const ProgramDesc* pdesc, + bool is_linear) : scope_(scope), - device_context_(ctx), + device_(device), program_desc_(pdesc), view_(ProgramDescView::Create(is_linear)) {} @@ -76,7 +97,7 @@ class ExecutorImpl : public Executor { private: Scope* scope_; - const platform::DeviceContext* device_context_; + const Device* device_; const ProgramDesc* program_desc_; ProgramDescView* view_; }; @@ -86,20 +107,36 @@ std::unique_ptr make_unique(Args&&... 
args) { return std::unique_ptr(new T(std::forward(args)...)); } -platform::CPUDeviceContext* GetCPUDeviceContext(platform::CPUPlace& place) { +platform::CPUDeviceContext* GetCPUDeviceContext( + const platform::CPUPlace& place) { static std::unique_ptr g_cpu_device_context = make_unique(place); return g_cpu_device_context.get(); } #ifndef PADDLE_ONLY_CPU -platform::CUDADeviceContext* GetCUDADeviceContext(platform::GPUPlace& place) { +platform::CUDADeviceContext* GetCUDADeviceContext( + const platform::GPUPlace& place) { static std::unique_ptr g_cuda_device_context = make_unique(place); return g_cuda_device_context.get(); } #endif +Device* GetDevice(const platform::Place& place) { + platform::CPUPlace cpu_place; +#ifndef PADDLE_ONLY_CPU + platform::GPUPlace gpu_place = boost::get(place); + static std::unique_ptr g_device = make_unique( + GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place)); + return g_device.get(); +#else + static std::unique_ptr g_device = + make_unique(GetCPUDeviceContext(cpu_place)); + return g_device.get(); +#endif +} + framework::Scope* GetScope() { static std::unique_ptr g_scope = make_unique(); @@ -108,26 +145,16 @@ framework::Scope* GetScope() { Executor* NewLocalExecutor(const platform::Place& place, const ProgramDesc& pdesc, bool is_linear) { - platform::DeviceContext* device_context = nullptr; - if (platform::is_cpu_place(place)) { - auto cpu_place = boost::get(place); - device_context = GetCPUDeviceContext(cpu_place); - } else if (platform::is_gpu_place(place)) { -#ifndef PADDLE_ONLY_CPU - auto gpu_place = boost::get(place); - device_context = GetCUDADeviceContext(gpu_place); - } -#else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); - } -#endif - return new ExecutorImpl(GetScope(), device_context, &pdesc, is_linear); + return new ExecutorImpl(GetScope(), GetDevice(place), &pdesc, is_linear); } void ExecutorImpl::Run() { // operators running scope_->NewVar(); - device_context_->Wait(); + 
device_->cpu_device_context->Wait(); +#ifndef PADDLE_ONLY_CPU + device_->cuda_device_context->Wait(); +#endif } void ExecutorImpl::Initialize() { From f29a6b020f633e7c69ae487b7372146c28046597 Mon Sep 17 00:00:00 2001 From: qijun Date: Sun, 1 Oct 2017 15:24:18 -0700 Subject: [PATCH 08/61] fix gpu build error --- paddle/framework/executor.cc | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 559cbe125f..ebe3259bc0 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -69,10 +69,13 @@ void GraphView::Initialize(const ProgramDesc* pdesc) { struct Device { platform::CPUDeviceContext* cpu_device_context; +#ifndef PADDLE_ONLY_CPU + platform::CUDADeviceContext* cuda_device_context; +#endif + #ifndef PADDLE_ONLY_CPU Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu) : cpu_device_context(cpu), cuda_device_context(gpu) {} - platform::CDUADeviceContext* cuda_device_context; #else explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {} #endif @@ -126,10 +129,16 @@ platform::CUDADeviceContext* GetCUDADeviceContext( Device* GetDevice(const platform::Place& place) { platform::CPUPlace cpu_place; #ifndef PADDLE_ONLY_CPU - platform::GPUPlace gpu_place = boost::get(place); - static std::unique_ptr g_device = make_unique( - GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place)); - return g_device.get(); + if (platform::is_gpu_place(place)) { + platform::GPUPlace gpu_place = boost::get(place); + static std::unique_ptr g_device = make_unique( + GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place)); + return g_device.get(); + } else { + static std::unique_ptr g_device = + make_unique(GetCPUDeviceContext(cpu_place), nullptr); + return g_device.get(); + } #else static std::unique_ptr g_device = make_unique(GetCPUDeviceContext(cpu_place)); @@ -153,7 +162,9 @@ void ExecutorImpl::Run() { 
scope_->NewVar(); device_->cpu_device_context->Wait(); #ifndef PADDLE_ONLY_CPU - device_->cuda_device_context->Wait(); + if (device_->cuda_device_context) { + device_->cuda_device_context->Wait(); + } #endif } From b5dbe88b5ab504f88c6e7eaaa8b27d3965701478 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 2 Oct 2017 20:26:17 -0700 Subject: [PATCH 09/61] follow comments --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/executor.cc | 159 +++--------------------------- paddle/framework/executor.h | 14 ++- paddle/framework/executor_test.cc | 12 ++- paddle/platform/CMakeLists.txt | 2 + paddle/platform/device.cc | 59 +++++++++++ paddle/platform/device.h | 45 +++++++++ 7 files changed, 139 insertions(+), 154 deletions(-) create mode 100644 paddle/platform/device.cc create mode 100644 paddle/platform/device.h diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 984fc62aa3..506d0f9833 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,5 +44,5 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto) +cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto) cc_test(executor_test SRCS executor_test.cc DEPS executor) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index ebe3259bc0..57e177bb0a 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -15,162 +15,31 @@ limitations under the License. 
*/ #include "paddle/framework/executor.h" #include #include "paddle/framework/op_registry.h" -#include "paddle/framework/operator.h" #include "paddle/framework/scope.h" -#include "paddle/platform/device_context.h" namespace paddle { namespace framework { -class LinearListView; -class GraphView; - -// Immutable view of a ProgramDesc organized for efficient execution. -class ProgramDescView { - public: - virtual ~ProgramDescView() {} - virtual void Initialize(const ProgramDesc*) = 0; - static ProgramDescView* Create(bool is_linear); -}; - -class LinearListView : public ProgramDescView { - public: - void Initialize(const ProgramDesc*) override; - - private: - std::vector> ops_; -}; - -class GraphView : public ProgramDescView { - public: - void Initialize(const ProgramDesc*) override; -}; - -ProgramDescView* ProgramDescView::Create(bool is_linear) { - if (is_linear) { - return new LinearListView(); - } else { - return new GraphView(); - } -} - -void LinearListView::Initialize(const ProgramDesc* pdesc) { - // get a LinearView of ProgramDesc - for (auto& block_desc : pdesc->blocks()) { - for (auto& op_desc : block_desc.ops()) { - ops_.emplace_back(OpRegistry::CreateOp(op_desc)); - } +Executor::Executor(const std::vector& places) { + devices_.resize(places.size()); + for (size_t i = 0; i < places.size(); i++) { + devices_[i] = platform::GetDevice(places[i]); } } -void GraphView::Initialize(const ProgramDesc* pdesc) { - // get a GraphView of ProgramDesc -} - -struct Device { - platform::CPUDeviceContext* cpu_device_context; -#ifndef PADDLE_ONLY_CPU - platform::CUDADeviceContext* cuda_device_context; -#endif - -#ifndef PADDLE_ONLY_CPU - Device(platform::CPUDeviceContext* cpu, platform::CUDADeviceContext* gpu) - : cpu_device_context(cpu), cuda_device_context(gpu) {} -#else - explicit Device(platform::CPUDeviceContext* cpu) : cpu_device_context(cpu) {} -#endif -}; - -class ExecutorImpl : public Executor { - public: - ExecutorImpl(Scope* scope, const Device* device, const 
ProgramDesc* pdesc, - bool is_linear) - : scope_(scope), - device_(device), - program_desc_(pdesc), - view_(ProgramDescView::Create(is_linear)) {} - - virtual ~ExecutorImpl() { - if (view_) delete view_; - } - - void Run() override; - - void Initialize(); - - private: - Scope* scope_; - const Device* device_; - const ProgramDesc* program_desc_; - ProgramDescView* view_; -}; - -template -std::unique_ptr make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -platform::CPUDeviceContext* GetCPUDeviceContext( - const platform::CPUPlace& place) { - static std::unique_ptr g_cpu_device_context = - make_unique(place); - return g_cpu_device_context.get(); -} - -#ifndef PADDLE_ONLY_CPU -platform::CUDADeviceContext* GetCUDADeviceContext( - const platform::GPUPlace& place) { - static std::unique_ptr g_cuda_device_context = - make_unique(place); - return g_cuda_device_context.get(); -} -#endif - -Device* GetDevice(const platform::Place& place) { - platform::CPUPlace cpu_place; -#ifndef PADDLE_ONLY_CPU - if (platform::is_gpu_place(place)) { - platform::GPUPlace gpu_place = boost::get(place); - static std::unique_ptr g_device = make_unique( - GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place)); - return g_device.get(); - } else { - static std::unique_ptr g_device = - make_unique(GetCPUDeviceContext(cpu_place), nullptr); - return g_device.get(); - } -#else - static std::unique_ptr g_device = - make_unique(GetCPUDeviceContext(cpu_place)); - return g_device.get(); -#endif -} - -framework::Scope* GetScope() { - static std::unique_ptr g_scope = - make_unique(); - return g_scope.get(); -} - -Executor* NewLocalExecutor(const platform::Place& place, - const ProgramDesc& pdesc, bool is_linear) { - return new ExecutorImpl(GetScope(), GetDevice(place), &pdesc, is_linear); -} - -void ExecutorImpl::Run() { +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, + std::vector* outputs) { // operators running - scope_->NewVar(); - 
device_->cpu_device_context->Wait(); + Scope& local_scope = scope->NewScope(); + local_scope.NewVar(); + for (auto device : devices_) { + device->cpu_device_context->Wait(); #ifndef PADDLE_ONLY_CPU - if (device_->cuda_device_context) { - device_->cuda_device_context->Wait(); - } + if (device->cuda_device_context) { + device->cuda_device_context->Wait(); + } #endif -} - -void ExecutorImpl::Initialize() { - // Initialize the ProgramDescView - view_->Initialize(program_desc_); + } } } // namespace framework diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 25ef2d4d48..5d6d7f37a6 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -15,18 +15,22 @@ limitations under the License. */ #pragma once #include "paddle/framework/framework.pb.h" -#include "paddle/platform/place.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/tensor.h" +#include "paddle/platform/device.h" namespace paddle { namespace framework { class Executor { public: - virtual ~Executor() {} - virtual void Run() = 0; -}; + explicit Executor(const std::vector& places); + ~Executor() {} + void Run(const ProgramDesc&, Scope*, std::vector*); -Executor* NewLocalExecutor(const platform::Place&, const ProgramDesc&, bool); + private: + std::vector devices_; +}; } // namespace framework } // namespace paddle diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 6f8ca38768..51d2dfc1c3 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -19,9 +19,15 @@ using namespace paddle::platform; using namespace paddle::framework; TEST(Executor, Init) { + CPUPlace cpu_place1, cpu_place2; + std::vector places; + places.push_back(cpu_place1); + places.push_back(cpu_place2); + Executor* executor = new Executor(places); + ProgramDesc pdesc; - CPUPlace cpu_place; - Executor* executor = NewLocalExecutor(cpu_place, pdesc, true); - executor->Run(); + Scope s; + std::vector* outputs{nullptr}; + 
executor->Run(pdesc, &s, outputs); delete executor; } \ No newline at end of file diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index daf519b91d..b581937393 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -23,5 +23,7 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) +cc_library(device SRCS device.cc DEPS device_context) + nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) diff --git a/paddle/platform/device.cc b/paddle/platform/device.cc new file mode 100644 index 0000000000..7acd87c8c3 --- /dev/null +++ b/paddle/platform/device.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/device.h" + +namespace paddle { +namespace platform { + +template +std::unique_ptr make_unique(Args&&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +CPUDeviceContext* GetCPUDeviceContext(const CPUPlace& place) { + static std::unique_ptr g_cpu_device_context = + make_unique(place); + return g_cpu_device_context.get(); +} + +#ifndef PADDLE_ONLY_CPU +CUDADeviceContext* GetCUDADeviceContext(const GPUPlace& place) { + static std::unique_ptr g_cuda_device_context = + make_unique(place); + return g_cuda_device_context.get(); +} +#endif + +Device* GetDevice(const Place& place) { + CPUPlace cpu_place; +#ifndef PADDLE_ONLY_CPU + if (is_gpu_place(place)) { + GPUPlace gpu_place = boost::get(place); + static std::unique_ptr g_device = make_unique( + GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place)); + return g_device.get(); + } else { + static std::unique_ptr g_device = + make_unique(GetCPUDeviceContext(cpu_place), nullptr); + return g_device.get(); + } +#else + static std::unique_ptr g_device = + make_unique(GetCPUDeviceContext(cpu_place)); + return g_device.get(); +#endif +} +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device.h b/paddle/platform/device.h new file mode 100644 index 0000000000..b1bb8073cf --- /dev/null +++ b/paddle/platform/device.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/platform/device_context.h" +#include "paddle/platform/place.h" + +namespace paddle { +namespace platform { + +struct Device { + CPUDeviceContext* cpu_device_context; +#ifndef PADDLE_ONLY_CPU + CUDADeviceContext* cuda_device_context; +#endif + +#ifndef PADDLE_ONLY_CPU + Device(CPUDeviceContext* cpu, CUDADeviceContext* gpu) + : cpu_device_context(cpu), cuda_device_context(gpu) {} +#else + explicit Device(CPUDeviceContext* cpu) : cpu_device_context(cpu) {} +#endif +}; + +CPUDeviceContext* GetCPUDeviceContext(const platform::CPUPlace& place); + +#ifndef PADDLE_ONLY_CPU +CUDADeviceContext* GetCUDADeviceContext(const platform::GPUPlace& place); +#endif + +Device* GetDevice(const platform::Place& place); +} // namespace platform +} // namespace paddle From 6e2f96841a5d3e64dc1c4eabb85b7984099b1d0e Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 3 Oct 2017 17:36:29 +0000 Subject: [PATCH 10/61] simple test --- paddle/framework/executor.cc | 30 ++++++++++++++++++------ paddle/framework/executor_test.cc | 39 ++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index ebe3259bc0..9e7f6f88df 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/framework/executor.h" #include +#include #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/scope.h" @@ -22,6 +23,8 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +// using std::unique_ptr op_ptr; + class LinearListView; class GraphView; @@ -158,14 +161,27 @@ Executor* NewLocalExecutor(const platform::Place& place, } void ExecutorImpl::Run() { - // operators running - scope_->NewVar(); - device_->cpu_device_context->Wait(); -#ifndef PADDLE_ONLY_CPU - if (device_->cuda_device_context) { - device_->cuda_device_context->Wait(); + // TODO(tonyyang-svail): only runs the first block + auto& block = program_desc_->blocks(0); + + for (auto& var : block.vars()) { + scope_->NewVar(var.name()); } -#endif + + // std::vector ops; + for (auto& op_desc : block.ops()) { + auto op = framework::OpRegistry::CreateOp(op_desc); + op->InferShape(device_->cpu_device_context); + op->Compute(); + } + + // TODO(tonyyang-svail): need to test gpu device + // device_->cpu_device_context->Wait(); + // #ifndef PADDLE_ONLY_CPU + // if (device_->cuda_device_context) { + // device_->cuda_device_context->Wait(); + // } + // #endif } void ExecutorImpl::Initialize() { diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 6f8ca38768..9ab1b65803 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/executor.h" +#include "paddle/framework/attribute.h" + #include "gtest/gtest.h" using namespace paddle::platform; @@ -20,8 +22,43 @@ using namespace paddle::framework; TEST(Executor, Init) { ProgramDesc pdesc; + + auto root_block = pdesc.add_blocks(); + root_block->set_idx(0); + root_block->set_parent_idx(-1); + + auto a = root_block->add_vars(); + a->set_name("a"); + auto a_lt = a->mutable_lod_tensor(); + a_lt->set_data_type(paddle::framework::DataType::FP32); + a_lt->add_dims(640); + a_lt->add_dims(640); + + auto b = root_block->add_vars(); + b->set_name("b"); + auto b_lt = b->mutable_lod_tensor(); + b_lt->set_data_type(paddle::framework::DataType::FP32); + b_lt->add_dims(640); + b_lt->add_dims(640); + + auto c = root_block->add_vars(); + c->set_name("c"); + auto c_lt = c->mutable_lod_tensor(); + c_lt->set_data_type(paddle::framework::DataType::FP32); + c_lt->add_dims(640); + c_lt->add_dims(640); + + auto op1 = root_block->add_ops(); + op1->set_type("elementwise_add"); + auto X = op1->add_inputs(); + X->set_parameter("X"); + X->add_arguments("a"); + auto Y = op1->add_inputs(); + Y->set_parameter("Y"); + Y->add_arguments("b"); + CPUPlace cpu_place; Executor* executor = NewLocalExecutor(cpu_place, pdesc, true); executor->Run(); delete executor; -} \ No newline at end of file +} From e946fc15192e7a05df42aeea0b4bf1b87fb77472 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 3 Oct 2017 19:42:18 +0000 Subject: [PATCH 11/61] add elementwise_add --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/executor.cc | 25 +++++++++++++++++++++++++ paddle/framework/executor.h | 1 + paddle/framework/executor_test.cc | 8 +++++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index cbd39dd095..58e78e9a6a 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,7 +44,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD 
cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto) +cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto ${GLOB_OP_LIB}) cc_test(executor_test SRCS executor_test.cc DEPS executor) cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index a61f0f7162..94b9b3b350 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -31,6 +31,31 @@ Executor::Executor(const std::vector& places) { void Executor::Run(const ProgramDesc& pdesc, Scope* scope, std::vector* outputs) { // operators running + // TODO(tonyyang-svail): + // - only runs the first block + // - only runs on the first device + auto& block = pdesc.blocks(0); + auto& device = devices_[0]; + + for (auto& var : block.vars()) { + scope->NewVar(var.name()); + } + + // std::vector ops; + for (auto& op_desc : block.ops()) { + auto op = framework::OpRegistry::CreateOp(op_desc); + // op->InferShape(*scope); + op->Run(*scope, *device->cpu_device_context); + } + + // TODO(tonyyang-svail): need to test gpu device + // device_->cpu_device_context->Wait(); + // #ifndef PADDLE_ONLY_CPU + // if (device_->cuda_device_context) { + // device_->cuda_device_context->Wait(); + // } + // #endif + Scope& local_scope = scope->NewScope(); local_scope.NewVar(); for (auto device : devices_) { diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 5d6d7f37a6..cdb80bc104 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/framework/framework.pb.h" +#include "paddle/framework/op_info.h" #include "paddle/framework/scope.h" #include "paddle/framework/tensor.h" #include "paddle/platform/device.h" diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 4560d6c503..11255af808 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -13,9 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/executor.h" +#include "gtest/gtest.h" #include "paddle/framework/attribute.h" -#include "gtest/gtest.h" +#include +#include "paddle/framework/grad_op_builder.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +USE_OP(elementwise_add); using namespace paddle::platform; using namespace paddle::framework; From 6c4d1f551d96dda505be54c9a705d5a6784dd062 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 3 Oct 2017 13:43:25 -0700 Subject: [PATCH 12/61] refine codes --- paddle/framework/CMakeLists.txt | 8 +- paddle/framework/executor.cc | 44 ++++---- paddle/framework/executor.h | 4 +- paddle/framework/executor_test.cc | 103 ++++++++++-------- paddle/platform/CMakeLists.txt | 2 +- paddle/platform/device.cc | 59 ---------- paddle/platform/device_context_manager.cc | 68 ++++++++++++ .../{device.h => device_context_manager.h} | 45 +++++--- 8 files changed, 188 insertions(+), 145 deletions(-) delete mode 100644 paddle/platform/device.cc create mode 100644 paddle/platform/device_context_manager.cc rename paddle/platform/{device.h => device_context_manager.h} (52%) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 58e78e9a6a..898b3a990d 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,8 +44,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS 
backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry device scope framework_proto ${GLOB_OP_LIB}) -cc_test(executor_test SRCS executor_test.cc DEPS executor) +cc_library(executor SRCS executor.cc DEPS op_registry device_context_manager scope framework_proto ${GLOB_OP_LIB}) +if(WITH_GPU) + nv_test(executor_test SRCS executor_test.cc DEPS executor) +else() + cc_test(executor_test SRCS executor_test.cc DEPS executor) +endif() cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 94b9b3b350..717f9bf81a 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -22,9 +22,21 @@ namespace paddle { namespace framework { Executor::Executor(const std::vector& places) { - devices_.resize(places.size()); + device_contexts_.resize(places.size()); for (size_t i = 0; i < places.size(); i++) { - devices_[i] = platform::GetDevice(places[i]); + if (platform::is_cpu_place(places[i])) { + device_contexts_[i] = platform::DeviceContextManager::Get() + ->GetDeviceContext( + boost::get(places[i])); + } else { +#ifndef PADDLE_ONLY_CPU + device_contexts_[i] = platform::DeviceContextManager::Get() + ->GetDeviceContext( + boost::get(places[i])); +#else + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); +#endif + } } } @@ -34,37 +46,25 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, // TODO(tonyyang-svail): // - only runs the first block // - only runs on the first device + Scope& local_scope = scope->NewScope(); + auto& block = pdesc.blocks(0); - auto& device = devices_[0]; + auto& device_context = device_contexts_[0]; for (auto& var : block.vars()) { - scope->NewVar(var.name()); + local_scope.NewVar(var.name()); } // std::vector ops; for (auto& op_desc : block.ops()) { auto op = framework::OpRegistry::CreateOp(op_desc); 
- // op->InferShape(*scope); - op->Run(*scope, *device->cpu_device_context); + // InferShape is now doing inside Run method. + op->Run(local_scope, *device_context); } // TODO(tonyyang-svail): need to test gpu device - // device_->cpu_device_context->Wait(); - // #ifndef PADDLE_ONLY_CPU - // if (device_->cuda_device_context) { - // device_->cuda_device_context->Wait(); - // } - // #endif - - Scope& local_scope = scope->NewScope(); - local_scope.NewVar(); - for (auto device : devices_) { - device->cpu_device_context->Wait(); -#ifndef PADDLE_ONLY_CPU - if (device->cuda_device_context) { - device->cuda_device_context->Wait(); - } -#endif + for (auto device_context : device_contexts_) { + device_context->Wait(); } } diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index cdb80bc104..795b8ffdab 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/framework/op_info.h" #include "paddle/framework/scope.h" #include "paddle/framework/tensor.h" -#include "paddle/platform/device.h" +#include "paddle/platform/device_context_manager.h" namespace paddle { namespace framework { @@ -30,7 +30,7 @@ class Executor { void Run(const ProgramDesc&, Scope*, std::vector*); private: - std::vector devices_; + std::vector device_contexts_; }; } // namespace framework diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 11255af808..810ff2a512 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -15,8 +15,6 @@ limitations under the License. 
*/ #include "paddle/framework/executor.h" #include "gtest/gtest.h" #include "paddle/framework/attribute.h" - -#include #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -26,52 +24,71 @@ USE_OP(elementwise_add); using namespace paddle::platform; using namespace paddle::framework; -TEST(Executor, Init) { - ProgramDesc pdesc; - - auto root_block = pdesc.add_blocks(); - root_block->set_idx(0); - root_block->set_parent_idx(-1); - - auto a = root_block->add_vars(); - a->set_name("a"); - auto a_lt = a->mutable_lod_tensor(); - a_lt->set_data_type(paddle::framework::DataType::FP32); - a_lt->add_dims(640); - a_lt->add_dims(640); - - auto b = root_block->add_vars(); - b->set_name("b"); - auto b_lt = b->mutable_lod_tensor(); - b_lt->set_data_type(paddle::framework::DataType::FP32); - b_lt->add_dims(640); - b_lt->add_dims(640); - - auto c = root_block->add_vars(); - c->set_name("c"); - auto c_lt = c->mutable_lod_tensor(); - c_lt->set_data_type(paddle::framework::DataType::FP32); - c_lt->add_dims(640); - c_lt->add_dims(640); - - auto op1 = root_block->add_ops(); - op1->set_type("elementwise_add"); - auto X = op1->add_inputs(); - X->set_parameter("X"); - X->add_arguments("a"); - auto Y = op1->add_inputs(); - Y->set_parameter("Y"); - Y->add_arguments("b"); - - CPUPlace cpu_place1, cpu_place2; +class ExecutorTester : public ::testing::Test { + public: + virtual void SetUp() override { + auto root_block = pdesc_.add_blocks(); + root_block->set_idx(0); + root_block->set_parent_idx(-1); + + auto a = root_block->add_vars(); + a->set_name("a"); + auto a_lt = a->mutable_lod_tensor(); + a_lt->set_data_type(paddle::framework::DataType::FP32); + a_lt->add_dims(640); + a_lt->add_dims(640); + + auto b = root_block->add_vars(); + b->set_name("b"); + auto b_lt = b->mutable_lod_tensor(); + b_lt->set_data_type(paddle::framework::DataType::FP32); + b_lt->add_dims(640); + b_lt->add_dims(640); + + auto c = 
root_block->add_vars(); + c->set_name("c"); + auto c_lt = c->mutable_lod_tensor(); + c_lt->set_data_type(paddle::framework::DataType::FP32); + c_lt->add_dims(640); + c_lt->add_dims(640); + + auto op1 = root_block->add_ops(); + op1->set_type("elementwise_add"); + auto X = op1->add_inputs(); + X->set_parameter("X"); + X->add_arguments("a"); + auto Y = op1->add_inputs(); + Y->set_parameter("Y"); + Y->add_arguments("b"); + } + + protected: + std::vector* outputs_{nullptr}; + ProgramDesc pdesc_; + Scope scope_; +}; + +TEST_F(ExecutorTester, InitCPU) { std::vector places; + CPUPlace cpu_place1, cpu_place2; places.push_back(cpu_place1); places.push_back(cpu_place2); Executor* executor = new Executor(places); - Scope s; - std::vector* outputs{nullptr}; - executor->Run(pdesc, &s, outputs); + executor->Run(pdesc_, &scope_, outputs_); + delete executor; +} + +#ifndef PADDLE_ONLY_CPU +TEST_F(ExecutorTester, InitGPU) { + std::vector places; + GPUPlace gpu_place0(0); + GPUPlace gpu_place1(1); + places.push_back(gpu_place0); + places.push_back(gpu_place1); + Executor* executor = new Executor(places); + executor->Run(pdesc_, &scope_, outputs_); delete executor; } +#endif diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index b581937393..b4ddf721dd 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) -cc_library(device SRCS device.cc DEPS device_context) +cc_library(device_context_manager SRCS device_context_manager.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) diff --git a/paddle/platform/device.cc b/paddle/platform/device.cc deleted 
file mode 100644 index 7acd87c8c3..0000000000 --- a/paddle/platform/device.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/platform/device.h" - -namespace paddle { -namespace platform { - -template -std::unique_ptr make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - -CPUDeviceContext* GetCPUDeviceContext(const CPUPlace& place) { - static std::unique_ptr g_cpu_device_context = - make_unique(place); - return g_cpu_device_context.get(); -} - -#ifndef PADDLE_ONLY_CPU -CUDADeviceContext* GetCUDADeviceContext(const GPUPlace& place) { - static std::unique_ptr g_cuda_device_context = - make_unique(place); - return g_cuda_device_context.get(); -} -#endif - -Device* GetDevice(const Place& place) { - CPUPlace cpu_place; -#ifndef PADDLE_ONLY_CPU - if (is_gpu_place(place)) { - GPUPlace gpu_place = boost::get(place); - static std::unique_ptr g_device = make_unique( - GetCPUDeviceContext(cpu_place), GetCUDADeviceContext(gpu_place)); - return g_device.get(); - } else { - static std::unique_ptr g_device = - make_unique(GetCPUDeviceContext(cpu_place), nullptr); - return g_device.get(); - } -#else - static std::unique_ptr g_device = - make_unique(GetCPUDeviceContext(cpu_place)); - return g_device.get(); -#endif -} -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/device_context_manager.cc 
b/paddle/platform/device_context_manager.cc new file mode 100644 index 0000000000..156d317c8a --- /dev/null +++ b/paddle/platform/device_context_manager.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/device_context_manager.h" + +namespace paddle { +namespace platform { + +DeviceContextManager::DeviceContextManager() { +#ifndef PADDLE_ONLY_CPU + device_count_ = GetDeviceCount(); + cuda_contexts_.reserve(device_count_); + for (int i = 0; i < device_count_; i++) { + cuda_contexts_[i] = nullptr; + } +#endif +} + +template <> +CPUDeviceContext* DeviceContextManager::GetDeviceContext< + CPUPlace, CPUDeviceContext>(const CPUPlace& place) { + if (!cpu_context_) { + cpu_context_ = new CPUDeviceContext(place); + } + return cpu_context_; +} + +#ifndef PADDLE_ONLY_CPU +template <> +CUDADeviceContext* DeviceContextManager::GetDeviceContext< + GPUPlace, CUDADeviceContext>(const GPUPlace& place) { + int gpu_id = place.device; + PADDLE_ENFORCE(gpu_id < device_count_, + "GPU device id must less than device count"); + SetDeviceId(gpu_id); + if (!cuda_contexts_[gpu_id]) { + cuda_contexts_[gpu_id] = new CUDADeviceContext(place); + } + return cuda_contexts_[gpu_id]; +} +#endif + +DeviceContextManager::~DeviceContextManager() { + if (cpu_context_) { + delete cpu_context_; + } +#ifndef PADDLE_ONLY_CPU + for (int i = 0; i < device_count_; i++) { + if (cuda_contexts_[i]) { + delete 
cuda_contexts_[i]; + } + } +#endif +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device.h b/paddle/platform/device_context_manager.h similarity index 52% rename from paddle/platform/device.h rename to paddle/platform/device_context_manager.h index b1bb8073cf..da15808a60 100644 --- a/paddle/platform/device.h +++ b/paddle/platform/device_context_manager.h @@ -13,33 +13,46 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - #include "paddle/platform/device_context.h" -#include "paddle/platform/place.h" namespace paddle { namespace platform { -struct Device { - CPUDeviceContext* cpu_device_context; -#ifndef PADDLE_ONLY_CPU - CUDADeviceContext* cuda_device_context; -#endif +template +struct Converter; + +template <> +struct Converter { + using DeviceContextType = CPUDeviceContext; +}; #ifndef PADDLE_ONLY_CPU - Device(CPUDeviceContext* cpu, CUDADeviceContext* gpu) - : cpu_device_context(cpu), cuda_device_context(gpu) {} -#else - explicit Device(CPUDeviceContext* cpu) : cpu_device_context(cpu) {} -#endif +template <> +struct Converter { + using DeviceContextType = CUDADeviceContext; }; +#endif + +class DeviceContextManager { + public: + DeviceContextManager(); + ~DeviceContextManager(); + + template ::DeviceContextType> + DeviceType* GetDeviceContext(const PlaceType& place); -CPUDeviceContext* GetCPUDeviceContext(const platform::CPUPlace& place); + static DeviceContextManager* Get() { + static DeviceContextManager inst; + return &inst; + } + private: + CPUDeviceContext* cpu_context_; #ifndef PADDLE_ONLY_CPU -CUDADeviceContext* GetCUDADeviceContext(const platform::GPUPlace& place); + int device_count_; + std::vector cuda_contexts_; #endif - -Device* GetDevice(const platform::Place& place); +}; } // namespace platform } // namespace paddle From f5e73f4c7e526e10ec8efe4afc4487b8f60e743d Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 3 Oct 2017 23:29:03 +0000 Subject: 
[PATCH 13/61] pass simple elementwise_add op --- paddle/framework/executor.cc | 36 ++++++++---------- paddle/framework/executor_test.cc | 63 +++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 94b9b3b350..da387b47ba 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/executor.h" +#include #include #include +#include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/scope.h" @@ -30,41 +32,33 @@ Executor::Executor(const std::vector& places) { void Executor::Run(const ProgramDesc& pdesc, Scope* scope, std::vector* outputs) { - // operators running // TODO(tonyyang-svail): // - only runs the first block // - only runs on the first device + // - test on gpu auto& block = pdesc.blocks(0); auto& device = devices_[0]; + // TODO(tonyyang-svail): + // - runs on a new local scope + // Scope& local_scope = scope->NewScope(); + for (auto& var : block.vars()) { scope->NewVar(var.name()); } - // std::vector ops; for (auto& op_desc : block.ops()) { - auto op = framework::OpRegistry::CreateOp(op_desc); - // op->InferShape(*scope); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(*scope, *device->cpu_device_context); } - // TODO(tonyyang-svail): need to test gpu device - // device_->cpu_device_context->Wait(); - // #ifndef PADDLE_ONLY_CPU - // if (device_->cuda_device_context) { - // device_->cuda_device_context->Wait(); - // } - // #endif - - Scope& local_scope = scope->NewScope(); - local_scope.NewVar(); - for (auto device : devices_) { - device->cpu_device_context->Wait(); -#ifndef PADDLE_ONLY_CPU - if (device->cuda_device_context) { - device->cuda_device_context->Wait(); - } -#endif + // print tensor value + for (auto& var : 
block.vars()) { + std::cout << var.name() << std::endl; + auto v = scope->FindVar(var.name()); + const LoDTensor& t = v->Get(); + for (int i = 0; i < t.numel(); ++i) std::cout << t.data()[i] << " "; + std::cout << std::endl; } } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 11255af808..300de36b87 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -16,16 +16,49 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/framework/attribute.h" -#include #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +#include + USE_OP(elementwise_add); +USE_OP(gaussian_random); using namespace paddle::platform; using namespace paddle::framework; +typedef paddle::framework::BlockDesc proto_block; +typedef paddle::framework::OpDesc proto_op; + +using std::string; + +void add_gaussian_random_op(string var_name, proto_block* block) { + std::vector dim{2, 3}; + + // insert variable + auto a = block->add_vars(); + a->set_name(var_name); + auto a_lt = a->mutable_lod_tensor(); + a_lt->set_data_type(paddle::framework::DataType::FP32); + for (int i : dim) { + a_lt->add_dims(i); + } + + // insert operation + auto op = block->add_ops(); + op->set_type("gaussian_random"); + auto dims = op->add_attrs(); + dims->set_name("dims"); + dims->set_type(paddle::framework::AttrType::INTS); + for (int i : dim) { + dims->add_ints(i); + } + auto Out = op->add_outputs(); + Out->set_parameter("Out"); + Out->add_arguments(var_name); +} + TEST(Executor, Init) { ProgramDesc pdesc; @@ -33,35 +66,25 @@ TEST(Executor, Init) { root_block->set_idx(0); root_block->set_parent_idx(-1); - auto a = root_block->add_vars(); - a->set_name("a"); - auto a_lt = a->mutable_lod_tensor(); - a_lt->set_data_type(paddle::framework::DataType::FP32); - a_lt->add_dims(640); - a_lt->add_dims(640); - - auto b = root_block->add_vars(); - b->set_name("b"); - auto 
b_lt = b->mutable_lod_tensor(); - b_lt->set_data_type(paddle::framework::DataType::FP32); - b_lt->add_dims(640); - b_lt->add_dims(640); + add_gaussian_random_op("a", root_block); + add_gaussian_random_op("b", root_block); auto c = root_block->add_vars(); c->set_name("c"); auto c_lt = c->mutable_lod_tensor(); c_lt->set_data_type(paddle::framework::DataType::FP32); - c_lt->add_dims(640); - c_lt->add_dims(640); - auto op1 = root_block->add_ops(); - op1->set_type("elementwise_add"); - auto X = op1->add_inputs(); + auto op = root_block->add_ops(); + op->set_type("elementwise_add"); + auto X = op->add_inputs(); X->set_parameter("X"); X->add_arguments("a"); - auto Y = op1->add_inputs(); + auto Y = op->add_inputs(); Y->set_parameter("Y"); Y->add_arguments("b"); + auto Out = op->add_outputs(); + Out->set_parameter("Out"); + Out->add_arguments("c"); CPUPlace cpu_place1, cpu_place2; std::vector places; From 395051512dbaaa8baa4570f8bac10da152bb68ad Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 3 Oct 2017 16:56:50 -0700 Subject: [PATCH 14/61] remove device context manager --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/executor.cc | 12 ++-- paddle/framework/executor.h | 3 +- paddle/platform/CMakeLists.txt | 2 - paddle/platform/device_context_manager.cc | 68 ----------------------- paddle/platform/device_context_manager.h | 58 ------------------- 6 files changed, 7 insertions(+), 138 deletions(-) delete mode 100644 paddle/platform/device_context_manager.cc delete mode 100644 paddle/platform/device_context_manager.h diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 898b3a990d..dde96d19e4 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,7 +44,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry 
device_context_manager scope framework_proto ${GLOB_OP_LIB}) +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto ${GLOB_OP_LIB}) if(WITH_GPU) nv_test(executor_test SRCS executor_test.cc DEPS executor) else() diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 717f9bf81a..766945db9b 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -25,14 +25,12 @@ Executor::Executor(const std::vector& places) { device_contexts_.resize(places.size()); for (size_t i = 0; i < places.size(); i++) { if (platform::is_cpu_place(places[i])) { - device_contexts_[i] = platform::DeviceContextManager::Get() - ->GetDeviceContext( - boost::get(places[i])); + device_contexts_[i].reset(new platform::CPUDeviceContext( + boost::get(places[i]))); } else { #ifndef PADDLE_ONLY_CPU - device_contexts_[i] = platform::DeviceContextManager::Get() - ->GetDeviceContext( - boost::get(places[i])); + device_contexts_[i].reset(new platform::CUDADeviceContext( + boost::get(places[i]))); #else PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); #endif @@ -63,7 +61,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, } // TODO(tonyyang-svail): need to test gpu device - for (auto device_context : device_contexts_) { + for (auto& device_context : device_contexts_) { device_context->Wait(); } } diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 795b8ffdab..d5c21c59fe 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/framework/op_info.h" #include "paddle/framework/scope.h" #include "paddle/framework/tensor.h" -#include "paddle/platform/device_context_manager.h" namespace paddle { namespace framework { @@ -30,7 +29,7 @@ class Executor { void Run(const ProgramDesc&, Scope*, std::vector*); private: - std::vector device_contexts_; + std::vector> device_contexts_; }; } // namespace framework diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index b4ddf721dd..daf519b91d 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -23,7 +23,5 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) -cc_library(device_context_manager SRCS device_context_manager.cc DEPS device_context) - nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) diff --git a/paddle/platform/device_context_manager.cc b/paddle/platform/device_context_manager.cc deleted file mode 100644 index 156d317c8a..0000000000 --- a/paddle/platform/device_context_manager.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/platform/device_context_manager.h" - -namespace paddle { -namespace platform { - -DeviceContextManager::DeviceContextManager() { -#ifndef PADDLE_ONLY_CPU - device_count_ = GetDeviceCount(); - cuda_contexts_.reserve(device_count_); - for (int i = 0; i < device_count_; i++) { - cuda_contexts_[i] = nullptr; - } -#endif -} - -template <> -CPUDeviceContext* DeviceContextManager::GetDeviceContext< - CPUPlace, CPUDeviceContext>(const CPUPlace& place) { - if (!cpu_context_) { - cpu_context_ = new CPUDeviceContext(place); - } - return cpu_context_; -} - -#ifndef PADDLE_ONLY_CPU -template <> -CUDADeviceContext* DeviceContextManager::GetDeviceContext< - GPUPlace, CUDADeviceContext>(const GPUPlace& place) { - int gpu_id = place.device; - PADDLE_ENFORCE(gpu_id < device_count_, - "GPU device id must less than device count"); - SetDeviceId(gpu_id); - if (!cuda_contexts_[gpu_id]) { - cuda_contexts_[gpu_id] = new CUDADeviceContext(place); - } - return cuda_contexts_[gpu_id]; -} -#endif - -DeviceContextManager::~DeviceContextManager() { - if (cpu_context_) { - delete cpu_context_; - } -#ifndef PADDLE_ONLY_CPU - for (int i = 0; i < device_count_; i++) { - if (cuda_contexts_[i]) { - delete cuda_contexts_[i]; - } - } -#endif -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/platform/device_context_manager.h b/paddle/platform/device_context_manager.h deleted file mode 100644 index da15808a60..0000000000 --- a/paddle/platform/device_context_manager.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/platform/device_context.h" - -namespace paddle { -namespace platform { - -template -struct Converter; - -template <> -struct Converter { - using DeviceContextType = CPUDeviceContext; -}; - -#ifndef PADDLE_ONLY_CPU -template <> -struct Converter { - using DeviceContextType = CUDADeviceContext; -}; -#endif - -class DeviceContextManager { - public: - DeviceContextManager(); - ~DeviceContextManager(); - - template ::DeviceContextType> - DeviceType* GetDeviceContext(const PlaceType& place); - - static DeviceContextManager* Get() { - static DeviceContextManager inst; - return &inst; - } - - private: - CPUDeviceContext* cpu_context_; -#ifndef PADDLE_ONLY_CPU - int device_count_; - std::vector cuda_contexts_; -#endif -}; -} // namespace platform -} // namespace paddle From fe10e86dd536cc22f65a07a1900bb8b199a8bd5b Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 4 Oct 2017 14:05:37 -0700 Subject: [PATCH 15/61] fix gpu build error --- paddle/framework/executor.cc | 19 ++++++++++++++----- paddle/framework/executor.h | 4 ++-- paddle/platform/gpu_info.cc | 2 +- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 03504952ed..aa36b7438f 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -27,12 +27,12 @@ Executor::Executor(const std::vector& places) { device_contexts_.resize(places.size()); for (size_t i = 0; i < places.size(); i++) { if (platform::is_cpu_place(places[i])) { - device_contexts_[i].reset(new platform::CPUDeviceContext( - 
boost::get(places[i]))); - } else { + device_contexts_[i] = new platform::CPUDeviceContext( + boost::get(places[i])); + } else if (platform::is_gpu_place(places[i])) { #ifndef PADDLE_ONLY_CPU - device_contexts_[i].reset(new platform::CUDADeviceContext( - boost::get(places[i]))); + device_contexts_[i] = new platform::CUDADeviceContext( + boost::get(places[i])); #else PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); #endif @@ -40,6 +40,14 @@ Executor::Executor(const std::vector& places) { } } +Executor::~Executor() { + for (auto& device_context : device_contexts_) { + if (device_context) { + delete device_context; + } + } +} + void Executor::Run(const ProgramDesc& pdesc, Scope* scope, std::vector* outputs) { // TODO(tonyyang-svail): @@ -59,6 +67,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, for (auto& op_desc : block.ops()) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + std::cout << op->DebugString() << std::endl; op->Run(*scope, *device); } diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index d5c21c59fe..fc53be37c3 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -25,11 +25,11 @@ namespace framework { class Executor { public: explicit Executor(const std::vector& places); - ~Executor() {} + ~Executor(); void Run(const ProgramDesc&, Scope*, std::vector*); private: - std::vector> device_contexts_; + std::vector device_contexts_; }; } // namespace framework diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index f487014871..0464797f31 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -43,7 +43,7 @@ int GetCurrentDeviceId() { } void SetDeviceId(int id) { - PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count") + PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE(cudaSetDevice(id), "cudaSetDevice failed in paddle::platform::SetDeviceId"); } From 
3014f6a1135e113cb55a6a2cb771d477502a8b00 Mon Sep 17 00:00:00 2001 From: qijun Date: Wed, 4 Oct 2017 17:36:19 -0700 Subject: [PATCH 16/61] correct macro --- paddle/framework/executor.cc | 2 +- paddle/framework/executor_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index aa36b7438f..7c3cac359e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -30,7 +30,7 @@ Executor::Executor(const std::vector& places) { device_contexts_[i] = new platform::CPUDeviceContext( boost::get(places[i])); } else if (platform::is_gpu_place(places[i])) { -#ifndef PADDLE_ONLY_CPU +#ifdef PADDLE_WITH_GPU device_contexts_[i] = new platform::CUDADeviceContext( boost::get(places[i])); #else diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index f746242a6b..ca7e8ca7d2 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -103,7 +103,7 @@ TEST_F(ExecutorTester, InitCPU) { delete executor; } -#ifndef PADDLE_ONLY_CPU +#ifdef PADDLE_WITH_GPU TEST_F(ExecutorTester, InitGPU) { std::vector places; GPUPlace gpu_place0(0); From 623848afa1f0bb3a69c7e49c4fa0f763a252669d Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 5 Oct 2017 12:11:56 -0700 Subject: [PATCH 17/61] add feed operator --- paddle/framework/scope.cc | 16 ++++++++++ paddle/framework/scope.h | 2 ++ paddle/operators/activation_op.cu | 18 +++++------ paddle/operators/feed_op.cc | 52 +++++++++++++++++++++++++++++++ paddle/operators/feed_op.cu | 18 +++++++++++ paddle/operators/feed_op.h | 40 ++++++++++++++++++++++++ 6 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 paddle/operators/feed_op.cc create mode 100644 paddle/operators/feed_op.cu create mode 100644 paddle/operators/feed_op.h diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 080b4ac621..b04120abf2 100644 --- a/paddle/framework/scope.cc +++ 
b/paddle/framework/scope.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/scope.h" +#include // for unique_ptr +#include // for call_once #include "paddle/string/printf.h" namespace paddle { @@ -62,5 +64,19 @@ void Scope::DropKids() { kids_.clear(); } +std::once_flag feed_variable_flag; + +template +std::unique_ptr make_unique(Args&&... args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +framework::Scope* GetScope() { + static std::unique_ptr g_scope = + make_unique(); + std::call_once(feed_variable_flag, [&]() { g_scope->NewVar("feed_value"); }); + return g_scope.get(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 7047f0d55e..96f3ae875b 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -73,5 +73,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); }; +framework::Scope* GetScope(); + } // namespace framework } // namespace paddle diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 93e9f1c694..44a6aaf9cb 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/activation_op.h" diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc new file mode 100644 index 0000000000..805c3600be --- /dev/null +++ b/paddle/operators/feed_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/feed_op.h" + +namespace paddle { +namespace operators { + +class FeedOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContextBase* ctx) const override { + typedef std::vector FeedInputs; + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null."); + int col = ctx->Attrs().Get("col"); + framework::Variable* g_feed_variable = + framework::GetScope()->FindVar("feed_value"); + FeedInputs tensors = g_feed_variable->Get(); + auto in_dim = tensors[col].dims(); + ctx->SetOutputDim("Y", in_dim); + // need to handle LodTensor later + } +}; + +class FeedOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FeedOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("col", "The col in Global Feed Variable"); + AddOutput("Out", "The output of dropout op."); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(feed, ops::FeedOp, ops::FeedOpMaker); +REGISTER_OP_CPU_KERNEL(feed, ops::FeedKernel); diff --git a/paddle/operators/feed_op.cu b/paddle/operators/feed_op.cu new file mode 100644 index 0000000000..7b6a2ac91e --- /dev/null +++ b/paddle/operators/feed_op.cu @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/feed_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(feed, ops::FeedKernel); diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h new file mode 100644 index 0000000000..57781e205f --- /dev/null +++ b/paddle/operators/feed_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FeedKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + typedef std::vector FeedInputs; + Tensor* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + framework::Variable* g_feed_variable = + framework::GetScope()->FindVar("feed_value"); + int col = ctx.template Attr("col"); + FeedInputs tensors = g_feed_variable->Get(); + out->CopyFrom(tensors[col], ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle From 20725f2d52bd3f6d54df45c710872b9b8ee52e14 Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 5 Oct 2017 14:55:29 -0700 Subject: [PATCH 18/61] add executor feed operator test --- paddle/framework/executor.cc | 20 ++-- paddle/framework/executor.h | 2 +- paddle/framework/executor_test.cc | 155 +++++++++++++++++++++++++++--- paddle/operators/feed_op.cc | 15 ++- 4 files changed, 
167 insertions(+), 25 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 7c3cac359e..aafef12554 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -48,8 +48,7 @@ Executor::~Executor() { } } -void Executor::Run(const ProgramDesc& pdesc, Scope* scope, - std::vector* outputs) { +void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { // TODO(tonyyang-svail): // - only runs the first block // - only runs on the first device @@ -76,14 +75,15 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, device_context->Wait(); } // // print tensor value - // for (auto& var : block.vars()) { - // std::cout << var.name() << std::endl; - // auto v = scope->FindVar(var.name()); - // const LoDTensor& t = v->Get(); - // for (int i = 0; i < t.numel(); ++i) - // std::cout << t.data()[i] << " "; - // std::cout << std::endl; - // } + for (auto& var : block.vars()) { + std::cout << var.name() << std::endl; + auto v = scope->FindVar(var.name()); + const LoDTensor& t = v->Get(); + for (int i = 0; i < t.numel(); ++i) { + std::cout << t.data()[i] << " "; + } + std::cout << std::endl; + } } } // namespace framework diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index fc53be37c3..9e443c8fca 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -26,7 +26,7 @@ class Executor { public: explicit Executor(const std::vector& places); ~Executor(); - void Run(const ProgramDesc&, Scope*, std::vector*); + void Run(const ProgramDesc&, Scope*); private: std::vector device_contexts_; diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index ca7e8ca7d2..0856d1f32e 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -13,17 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/executor.h" +#include // for unique_ptr +#include // for call_once +#include #include "gtest/gtest.h" #include "paddle/framework/attribute.h" - #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" -#include - USE_OP(elementwise_add); USE_OP(gaussian_random); +USE_OP(feed); using std::string; using namespace paddle::platform; @@ -58,7 +59,67 @@ void add_gaussian_random_op(string var_name, proto_block* block) { Out->add_arguments(var_name); } -class ExecutorTester : public ::testing::Test { +void add_feed_op(string var_name, int index, proto_block* block) { + std::vector dim{3}; + + // insert variable + auto a = block->add_vars(); + a->set_name(var_name); + auto a_lt = a->mutable_lod_tensor(); + a_lt->set_data_type(paddle::framework::DataType::FP32); + for (int i : dim) { + a_lt->add_dims(i); + } + + // insert operation + auto op = block->add_ops(); + op->set_type("feed"); + + // set dims attr + auto dims = op->add_attrs(); + dims->set_name("dims"); + dims->set_type(paddle::framework::AttrType::INTS); + for (int i : dim) { + dims->add_ints(i); + } + + // set col attr + auto col = op->add_attrs(); + col->set_name("col"); + col->set_type(paddle::framework::AttrType::INT); + col->set_i(index); + + auto Out = op->add_outputs(); + Out->set_parameter("Out"); + Out->add_arguments(var_name); +} + +std::once_flag set_variable_flag; + +template +void set_feed_variable(const std::vector>& inputs) { + typedef std::vector FeedInputs; + Variable* g_feed_value = GetScope()->FindVar("feed_value"); + FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); + auto size = inputs.size(); + + std::call_once(set_variable_flag, [&]() { + feed_inputs.reserve(size); + for (size_t i = 0; i < size; i++) { + paddle::framework::Tensor tmp; + tmp.mutable_data(make_ddim({static_cast(inputs[i].size())}), + CPUPlace()); + feed_inputs.push_back(tmp); + } + }); + + for (size_t i = 0; i < size; i++) 
{ + memcpy(feed_inputs[i].data(), inputs[i].data(), + inputs[i].size() * sizeof(T)); + } +} + +class ExecutorTesterRandom : public ::testing::Test { public: virtual void SetUp() override { auto root_block = pdesc_.add_blocks(); @@ -84,33 +145,103 @@ class ExecutorTester : public ::testing::Test { auto Out = op->add_outputs(); Out->set_parameter("Out"); Out->add_arguments("c"); + + scope_ = GetScope(); } protected: - std::vector* outputs_{nullptr}; ProgramDesc pdesc_; - Scope scope_; + Scope* scope_; }; -TEST_F(ExecutorTester, InitCPU) { +class ExecutorTesterFeed : public ::testing::Test { + public: + virtual void SetUp() override { + auto root_block = pdesc_.add_blocks(); + root_block->set_idx(0); + root_block->set_parent_idx(-1); + + add_feed_op("a", 0, root_block); + add_feed_op("b", 1, root_block); + + auto c = root_block->add_vars(); + c->set_name("c"); + auto c_lt = c->mutable_lod_tensor(); + c_lt->set_data_type(paddle::framework::DataType::FP32); + + auto op = root_block->add_ops(); + op->set_type("elementwise_add"); + auto X = op->add_inputs(); + X->set_parameter("X"); + X->add_arguments("a"); + auto Y = op->add_inputs(); + Y->set_parameter("Y"); + Y->add_arguments("b"); + auto Out = op->add_outputs(); + Out->set_parameter("Out"); + Out->add_arguments("c"); + + std::vector vec1 = {1.0, 2.0, 3.0}; + std::vector vec2 = {4.0, 5.0, 6.0}; + inputs_.push_back(vec1); + inputs_.push_back(vec2); + } + + protected: + ProgramDesc pdesc_; + std::vector> inputs_; +}; + +TEST_F(ExecutorTesterRandom, CPU) { std::vector places; CPUPlace cpu_place1, cpu_place2; places.push_back(cpu_place1); places.push_back(cpu_place2); Executor* executor = new Executor(places); - executor->Run(pdesc_, &scope_, outputs_); + executor->Run(pdesc_, scope_); + delete executor; +} + +TEST_F(ExecutorTesterFeed, CPU) { + std::vector places; + CPUPlace cpu_place; + places.push_back(cpu_place); + + Executor* executor = new Executor(places); + + // 3 mini-batch + for (int i = 0; i < 3; i++) { + // 
need to set feed variable before Executor::Run + set_feed_variable(inputs_); + executor->Run(pdesc_, GetScope()); + } + delete executor; } #ifdef PADDLE_WITH_GPU -TEST_F(ExecutorTester, InitGPU) { +TEST_F(ExecutorTesterRandom, GPU) { + std::vector places; + GPUPlace gpu_place(0); + places.push_back(gpu_place); + + Executor* executor = new Executor(places); + executor->Run(pdesc_, scope_); + delete executor; +} + +TEST_F(ExecutorTesterFeed, GPU) { std::vector places; - GPUPlace gpu_place0(0); - places.push_back(gpu_place0); + GPUPlace gpu_place(0); + places.push_back(gpu_place); Executor* executor = new Executor(places); - executor->Run(pdesc_, &scope_, outputs_); + + // need to set feed variable before Executor::Run + set_feed_variable(inputs_); + executor->Run(pdesc_, scope_); + delete executor; } #endif diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 805c3600be..5ae882bc8a 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -28,19 +28,30 @@ class FeedOp : public framework::OperatorWithKernel { int col = ctx->Attrs().Get("col"); framework::Variable* g_feed_variable = framework::GetScope()->FindVar("feed_value"); + FeedInputs tensors = g_feed_variable->Get(); + auto in_dim = tensors[col].dims(); - ctx->SetOutputDim("Y", in_dim); + ctx->SetOutputDim("Out", in_dim); // need to handle LodTensor later } + + framework::DataType IndicateDataType( + const framework::ExecutionContext& ctx) const override { + return static_cast(Attr("data_type")); + } }; class FeedOpMaker : public framework::OpProtoAndCheckerMaker { public: FeedOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr("col", "The col in Global Feed Variable"); + AddAttr("data_type", "output data type") + .SetDefault(framework::DataType::FP32); + AddAttr("col", "The col in global feed variable").SetDefault(0); + AddAttr>("dims", "The dimension of random tensor."); AddOutput("Out", 
"The output of dropout op."); + AddComment(R"DOC(Feed data to global feed variable)DOC"); } }; From 45c4dcaabb4cbf140384dcffe3392d2e10b2a6d7 Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 5 Oct 2017 15:54:44 -0700 Subject: [PATCH 19/61] add fetch operator --- paddle/framework/executor.cc | 18 ++++---- paddle/framework/executor_test.cc | 67 ++++++++++++++++++++++++++++++ paddle/framework/scope.cc | 5 ++- paddle/operators/activation_op.cu | 18 ++++---- paddle/operators/feed_op.cc | 6 +-- paddle/operators/fetch_op.cc | 68 +++++++++++++++++++++++++++++++ paddle/operators/fetch_op.cu | 18 ++++++++ paddle/operators/fetch_op.h | 40 ++++++++++++++++++ 8 files changed, 218 insertions(+), 22 deletions(-) create mode 100644 paddle/operators/fetch_op.cc create mode 100644 paddle/operators/fetch_op.cu create mode 100644 paddle/operators/fetch_op.h diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index aafef12554..51ddb7e58e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -75,15 +75,15 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { device_context->Wait(); } // // print tensor value - for (auto& var : block.vars()) { - std::cout << var.name() << std::endl; - auto v = scope->FindVar(var.name()); - const LoDTensor& t = v->Get(); - for (int i = 0; i < t.numel(); ++i) { - std::cout << t.data()[i] << " "; - } - std::cout << std::endl; - } + // for (auto& var : block.vars()) { + // std::cout << var.name() << std::endl; + // auto v = scope->FindVar(var.name()); + // const LoDTensor& t = v->Get(); + // for (int i = 0; i < t.numel(); ++i) { + // std::cout << t.data()[i] << " "; + // } + // std::cout << std::endl; + // } } } // namespace framework diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 0856d1f32e..980f5f579c 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ USE_OP(elementwise_add); USE_OP(gaussian_random); USE_OP(feed); +USE_OP(fetch); using std::string; using namespace paddle::platform; @@ -94,6 +95,41 @@ void add_feed_op(string var_name, int index, proto_block* block) { Out->add_arguments(var_name); } +void add_fetch_op(string var_name, int index, proto_block* block) { + std::vector dim{3}; + + // insert variable + auto a = block->add_vars(); + a->set_name(var_name); + auto a_lt = a->mutable_lod_tensor(); + a_lt->set_data_type(paddle::framework::DataType::FP32); + for (int i : dim) { + a_lt->add_dims(i); + } + + // insert operation + auto op = block->add_ops(); + op->set_type("fetch"); + + // set dims attr + auto dims = op->add_attrs(); + dims->set_name("dims"); + dims->set_type(paddle::framework::AttrType::INTS); + for (int i : dim) { + dims->add_ints(i); + } + + // set col attr + auto col = op->add_attrs(); + col->set_name("col"); + col->set_type(paddle::framework::AttrType::INT); + col->set_i(index); + + auto Out = op->add_inputs(); + Out->set_parameter("Input"); + Out->add_arguments(var_name); +} + std::once_flag set_variable_flag; template @@ -119,6 +155,27 @@ void set_feed_variable(const std::vector>& inputs) { } } +template +std::vector> get_fetch_variable() { + typedef std::vector FetchOutputs; + Variable* g_fetch_value = GetScope()->FindVar("fetch_value"); + FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); + auto size = fetch_outputs.size(); + + std::vector> result; + result.reserve(size); + + for (size_t i = 0; i < size; i++) { + std::vector tmp; + tmp.reserve(fetch_outputs[i].numel()); + memcpy(tmp.data(), fetch_outputs[i].data(), + fetch_outputs[i].numel() * sizeof(T)); + result.push_back(tmp); + } + + return result; +} + class ExecutorTesterRandom : public ::testing::Test { public: virtual void SetUp() override { @@ -181,6 +238,8 @@ class ExecutorTesterFeed : public ::testing::Test { Out->set_parameter("Out"); Out->add_arguments("c"); + add_fetch_op("c", 0, root_block); + std::vector 
vec1 = {1.0, 2.0, 3.0}; std::vector vec2 = {4.0, 5.0, 6.0}; inputs_.push_back(vec1); @@ -213,8 +272,16 @@ TEST_F(ExecutorTesterFeed, CPU) { // 3 mini-batch for (int i = 0; i < 3; i++) { // need to set feed variable before Executor::Run + std::cout << "start mini-batch " << i << std::endl; set_feed_variable(inputs_); executor->Run(pdesc_, GetScope()); + std::vector> result = get_fetch_variable(); + for (auto& vec : result) { + for (auto& num : vec) { + std::cout << num << " "; + } + std::cout << std::endl; + } } delete executor; diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index b04120abf2..2c416570cf 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -74,7 +74,10 @@ std::unique_ptr make_unique(Args&&... args) { framework::Scope* GetScope() { static std::unique_ptr g_scope = make_unique(); - std::call_once(feed_variable_flag, [&]() { g_scope->NewVar("feed_value"); }); + std::call_once(feed_variable_flag, [&]() { + g_scope->NewVar("feed_value"); + g_scope->NewVar("fetch_value"); + }); return g_scope.get(); } diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 44a6aaf9cb..93e9f1c694 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/operators/activation_op.h" diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 5ae882bc8a..a61855cb99 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -49,9 +49,9 @@ class FeedOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("data_type", "output data type") .SetDefault(framework::DataType::FP32); AddAttr("col", "The col in global feed variable").SetDefault(0); - AddAttr>("dims", "The dimension of random tensor."); - AddOutput("Out", "The output of dropout op."); - AddComment(R"DOC(Feed data to global feed variable)DOC"); + AddAttr>("dims", "The dimension of feed tensor."); + AddOutput("Out", "The output of feed op."); + AddComment(R"DOC(Feed data from global feed variable)DOC"); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc new file mode 100644 index 0000000000..68e8d26dbe --- /dev/null +++ b/paddle/operators/fetch_op.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/fetch_op.h" + +namespace paddle { +namespace operators { + +class FetchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContextBase* ctx) const override { + typedef std::vector FetchOutputs; + PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null."); + int col = ctx->Attrs().Get("col"); + framework::Variable* g_fetch_variable = + framework::GetScope()->FindVar("fetch_value"); + + FetchOutputs* tensors = g_fetch_variable->GetMutable(); + if (tensors->size() < col) { + tensors->resize(col); + } + + auto input_dim = ctx->GetInputDim("Input"); + framework::Tensor tmp; + tmp.Resize(input_dim); + (*tensors)[col].Resize(input_dim); + // need to handle LodTensor later + } + + framework::DataType IndicateDataType( + const framework::ExecutionContext& ctx) const override { + return static_cast(Attr("data_type")); + } +}; + +class FetchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FetchOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("data_type", "output data type") + .SetDefault(framework::DataType::FP32); + AddAttr("col", "The col in global fetch variable").SetDefault(0); + AddAttr>("dims", "The dimension of fetch tensor."); + AddInput("Input", "The output of fetch op."); + AddComment(R"DOC(Fetch data to global fetch variable)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(fetch, ops::FetchOp, ops::FetchOpMaker); +REGISTER_OP_CPU_KERNEL(fetch, ops::FetchKernel); diff --git a/paddle/operators/fetch_op.cu b/paddle/operators/fetch_op.cu new file mode 100644 index 0000000000..2e24d3a8ad --- /dev/null +++ b/paddle/operators/fetch_op.cu 
@@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/feed_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(fetch, ops::FetchKernel); diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h new file mode 100644 index 0000000000..95e7986a22 --- /dev/null +++ b/paddle/operators/fetch_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FetchKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + typedef std::vector FetchOutputs; + Tensor* input = ctx.Output("Input"); + int col = ctx.template Attr("col"); + framework::Variable* g_fetch_variable = + framework::GetScope()->FindVar("fetch_value"); + FetchOutputs tensors = g_fetch_variable->Get(); + tensors[col].mutable_data(platform::CPUPlace()); + tensors[col].CopyFrom(*input, platform::CPUPlace()); + } +}; + +} // namespace operators +} // namespace paddle From 48b080db9fcc4f34535c98878112e6633d6d8d7d Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 5 Oct 2017 20:48:04 -0700 Subject: [PATCH 20/61] ensure global BuddyAllocator is initialized before global Scope --- paddle/framework/executor_test.cc | 94 +++++++++++++++++-------------- paddle/operators/feed_op.cc | 4 +- paddle/operators/feed_op.h | 2 +- paddle/operators/fetch_op.cc | 7 ++- paddle/operators/fetch_op.h | 8 +-- 5 files changed, 62 insertions(+), 53 deletions(-) diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 980f5f579c..d3ea18d154 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/executor.h" -#include // for unique_ptr -#include // for call_once #include #include "gtest/gtest.h" #include "paddle/framework/attribute.h" @@ -34,9 +32,8 @@ using namespace paddle::framework; typedef paddle::framework::BlockDesc proto_block; typedef paddle::framework::OpDesc proto_op; -void add_gaussian_random_op(string var_name, proto_block* block) { - std::vector dim{2, 3}; - +void add_gaussian_random_op(string var_name, std::vector& dim, + proto_block* block) { // insert variable auto a = block->add_vars(); a->set_name(var_name); @@ -60,9 +57,8 @@ void add_gaussian_random_op(string var_name, proto_block* block) { Out->add_arguments(var_name); } -void add_feed_op(string var_name, int index, proto_block* block) { - std::vector dim{3}; - +void add_feed_op(string var_name, std::vector& dim, int index, + proto_block* block) { // insert variable auto a = block->add_vars(); a->set_name(var_name); @@ -95,9 +91,8 @@ void add_feed_op(string var_name, int index, proto_block* block) { Out->add_arguments(var_name); } -void add_fetch_op(string var_name, int index, proto_block* block) { - std::vector dim{3}; - +void add_fetch_op(string var_name, std::vector& dim, int index, + proto_block* block) { // insert variable auto a = block->add_vars(); a->set_name(var_name); @@ -138,20 +133,11 @@ void set_feed_variable(const std::vector>& inputs) { Variable* g_feed_value = GetScope()->FindVar("feed_value"); FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); auto size = inputs.size(); - - std::call_once(set_variable_flag, [&]() { - feed_inputs.reserve(size); - for (size_t i = 0; i < size; i++) { - paddle::framework::Tensor tmp; - tmp.mutable_data(make_ddim({static_cast(inputs[i].size())}), - CPUPlace()); - feed_inputs.push_back(tmp); - } - }); - + feed_inputs.resize(size); for (size_t i = 0; i < size; i++) { - memcpy(feed_inputs[i].data(), inputs[i].data(), - inputs[i].size() * sizeof(T)); + T* dst = feed_inputs[i].mutable_data( + 
make_ddim({static_cast(inputs[i].size())}), CPUPlace()); + memcpy(dst, inputs[i].data(), inputs[i].size() * sizeof(T)); } } @@ -160,19 +146,17 @@ std::vector> get_fetch_variable() { typedef std::vector FetchOutputs; Variable* g_fetch_value = GetScope()->FindVar("fetch_value"); FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); - auto size = fetch_outputs.size(); + auto size = fetch_outputs.size(); std::vector> result; result.reserve(size); - for (size_t i = 0; i < size; i++) { std::vector tmp; - tmp.reserve(fetch_outputs[i].numel()); + tmp.resize(fetch_outputs[i].numel()); memcpy(tmp.data(), fetch_outputs[i].data(), fetch_outputs[i].numel() * sizeof(T)); result.push_back(tmp); } - return result; } @@ -183,8 +167,9 @@ class ExecutorTesterRandom : public ::testing::Test { root_block->set_idx(0); root_block->set_parent_idx(-1); - add_gaussian_random_op("a", root_block); - add_gaussian_random_op("b", root_block); + std::vector dim{2, 3}; + add_gaussian_random_op("a", dim, root_block); + add_gaussian_random_op("b", dim, root_block); auto c = root_block->add_vars(); c->set_name("c"); @@ -203,12 +188,11 @@ class ExecutorTesterRandom : public ::testing::Test { Out->set_parameter("Out"); Out->add_arguments("c"); - scope_ = GetScope(); + add_fetch_op("c", dim, 0, root_block); } protected: ProgramDesc pdesc_; - Scope* scope_; }; class ExecutorTesterFeed : public ::testing::Test { @@ -218,8 +202,10 @@ class ExecutorTesterFeed : public ::testing::Test { root_block->set_idx(0); root_block->set_parent_idx(-1); - add_feed_op("a", 0, root_block); - add_feed_op("b", 1, root_block); + std::vector dim{6}; + + add_feed_op("a", dim, 0, root_block); + add_feed_op("b", dim, 1, root_block); auto c = root_block->add_vars(); c->set_name("c"); @@ -238,10 +224,10 @@ class ExecutorTesterFeed : public ::testing::Test { Out->set_parameter("Out"); Out->add_arguments("c"); - add_fetch_op("c", 0, root_block); + add_fetch_op("c", dim, 0, root_block); - std::vector vec1 = {1.0, 2.0, 3.0}; - 
std::vector vec2 = {4.0, 5.0, 6.0}; + std::vector vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + std::vector vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; inputs_.push_back(vec1); inputs_.push_back(vec2); } @@ -253,12 +239,24 @@ class ExecutorTesterFeed : public ::testing::Test { TEST_F(ExecutorTesterRandom, CPU) { std::vector places; - CPUPlace cpu_place1, cpu_place2; - places.push_back(cpu_place1); - places.push_back(cpu_place2); + CPUPlace cpu_place; + places.push_back(cpu_place); + + // We have a global Scope and BuddyAllocator, and we must ensure + // global BuddyAllocator is initialized before global Scope. Thus, + // global Scope will deconstruct before BuddyAllocator. Otherwise, + // "pointer being freed was not allocated" error will appear. + paddle::memory::Used(cpu_place); Executor* executor = new Executor(places); - executor->Run(pdesc_, scope_); + executor->Run(pdesc_, GetScope()); + std::vector> result = get_fetch_variable(); + for (auto& vec : result) { + for (auto& num : vec) { + std::cout << num << " "; + } + std::cout << std::endl; + } delete executor; } @@ -267,6 +265,12 @@ TEST_F(ExecutorTesterFeed, CPU) { CPUPlace cpu_place; places.push_back(cpu_place); + // We have a global Scope and BuddyAllocator, and we must ensure + // global BuddyAllocator is initialized before global Scope. Thus, + // global Scope will deconstruct before BuddyAllocator. Otherwise, + // "pointer being freed was not allocated" error will appear. 
+ paddle::memory::Used(cpu_place); + Executor* executor = new Executor(places); // 3 mini-batch @@ -293,8 +297,10 @@ TEST_F(ExecutorTesterRandom, GPU) { GPUPlace gpu_place(0); places.push_back(gpu_place); + paddle::memory::Used(gpu_place); + Executor* executor = new Executor(places); - executor->Run(pdesc_, scope_); + executor->Run(pdesc_, GetScope()); delete executor; } @@ -303,11 +309,13 @@ TEST_F(ExecutorTesterFeed, GPU) { GPUPlace gpu_place(0); places.push_back(gpu_place); + paddle::memory::Used(gpu_place); + Executor* executor = new Executor(places); // need to set feed variable before Executor::Run set_feed_variable(inputs_); - executor->Run(pdesc_, scope_); + executor->Run(pdesc_, GetScope()); delete executor; } diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index a61855cb99..d40db3ff2e 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -29,11 +29,11 @@ class FeedOp : public framework::OperatorWithKernel { framework::Variable* g_feed_variable = framework::GetScope()->FindVar("feed_value"); - FeedInputs tensors = g_feed_variable->Get(); + const FeedInputs& tensors = g_feed_variable->Get(); auto in_dim = tensors[col].dims(); ctx->SetOutputDim("Out", in_dim); - // need to handle LodTensor later + // TODO(qijun) need to handle LodTensor later } framework::DataType IndicateDataType( diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h index 57781e205f..cf93b6f434 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -31,7 +31,7 @@ class FeedKernel : public framework::OpKernel { framework::Variable* g_feed_variable = framework::GetScope()->FindVar("feed_value"); int col = ctx.template Attr("col"); - FeedInputs tensors = g_feed_variable->Get(); + const FeedInputs& tensors = g_feed_variable->Get(); out->CopyFrom(tensors[col], ctx.GetPlace()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 68e8d26dbe..a885deacc8 100644 --- 
a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -30,15 +30,16 @@ class FetchOp : public framework::OperatorWithKernel { framework::GetScope()->FindVar("fetch_value"); FetchOutputs* tensors = g_fetch_variable->GetMutable(); - if (tensors->size() < col) { - tensors->resize(col); + if (tensors->size() < static_cast(col + 1)) { + tensors->resize(col + 1); } auto input_dim = ctx->GetInputDim("Input"); framework::Tensor tmp; tmp.Resize(input_dim); (*tensors)[col].Resize(input_dim); - // need to handle LodTensor later + + // TODO(qijun) need to handle LodTensor later } framework::DataType IndicateDataType( diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h index 95e7986a22..e8d5e3a9c0 100644 --- a/paddle/operators/fetch_op.h +++ b/paddle/operators/fetch_op.h @@ -26,13 +26,13 @@ class FetchKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { typedef std::vector FetchOutputs; - Tensor* input = ctx.Output("Input"); + const Tensor* input = ctx.Input("Input"); int col = ctx.template Attr("col"); framework::Variable* g_fetch_variable = framework::GetScope()->FindVar("fetch_value"); - FetchOutputs tensors = g_fetch_variable->Get(); - tensors[col].mutable_data(platform::CPUPlace()); - tensors[col].CopyFrom(*input, platform::CPUPlace()); + FetchOutputs* tensors = g_fetch_variable->GetMutable(); + (*tensors)[col].mutable_data(platform::CPUPlace()); + (*tensors)[col].CopyFrom(*input, platform::CPUPlace()); } }; From bbceb72398f23902fae2f011c2b6c7f2a8b7b8e3 Mon Sep 17 00:00:00 2001 From: qijun Date: Thu, 5 Oct 2017 20:54:16 -0700 Subject: [PATCH 21/61] refine some codes --- paddle/framework/executor.cc | 10 ---------- paddle/framework/executor_test.cc | 2 ++ paddle/framework/scope.cc | 9 ++------- paddle/operators/feed_op.cc | 2 +- paddle/operators/fetch_op.cc | 2 +- 5 files changed, 6 insertions(+), 19 deletions(-) diff --git a/paddle/framework/executor.cc 
b/paddle/framework/executor.cc index 51ddb7e58e..ee0df039ac 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -74,16 +74,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { for (auto& device_context : device_contexts_) { device_context->Wait(); } - // // print tensor value - // for (auto& var : block.vars()) { - // std::cout << var.name() << std::endl; - // auto v = scope->FindVar(var.name()); - // const LoDTensor& t = v->Get(); - // for (int i = 0; i < t.numel(); ++i) { - // std::cout << t.data()[i] << " "; - // } - // std::cout << std::endl; - // } } } // namespace framework diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index d3ea18d154..5e327cc893 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -130,6 +130,7 @@ std::once_flag set_variable_flag; template void set_feed_variable(const std::vector>& inputs) { typedef std::vector FeedInputs; + // Tensors in feed value variable will only be in CPUPlace Variable* g_feed_value = GetScope()->FindVar("feed_value"); FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); auto size = inputs.size(); @@ -144,6 +145,7 @@ void set_feed_variable(const std::vector>& inputs) { template std::vector> get_fetch_variable() { typedef std::vector FetchOutputs; + // Tensors in fetch value variable will only be in CPUPlace Variable* g_fetch_value = GetScope()->FindVar("fetch_value"); FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 2c416570cf..b6a9d7fbc2 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -66,15 +66,10 @@ void Scope::DropKids() { std::once_flag feed_variable_flag; -template -std::unique_ptr make_unique(Args&&... 
args) { - return std::unique_ptr(new T(std::forward(args)...)); -} - framework::Scope* GetScope() { - static std::unique_ptr g_scope = - make_unique(); + static std::unique_ptr g_scope{nullptr}; std::call_once(feed_variable_flag, [&]() { + g_scope.reset(new framework::Scope()); g_scope->NewVar("feed_value"); g_scope->NewVar("fetch_value"); }); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index d40db3ff2e..f2c498e2e2 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -33,7 +33,7 @@ class FeedOp : public framework::OperatorWithKernel { auto in_dim = tensors[col].dims(); ctx->SetOutputDim("Out", in_dim); - // TODO(qijun) need to handle LodTensor later + // TODO(qijun): need to handle LodTensor later } framework::DataType IndicateDataType( diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index a885deacc8..f6882cbd03 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -39,7 +39,7 @@ class FetchOp : public framework::OperatorWithKernel { tmp.Resize(input_dim); (*tensors)[col].Resize(input_dim); - // TODO(qijun) need to handle LodTensor later + // TODO(qijun): need to handle LodTensor later } framework::DataType IndicateDataType( From 1f5192a27b968a7980c2eead7b6885e66f09575a Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 6 Oct 2017 11:06:59 -0700 Subject: [PATCH 22/61] fix executor gpu unittest --- paddle/framework/executor.cc | 2 +- paddle/framework/executor_test.cc | 20 +++++++++++++++----- paddle/operators/fetch_op.cu | 2 +- paddle/platform/gpu_info.cc | 3 ++- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index ee0df039ac..c18ba049c8 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -30,7 +30,7 @@ Executor::Executor(const std::vector& places) { device_contexts_[i] = new platform::CPUDeviceContext( boost::get(places[i])); } else if 
(platform::is_gpu_place(places[i])) { -#ifdef PADDLE_WITH_GPU +#ifdef PADDLE_WITH_CUDA device_contexts_[i] = new platform::CUDADeviceContext( boost::get(places[i])); #else diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 5e327cc893..55e209628b 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -293,7 +293,7 @@ TEST_F(ExecutorTesterFeed, CPU) { delete executor; } -#ifdef PADDLE_WITH_GPU +#ifdef PADDLE_WITH_CUDA TEST_F(ExecutorTesterRandom, GPU) { std::vector places; GPUPlace gpu_place(0); @@ -315,10 +315,20 @@ TEST_F(ExecutorTesterFeed, GPU) { Executor* executor = new Executor(places); - // need to set feed variable before Executor::Run - set_feed_variable(inputs_); - executor->Run(pdesc_, GetScope()); - + // 3 mini-batch + for (int i = 0; i < 3; i++) { + // need to set feed variable before Executor::Run + std::cout << "start mini-batch " << i << std::endl; + set_feed_variable(inputs_); + executor->Run(pdesc_, GetScope()); + std::vector> result = get_fetch_variable(); + for (auto& vec : result) { + for (auto& num : vec) { + std::cout << num << " "; + } + std::cout << std::endl; + } + } delete executor; } #endif diff --git a/paddle/operators/fetch_op.cu b/paddle/operators/fetch_op.cu index 2e24d3a8ad..ca39d24c79 100644 --- a/paddle/operators/fetch_op.cu +++ b/paddle/operators/fetch_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/feed_op.h" +#include "paddle/operators/fetch_op.h" namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(fetch, ops::FetchKernel); diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 486dcd623a..aa76bb209d 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -43,7 +43,8 @@ int GetCurrentDeviceId() { } void SetDeviceId(int id) { - PADDLE_ENFORCE(id < GetDeviceCount(), "id must less than GPU count"); + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE(id < GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE(cudaSetDevice(id), "cudaSetDevice failed in paddle::platform::SetDeviceId"); } From ac0e3828b49cab7fd943c293516917cfdc4404ee Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 6 Oct 2017 21:15:29 +0000 Subject: [PATCH 23/61] test text --- paddle/framework/executor.cc | 102 +++++++++++++++++++++++++++++------ paddle/framework/executor.h | 16 ++++++ 2 files changed, 103 insertions(+), 15 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index aafef12554..89b83f82fb 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -13,13 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/framework/executor.h" +#include #include #include +#include #include #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/scope.h" +#include + namespace paddle { namespace framework { @@ -64,26 +68,94 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { scope->NewVar(var.name()); } - for (auto& op_desc : block.ops()) { - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - std::cout << op->DebugString() << std::endl; - op->Run(*scope, *device); + std::vector should_run = Preprocess(pdesc); + PADDLE_ENFORCE(should_run.size() == block.ops_size(), + "should_run.size() != block.ops_size()"); + for (int i = 0; i < should_run.size(); ++i) { + if (should_run[i]) { + auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); + std::cout << op->DebugString() << std::endl; + op->Run(*scope, *device); + } } - // TODO(tonyyang-svail): need to test gpu device - for (auto& device_context : device_contexts_) { - device_context->Wait(); - } // // print tensor value - for (auto& var : block.vars()) { - std::cout << var.name() << std::endl; - auto v = scope->FindVar(var.name()); - const LoDTensor& t = v->Get(); - for (int i = 0; i < t.numel(); ++i) { - std::cout << t.data()[i] << " "; + // for (auto& var : block.vars()) { + // std::cout << var.name() << std::endl; + // auto v = scope->FindVar(var.name()); + // const LoDTensor& t = v->Get(); + // for (int i = 0; i < t.numel(); ++i) { + // std::cout << t.data()[i] << " "; + // } + // std::cout << std::endl; + // } +} + +std::vector Executor::Preprocess(const ProgramDesc& pdesc) { + // TODO(tonyyang-svail): + // - only runs the first block + + auto& block = pdesc.blocks(0); + auto& ops = block.ops(); + + bool expect_feed = true; + for (auto& op_desc : ops) { + PADDLE_ENFORCE(op_desc.type() != "feed" || expect_feed, + "All FeedOps are at the beginning of the ProgramDesc"); + expect_feed = (op_desc.type() == "feed"); + } + + bool 
expect_fetch = true; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + PADDLE_ENFORCE(op_desc.type() != "fetch" || expect_fetch, + "All FetchOps must at the end of the ProgramDesc"); + expect_fetch = (op_desc.type() == "fetch"); + } + + std::set dependent_vars; + std::vector should_run; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + + bool found_dependent_vars = false; + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + found_dependent_vars = true; + } + } + } + + // TODO(tonyyang-svail): add VLOG here for debugging + if (op_desc.type() == "fetch" || found_dependent_vars) { + // erase its output to the dependency graph + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + dependent_vars.erase(argu); + } + } + + // insert its input to the dependency graph + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + dependent_vars.insert(argu); + } + } + + // this op should be executed + should_run.push_back(true); + } else { + // this op should NOT be executed + should_run.push_back(false); } - std::cout << std::endl; } + + // since we are traversing the ProgramDesc in reverse order + // we reverse the should_run vector + std::reverse(should_run.begin(), should_run.end()); + + return should_run; } } // namespace framework diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 9e443c8fca..1d2e6c96de 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -26,8 +26,24 @@ class Executor { public: explicit Executor(const std::vector& places); ~Executor(); + + /* @Brief + * Runtime evaluation of the given ProgramDesc under certain Scope + * + * @param + * ProgramDesc + * Scope + */ void Run(const ProgramDesc&, Scope*); + protected: + /* @Brief + * + * @param + * ProgramDesc + */ + std::vector Preprocess(const 
ProgramDesc& pdesc); + private: std::vector device_contexts_; }; From e8a678e1eecd11fee219a93c6c586ee24663a506 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 6 Oct 2017 22:46:04 +0000 Subject: [PATCH 24/61] fix executor gpu unittest runtime error --- paddle/framework/executor_test.cc | 19 ++++++++++++++++--- paddle/operators/fetch_op.cc | 2 -- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 55e209628b..82f9bd6f2d 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -239,6 +239,7 @@ class ExecutorTesterFeed : public ::testing::Test { std::vector> inputs_; }; +#ifndef PADDLE_WITH_CUDA TEST_F(ExecutorTesterRandom, CPU) { std::vector places; CPUPlace cpu_place; @@ -292,13 +293,19 @@ TEST_F(ExecutorTesterFeed, CPU) { delete executor; } - -#ifdef PADDLE_WITH_CUDA +#else TEST_F(ExecutorTesterRandom, GPU) { std::vector places; GPUPlace gpu_place(0); places.push_back(gpu_place); + // We have a global Scope and BuddyAllocator, and we must ensure + // global BuddyAllocator is initialized before global Scope. Thus, + // global Scope will deconstruct before BuddyAllocator. Otherwise, + // "pointer being freed was not allocated" error will appear. + // If paddle is compiled with GPU, both CPU and GPU BuddyAllocator + // need to be used at first. + paddle::memory::Used(CPUPlace()); paddle::memory::Used(gpu_place); Executor* executor = new Executor(places); @@ -310,7 +317,13 @@ TEST_F(ExecutorTesterFeed, GPU) { std::vector places; GPUPlace gpu_place(0); places.push_back(gpu_place); - + // We have a global Scope and BuddyAllocator, and we must ensure + // global BuddyAllocator is initialized before global Scope. Thus, + // global Scope will deconstruct before BuddyAllocator. Otherwise, + // "pointer being freed was not allocated" error will appear. + // If paddle is compiled with GPU, both CPU and GPU BuddyAllocator + // need to be used at first. 
+ paddle::memory::Used(CPUPlace()); paddle::memory::Used(gpu_place); Executor* executor = new Executor(places); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index f6882cbd03..4b6b3ca85a 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -35,8 +35,6 @@ class FetchOp : public framework::OperatorWithKernel { } auto input_dim = ctx->GetInputDim("Input"); - framework::Tensor tmp; - tmp.Resize(input_dim); (*tensors)[col].Resize(input_dim); // TODO(qijun): need to handle LodTensor later From 91f5d2b9cb23cbb6048180ed791e53659532cf04 Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 6 Oct 2017 16:09:19 -0700 Subject: [PATCH 25/61] follow comments and create local_scope inside executor run method --- paddle/framework/executor.cc | 6 ++---- paddle/framework/executor_test.cc | 12 ++++++------ paddle/framework/scope.cc | 2 +- paddle/framework/scope.h | 2 +- paddle/operators/feed_op.cc | 2 +- paddle/operators/feed_op.h | 6 ++---- paddle/operators/fetch_op.cc | 2 +- paddle/operators/fetch_op.h | 6 ++---- 8 files changed, 16 insertions(+), 22 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index c18ba049c8..7fc407ebc9 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -56,9 +56,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { auto& block = pdesc.blocks(0); auto& device = device_contexts_[0]; - // TODO(tonyyang-svail): - // - runs on a new local scope - // Scope& local_scope = scope->NewScope(); + Scope& local_scope = scope->NewScope(); for (auto& var : block.vars()) { scope->NewVar(var.name()); @@ -67,7 +65,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { for (auto& op_desc : block.ops()) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); std::cout << op->DebugString() << std::endl; - op->Run(*scope, *device); + op->Run(local_scope, *device); } // TODO(tonyyang-svail): need to test gpu device diff --git 
a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 82f9bd6f2d..bf6c1dffc1 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -131,7 +131,7 @@ template void set_feed_variable(const std::vector>& inputs) { typedef std::vector FeedInputs; // Tensors in feed value variable will only be in CPUPlace - Variable* g_feed_value = GetScope()->FindVar("feed_value"); + Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value"); FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); auto size = inputs.size(); feed_inputs.resize(size); @@ -146,7 +146,7 @@ template std::vector> get_fetch_variable() { typedef std::vector FetchOutputs; // Tensors in fetch value variable will only be in CPUPlace - Variable* g_fetch_value = GetScope()->FindVar("fetch_value"); + Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value"); FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); auto size = fetch_outputs.size(); @@ -252,7 +252,7 @@ TEST_F(ExecutorTesterRandom, CPU) { paddle::memory::Used(cpu_place); Executor* executor = new Executor(places); - executor->Run(pdesc_, GetScope()); + executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); for (auto& vec : result) { for (auto& num : vec) { @@ -281,7 +281,7 @@ TEST_F(ExecutorTesterFeed, CPU) { // need to set feed variable before Executor::Run std::cout << "start mini-batch " << i << std::endl; set_feed_variable(inputs_); - executor->Run(pdesc_, GetScope()); + executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); for (auto& vec : result) { for (auto& num : vec) { @@ -309,7 +309,7 @@ TEST_F(ExecutorTesterRandom, GPU) { paddle::memory::Used(gpu_place); Executor* executor = new Executor(places); - executor->Run(pdesc_, GetScope()); + executor->Run(pdesc_, GetGlobalScope()); delete executor; } @@ -333,7 +333,7 @@ TEST_F(ExecutorTesterFeed, GPU) { // need to set feed variable before Executor::Run 
std::cout << "start mini-batch " << i << std::endl; set_feed_variable(inputs_); - executor->Run(pdesc_, GetScope()); + executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); for (auto& vec : result) { for (auto& num : vec) { diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index b6a9d7fbc2..2a0d9bbf33 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -66,7 +66,7 @@ void Scope::DropKids() { std::once_flag feed_variable_flag; -framework::Scope* GetScope() { +framework::Scope* GetGlobalScope() { static std::unique_ptr g_scope{nullptr}; std::call_once(feed_variable_flag, [&]() { g_scope.reset(new framework::Scope()); diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 96f3ae875b..319d291efe 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -73,7 +73,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); }; -framework::Scope* GetScope(); +framework::Scope* GetGlobalScope(); } // namespace framework } // namespace paddle diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index f2c498e2e2..b9e43be966 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -27,7 +27,7 @@ class FeedOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null."); int col = ctx->Attrs().Get("col"); framework::Variable* g_feed_variable = - framework::GetScope()->FindVar("feed_value"); + framework::GetGlobalScope()->FindVar("feed_value"); const FeedInputs& tensors = g_feed_variable->Get(); diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h index cf93b6f434..de8ec6ff61 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -19,17 +19,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - template class FeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { typedef std::vector FeedInputs; - Tensor* out = ctx.Output("Out"); + framework::Tensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); framework::Variable* g_feed_variable = - framework::GetScope()->FindVar("feed_value"); + framework::GetGlobalScope()->FindVar("feed_value"); int col = ctx.template Attr("col"); const FeedInputs& tensors = g_feed_variable->Get(); out->CopyFrom(tensors[col], ctx.GetPlace()); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 4b6b3ca85a..7bde4953cd 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -27,7 +27,7 @@ class FetchOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null."); int col = ctx->Attrs().Get("col"); framework::Variable* g_fetch_variable = - framework::GetScope()->FindVar("fetch_value"); + framework::GetGlobalScope()->FindVar("fetch_value"); FetchOutputs* tensors = g_fetch_variable->GetMutable(); if (tensors->size() < static_cast(col + 1)) { diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h index e8d5e3a9c0..3bec9c9974 100644 --- a/paddle/operators/fetch_op.h +++ b/paddle/operators/fetch_op.h @@ -19,17 +19,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - template class FetchKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { typedef std::vector FetchOutputs; - const Tensor* input = ctx.Input("Input"); + const framework::Tensor* input = ctx.Input("Input"); int col = ctx.template Attr("col"); framework::Variable* g_fetch_variable = - framework::GetScope()->FindVar("fetch_value"); + framework::GetGlobalScope()->FindVar("fetch_value"); FetchOutputs* tensors = g_fetch_variable->GetMutable(); (*tensors)[col].mutable_data(platform::CPUPlace()); (*tensors)[col].CopyFrom(*input, platform::CPUPlace()); From a7d700e0ba35e78cfbe85acf2d0b4cb72d22b10f Mon Sep 17 00:00:00 2001 From: qijun Date: Fri, 6 Oct 2017 16:30:44 -0700 Subject: [PATCH 26/61] revert local scope to TODO --- paddle/framework/executor.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 7fc407ebc9..c18ba049c8 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -56,7 +56,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { auto& block = pdesc.blocks(0); auto& device = device_contexts_[0]; - Scope& local_scope = scope->NewScope(); + // TODO(tonyyang-svail): + // - runs on a new local scope + // Scope& local_scope = scope->NewScope(); for (auto& var : block.vars()) { scope->NewVar(var.name()); @@ -65,7 +67,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { for (auto& op_desc : block.ops()) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); std::cout << op->DebugString() << std::endl; - op->Run(local_scope, *device); + op->Run(*scope, *device); } // TODO(tonyyang-svail): need to test gpu device From b68a95f7f488f8ff94f4793ec294121aa004d02d Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 7 Oct 2017 00:47:19 +0000 Subject: [PATCH 27/61] prune pass simple test --- 
paddle/framework/executor.cc | 2 ++ paddle/framework/executor.h | 4 ++++ paddle/framework/executor_test.cc | 1 + 3 files changed, 7 insertions(+) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 10d22ba01c..4f217277d0 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -145,9 +145,11 @@ std::vector Executor::Preprocess(const ProgramDesc& pdesc) { // this op should be executed should_run.push_back(true); + LOG(INFO) << "Yes " << op_desc.type(); } else { // this op should NOT be executed should_run.push_back(false); + LOG(INFO) << "No " << op_desc.type(); } } diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 1d2e6c96de..75cb5939ff 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -38,9 +38,13 @@ class Executor { protected: /* @Brief + * Pruning the graph * * @param * ProgramDesc + * + * @return + * vector Same size as ops. Indicates whether an op should be run. */ std::vector Preprocess(const ProgramDesc& pdesc); diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index bf6c1dffc1..6a4b2e3d1a 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -226,6 +226,7 @@ class ExecutorTesterFeed : public ::testing::Test { Out->set_parameter("Out"); Out->add_arguments("c"); + add_fetch_op("a", dim, 0, root_block); add_fetch_op("c", dim, 0, root_block); std::vector vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; From 005f15b4957fcce594e1a3b8a27be1c1723ab0fc Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sat, 7 Oct 2017 21:46:00 +0000 Subject: [PATCH 28/61] FeedOp and FetchOp unit test --- paddle/framework/executor.cc | 6 ++-- paddle/framework/executor_test.cc | 56 +++++++++++-------------------- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 4f217277d0..9391e18ded 100644 --- a/paddle/framework/executor.cc +++ 
b/paddle/framework/executor.cc @@ -69,12 +69,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { } std::vector should_run = Preprocess(pdesc); - PADDLE_ENFORCE(should_run.size() == block.ops_size(), - "should_run.size() != block.ops_size()"); - for (int i = 0; i < should_run.size(); ++i) { + PADDLE_ENFORCE(should_run.size() == block.ops_size()); + for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); - std::cout << op->DebugString() << std::endl; op->Run(*scope, *device); } } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 6a4b2e3d1a..b198fa143c 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -127,10 +127,11 @@ void add_fetch_op(string var_name, std::vector& dim, int index, std::once_flag set_variable_flag; +// Tensors in feed value variable will only be in CPUPlace +// So we can memcpy the data from vector to feed_value template void set_feed_variable(const std::vector>& inputs) { typedef std::vector FeedInputs; - // Tensors in feed value variable will only be in CPUPlace Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value"); FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); auto size = inputs.size(); @@ -142,10 +143,11 @@ void set_feed_variable(const std::vector>& inputs) { } } +// Tensors in fetch value variable will only be in CPUPlace +// So we can memcpy the data from fetch_value to vector template std::vector> get_fetch_variable() { typedef std::vector FetchOutputs; - // Tensors in fetch value variable will only be in CPUPlace Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value"); FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); @@ -159,6 +161,7 @@ std::vector> get_fetch_variable() { fetch_outputs[i].numel() * sizeof(T)); result.push_back(tmp); } + return result; } @@ -197,7 +200,7 @@ class ExecutorTesterRandom : public ::testing::Test { 
ProgramDesc pdesc_; }; -class ExecutorTesterFeed : public ::testing::Test { +class ExecutorTesterFeedAndFetch : public ::testing::Test { public: virtual void SetUp() override { auto root_block = pdesc_.add_blocks(); @@ -208,26 +211,8 @@ class ExecutorTesterFeed : public ::testing::Test { add_feed_op("a", dim, 0, root_block); add_feed_op("b", dim, 1, root_block); - - auto c = root_block->add_vars(); - c->set_name("c"); - auto c_lt = c->mutable_lod_tensor(); - c_lt->set_data_type(paddle::framework::DataType::FP32); - - auto op = root_block->add_ops(); - op->set_type("elementwise_add"); - auto X = op->add_inputs(); - X->set_parameter("X"); - X->add_arguments("a"); - auto Y = op->add_inputs(); - Y->set_parameter("Y"); - Y->add_arguments("b"); - auto Out = op->add_outputs(); - Out->set_parameter("Out"); - Out->add_arguments("c"); - add_fetch_op("a", dim, 0, root_block); - add_fetch_op("c", dim, 0, root_block); + add_fetch_op("b", dim, 1, root_block); std::vector vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; std::vector vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; @@ -255,6 +240,7 @@ TEST_F(ExecutorTesterRandom, CPU) { Executor* executor = new Executor(places); executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); + for (auto& vec : result) { for (auto& num : vec) { std::cout << num << " "; @@ -264,7 +250,7 @@ TEST_F(ExecutorTesterRandom, CPU) { delete executor; } -TEST_F(ExecutorTesterFeed, CPU) { +TEST_F(ExecutorTesterFeedAndFetch, CPU) { std::vector places; CPUPlace cpu_place; places.push_back(cpu_place); @@ -279,16 +265,15 @@ TEST_F(ExecutorTesterFeed, CPU) { // 3 mini-batch for (int i = 0; i < 3; i++) { - // need to set feed variable before Executor::Run - std::cout << "start mini-batch " << i << std::endl; set_feed_variable(inputs_); executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); - for (auto& vec : result) { - for (auto& num : vec) { - std::cout << num << " "; + PADDLE_ENFORCE_EQ(result.size(), 
inputs_.size()); + for (size_t i = 0; i < result.size(); ++i) { + PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { + PADDLE_ENFORCE_EQ(result[i][j], inputs_[i][j]); } - std::cout << std::endl; } } @@ -314,7 +299,7 @@ TEST_F(ExecutorTesterRandom, GPU) { delete executor; } -TEST_F(ExecutorTesterFeed, GPU) { +TEST_F(ExecutorTesterFeedAndFetch, GPU) { std::vector places; GPUPlace gpu_place(0); places.push_back(gpu_place); @@ -331,16 +316,15 @@ TEST_F(ExecutorTesterFeed, GPU) { // 3 mini-batch for (int i = 0; i < 3; i++) { - // need to set feed variable before Executor::Run - std::cout << "start mini-batch " << i << std::endl; set_feed_variable(inputs_); executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); - for (auto& vec : result) { - for (auto& num : vec) { - std::cout << num << " "; + PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); + for (size_t i = 0; i < result.size(); ++i) { + PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size()); + for (size_t j = 0; j < result[i].size(); ++j) { + PADDLE_ENFORCE_EQ(result[i][j], inputs_[i][j]); } - std::cout << std::endl; } } delete executor; From a67e8ea3eb8475a17f6285e5cfbe1bf231e0bd28 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 8 Oct 2017 04:49:10 +0000 Subject: [PATCH 29/61] Add AddOp --- paddle/framework/executor_test.cc | 147 +++++++++++++++++++++++++----- 1 file changed, 125 insertions(+), 22 deletions(-) diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index b198fa143c..cf1752f6d8 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -16,7 +16,9 @@ limitations under the License. 
*/ #include #include "gtest/gtest.h" #include "paddle/framework/attribute.h" +#include "paddle/framework/block_desc.h" #include "paddle/framework/grad_op_builder.h" +#include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -24,6 +26,7 @@ USE_OP(elementwise_add); USE_OP(gaussian_random); USE_OP(feed); USE_OP(fetch); +USE_OP(mul); using std::string; using namespace paddle::platform; @@ -32,7 +35,71 @@ using namespace paddle::framework; typedef paddle::framework::BlockDesc proto_block; typedef paddle::framework::OpDesc proto_op; -void add_gaussian_random_op(string var_name, std::vector& dim, +struct SetAttrDescVisitor : public boost::static_visitor { + explicit SetAttrDescVisitor(OpDesc::Attr* attr) : attr_(attr) {} + mutable OpDesc::Attr* attr_; + void operator()(int v) const { attr_->set_i(v); } + void operator()(float v) const { attr_->set_f(v); } + void operator()(const std::string& v) const { attr_->set_s(v); } + void operator()(bool b) const { attr_->set_b(b); } + + void operator()(const std::vector& v) const { + VectorToRepeated(v, attr_->mutable_ints()); + } + void operator()(const std::vector& v) const { + VectorToRepeated(v, attr_->mutable_floats()); + } + void operator()(const std::vector& v) const { + VectorToRepeated(v, attr_->mutable_strings()); + } + void operator()(const std::vector& v) const { + VectorToRepeated(v, attr_->mutable_bools()); + } + void operator()(BlockDesc* desc) const { attr_->set_block_idx(desc->idx()); } + void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } +}; + +void AddOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs, + proto_block* block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->add_vars(); + var->set_name(v); + auto var_lt = var->mutable_lod_tensor(); + var_lt->set_data_type(paddle::framework::DataType::FP32); + } + } + + // 
insert op + auto op = block->add_ops(); + op->set_type(type); + for (auto kv : inputs) { + auto X = op->add_inputs(); + X->set_parameter(kv.first); + for (auto argu : kv.second) { + X->add_arguments(argu); + } + } + for (auto kv : outputs) { + auto X = op->add_outputs(); + X->set_parameter(kv.first); + for (auto argu : kv.second) { + X->add_arguments(argu); + } + } + for (auto& attr : attrs) { + auto* attr_desc = op->add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type( + static_cast(attr.second.which() - 1)); + SetAttrDescVisitor visitor(attr_desc); + boost::apply_visitor(visitor, attr.second); + } +} + +void add_gaussian_random_op(string var_name, std::vector dim, proto_block* block) { // insert variable auto a = block->add_vars(); @@ -91,7 +158,7 @@ void add_feed_op(string var_name, std::vector& dim, int index, Out->add_arguments(var_name); } -void add_fetch_op(string var_name, std::vector& dim, int index, +void add_fetch_op(string var_name, std::vector dim, int index, proto_block* block) { // insert variable auto a = block->add_vars(); @@ -125,6 +192,28 @@ void add_fetch_op(string var_name, std::vector& dim, int index, Out->add_arguments(var_name); } +void add_mul_op(string X_str, string Y_str, string Out_str, + proto_block* block) { + // insert variable + auto a = block->add_vars(); + a->set_name(Out_str); + auto a_lt = a->mutable_lod_tensor(); + a_lt->set_data_type(paddle::framework::DataType::FP32); + + // insert op + auto op = block->add_ops(); + op->set_type("mul"); + auto X = op->add_inputs(); + X->set_parameter("X"); + X->add_arguments(X_str); + auto Y = op->add_inputs(); + Y->set_parameter("Y"); + Y->add_arguments(Y_str); + auto Out = op->add_outputs(); + Out->set_parameter("Out"); + Out->add_arguments(Out_str); +} + std::once_flag set_variable_flag; // Tensors in feed value variable will only be in CPUPlace @@ -168,36 +257,37 @@ std::vector> get_fetch_variable() { class ExecutorTesterRandom : public ::testing::Test { public: virtual 
void SetUp() override { + int input_dim = 5, batch_size = 2, embed_dim = 5; + + // init pdesc + auto init_root_block = init_pdesc_.add_blocks(); + init_root_block->set_idx(0); + init_root_block->set_parent_idx(-1); + AddOp("gaussian_random", {}, {{"Out", {"w1"}}}, + {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); + AddOp("gaussian_random", {}, {{"Out", {"w2"}}}, + {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); + AddOp("fetch", {{"Input", {"w1"}}}, {}, + {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); + AddOp("fetch", {{"Input", {"w2"}}}, {}, + {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); + + // run pdesc auto root_block = pdesc_.add_blocks(); root_block->set_idx(0); root_block->set_parent_idx(-1); - std::vector dim{2, 3}; - add_gaussian_random_op("a", dim, root_block); - add_gaussian_random_op("b", dim, root_block); + add_gaussian_random_op("a", {batch_size, input_dim}, root_block); - auto c = root_block->add_vars(); - c->set_name("c"); - auto c_lt = c->mutable_lod_tensor(); - c_lt->set_data_type(paddle::framework::DataType::FP32); + add_mul_op("a", "w1", "b", root_block); + add_mul_op("b", "w2", "a_out", root_block); - auto op = root_block->add_ops(); - op->set_type("elementwise_add"); - auto X = op->add_inputs(); - X->set_parameter("X"); - X->add_arguments("a"); - auto Y = op->add_inputs(); - Y->set_parameter("Y"); - Y->add_arguments("b"); - auto Out = op->add_outputs(); - Out->set_parameter("Out"); - Out->add_arguments("c"); - - add_fetch_op("c", dim, 0, root_block); + add_fetch_op("a_out", {input_dim, batch_size}, 0, root_block); } protected: ProgramDesc pdesc_; + ProgramDesc init_pdesc_; }; class ExecutorTesterFeedAndFetch : public ::testing::Test { @@ -238,6 +328,7 @@ TEST_F(ExecutorTesterRandom, CPU) { paddle::memory::Used(cpu_place); Executor* executor = new Executor(places); + executor->Run(init_pdesc_, GetGlobalScope()); executor->Run(pdesc_, GetGlobalScope()); std::vector> result = 
get_fetch_variable(); @@ -295,7 +386,19 @@ TEST_F(ExecutorTesterRandom, GPU) { paddle::memory::Used(gpu_place); Executor* executor = new Executor(places); + + LOG(INFO) << "Run Init"; + executor->Run(init_pdesc_, GetGlobalScope()); + LOG(INFO) << "Run"; executor->Run(pdesc_, GetGlobalScope()); + std::vector> result = get_fetch_variable(); + + for (auto& vec : result) { + for (auto& num : vec) { + std::cout << num << " "; + } + std::cout << std::endl; + } delete executor; } From c83ea1cdca1b751b93a1c63ea8fa58706131951b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 8 Oct 2017 05:11:40 +0000 Subject: [PATCH 30/61] remove hardcode add_XX_op --- paddle/framework/executor_test.cc | 147 +++++------------------------- 1 file changed, 21 insertions(+), 126 deletions(-) diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index cf1752f6d8..e8ea09b77d 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -99,121 +99,6 @@ void AddOp(const std::string& type, const VariableNameMap& inputs, } } -void add_gaussian_random_op(string var_name, std::vector dim, - proto_block* block) { - // insert variable - auto a = block->add_vars(); - a->set_name(var_name); - auto a_lt = a->mutable_lod_tensor(); - a_lt->set_data_type(paddle::framework::DataType::FP32); - for (int i : dim) { - a_lt->add_dims(i); - } - - // insert operation - auto op = block->add_ops(); - op->set_type("gaussian_random"); - auto dims = op->add_attrs(); - dims->set_name("dims"); - dims->set_type(paddle::framework::AttrType::INTS); - for (int i : dim) { - dims->add_ints(i); - } - auto Out = op->add_outputs(); - Out->set_parameter("Out"); - Out->add_arguments(var_name); -} - -void add_feed_op(string var_name, std::vector& dim, int index, - proto_block* block) { - // insert variable - auto a = block->add_vars(); - a->set_name(var_name); - auto a_lt = a->mutable_lod_tensor(); - a_lt->set_data_type(paddle::framework::DataType::FP32); - for (int i : dim) 
{ - a_lt->add_dims(i); - } - - // insert operation - auto op = block->add_ops(); - op->set_type("feed"); - - // set dims attr - auto dims = op->add_attrs(); - dims->set_name("dims"); - dims->set_type(paddle::framework::AttrType::INTS); - for (int i : dim) { - dims->add_ints(i); - } - - // set col attr - auto col = op->add_attrs(); - col->set_name("col"); - col->set_type(paddle::framework::AttrType::INT); - col->set_i(index); - - auto Out = op->add_outputs(); - Out->set_parameter("Out"); - Out->add_arguments(var_name); -} - -void add_fetch_op(string var_name, std::vector dim, int index, - proto_block* block) { - // insert variable - auto a = block->add_vars(); - a->set_name(var_name); - auto a_lt = a->mutable_lod_tensor(); - a_lt->set_data_type(paddle::framework::DataType::FP32); - for (int i : dim) { - a_lt->add_dims(i); - } - - // insert operation - auto op = block->add_ops(); - op->set_type("fetch"); - - // set dims attr - auto dims = op->add_attrs(); - dims->set_name("dims"); - dims->set_type(paddle::framework::AttrType::INTS); - for (int i : dim) { - dims->add_ints(i); - } - - // set col attr - auto col = op->add_attrs(); - col->set_name("col"); - col->set_type(paddle::framework::AttrType::INT); - col->set_i(index); - - auto Out = op->add_inputs(); - Out->set_parameter("Input"); - Out->add_arguments(var_name); -} - -void add_mul_op(string X_str, string Y_str, string Out_str, - proto_block* block) { - // insert variable - auto a = block->add_vars(); - a->set_name(Out_str); - auto a_lt = a->mutable_lod_tensor(); - a_lt->set_data_type(paddle::framework::DataType::FP32); - - // insert op - auto op = block->add_ops(); - op->set_type("mul"); - auto X = op->add_inputs(); - X->set_parameter("X"); - X->add_arguments(X_str); - auto Y = op->add_inputs(); - Y->set_parameter("Y"); - Y->add_arguments(Y_str); - auto Out = op->add_outputs(); - Out->set_parameter("Out"); - Out->add_arguments(Out_str); -} - std::once_flag set_variable_flag; // Tensors in feed value variable will 
only be in CPUPlace @@ -268,21 +153,27 @@ class ExecutorTesterRandom : public ::testing::Test { AddOp("gaussian_random", {}, {{"Out", {"w2"}}}, {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); AddOp("fetch", {{"Input", {"w1"}}}, {}, - {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); + {{"dims", std::vector{input_dim, embed_dim}}, {"col", 0}}, + init_root_block); AddOp("fetch", {{"Input", {"w2"}}}, {}, - {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); + {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, + init_root_block); // run pdesc auto root_block = pdesc_.add_blocks(); root_block->set_idx(0); root_block->set_parent_idx(-1); - add_gaussian_random_op("a", {batch_size, input_dim}, root_block); - - add_mul_op("a", "w1", "b", root_block); - add_mul_op("b", "w2", "a_out", root_block); + AddOp("gaussian_random", {}, {{"Out", {"a"}}}, + {{"dims", std::vector{batch_size, input_dim}}}, root_block); + AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {}, + root_block); + AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {}, + root_block); - add_fetch_op("a_out", {input_dim, batch_size}, 0, root_block); + AddOp("fetch", {{"Input", {"a_out"}}}, {}, + {{"dims", std::vector{input_dim, batch_size}}, {"col", 1}}, + root_block); } protected: @@ -299,10 +190,14 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test { std::vector dim{6}; - add_feed_op("a", dim, 0, root_block); - add_feed_op("b", dim, 1, root_block); - add_fetch_op("a", dim, 0, root_block); - add_fetch_op("b", dim, 1, root_block); + AddOp("feed", {}, {{"Out", {"a"}}}, {{"dims", dim}, {"col", 0}}, + root_block); + AddOp("feed", {}, {{"Out", {"b"}}}, {{"dims", dim}, {"col", 1}}, + root_block); + AddOp("fetch", {{"Input", {"a"}}}, {}, {{"dims", dim}, {"col", 0}}, + root_block); + AddOp("fetch", {{"Input", {"b"}}}, {}, {{"dims", dim}, {"col", 1}}, + root_block); std::vector vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; std::vector vec2 
= {4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; From 6e7666f199ab1849e37c4f2e1e2570316dcf5c04 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Sun, 8 Oct 2017 05:36:19 +0000 Subject: [PATCH 31/61] before backward --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/executor_test.cc | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index d8812d7743..7dc9d5c804 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -44,7 +44,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto ${GLOB_OP_LIB}) +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB}) if(WITH_GPU) nv_test(executor_test SRCS executor_test.cc DEPS executor) else() diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index e8ea09b77d..7ce472ed2f 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "gtest/gtest.h" #include "paddle/framework/attribute.h" +#include "paddle/framework/backward.h" #include "paddle/framework/block_desc.h" #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_desc.h" @@ -27,6 +28,7 @@ USE_OP(gaussian_random); USE_OP(feed); USE_OP(fetch); USE_OP(mul); +USE_OP(squared_l2_distance); using std::string; using namespace paddle::platform; @@ -170,10 +172,16 @@ class ExecutorTesterRandom : public ::testing::Test { root_block); AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {}, root_block); - - AddOp("fetch", {{"Input", {"a_out"}}}, {}, - {{"dims", std::vector{input_dim, batch_size}}, {"col", 1}}, + AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}}, + {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, root_block); + + AppendBackward(pdesc_, {}); + // AddOp("fetch", {{"Input", {"sub_result"}}}, {}, + // {{"dims", std::vector{input_dim, batch_size}}, {"col", 0}}, + // root_block); + AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, + {{"dims", std::vector{batch_size}}, {"col", 1}}, root_block); } protected: From ba791f7b3f0b4f2b43f4391f7ccc10cdf7b0d06c Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 27 Sep 2017 23:16:35 +0800 Subject: [PATCH 32/61] Add vol2col functor and unit test --- paddle/operators/math/CMakeLists.txt | 7 +- paddle/operators/math/vol2col.cc | 155 +++++++++++++++++++ paddle/operators/math/vol2col.cu | 204 ++++++++++++++++++++++++++ paddle/operators/math/vol2col.h | 78 ++++++++++ paddle/operators/math/vol2col_test.cc | 156 ++++++++++++++++++++ 5 files changed, 597 insertions(+), 3 deletions(-) create mode 100644 paddle/operators/math/vol2col.cc create mode 100644 paddle/operators/math/vol2col.cu create mode 100644 paddle/operators/math/vol2col.h create mode 100644 paddle/operators/math/vol2col_test.cc diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 91ae3d49f1..176d357f2e 100644 --- 
a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,16 +1,17 @@ if(WITH_GPU) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc - im2col.cu DEPS cblas device_context operator) + im2col.cu vol2col.cc vol2col.cu DEPS cblas device_context operator) nv_library(softmax_function SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu DEPS operator) else() - cc_library(math_function SRCS math_function.cc im2col.cc - DEPS cblas device_context operator) + cc_library(math_function SRCS math_function.cc im2col.cc vol2col.cc + DEPS cblas device_context operator) cc_library(softmax_function SRCS softmax.cc DEPS operator) cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator) endif() nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) +cc_test(vol2col_test SRCS vol2col_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc new file mode 100644 index 0000000000..5bad2e8073 --- /dev/null +++ b/paddle/operators/math/vol2col.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/vol2col.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * vol = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& vol, framework::Tensor& col, + int stride_depth, int stride_height, int stride_width, + int padding_depth, int padding_height, + int padding_width) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + const T* vol_data = vol.data(); + T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * stride_depth - padding_depth + d_offset; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * stride_height - padding_height + h_offset; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * stride_width - padding_width + w_offset; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + if (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) { + col_data[col_idx] = T(0); + 
} else { + int vol_idx = + ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + col_data[col_idx] = vol_data[vol_idx]; + } + } + } + } + } + } +}; + +/* + * vol = [input_channels,input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::DeviceContext& context, + framework::Tensor& vol, const framework::Tensor& col, + int stride_depth, int stride_height, int stride_width, + int padding_depth, int padding_height, + int padding_width) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + T* vol_data = vol.data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * stride_depth - padding_depth + d_offset; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * stride_height - padding_height + h_offset; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * stride_width - padding_width + w_offset; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx = + ((cIm * input_depth + d_pad) * 
input_height + h_pad) * + input_width + + w_pad; + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu new file mode 100644 index 0000000000..27b11fb237 --- /dev/null +++ b/paddle/operators/math/vol2col.cu @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/vol2col.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void vol2col(int num_kernels, const T* data_vol, int depth, + int height, int width, int filter_depth, + int filter_height, int filter_width, int stride_depth, + int stride_height, int stride_width, int padding_depth, + int padding_height, int padding_width, int output_detph, + int output_height, int output_width, T* data_col) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + int w_out = index % output_width; + int h_out = (index / output_width) % output_height; + int d_out = (index / output_width / output_height) % output_detph; + int channel_in = index / output_width / output_height / output_detph; + int channel_out = channel_in * filter_depth * filter_height * filter_width; + int w_in = w_out * stride_width - padding_width; + int h_in = h_out * stride_height - padding_height; + int d_in = d_out * stride_depth - padding_depth; + + data_col += ((channel_out * output_detph + d_out) * output_height + h_out) * + output_width + + w_out; + data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in; + for (int k = 0; k < filter_depth; ++k) { + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + int d = d_in + k; + int h = h_in + i; + int w = w_in + j; + *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && + w < width) + ? 
data_vol[(k * height + i) * width + j] + : 0; + data_col += output_detph * output_height * output_width; + } + } + } + } +} + +/* + * im = [input_channels,intpu_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& vol, framework::Tensor& col, + int stride_depth, int stride_height, int stride_width, + int padding_depth, int padding_height, + int padding_width) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + + int num_outputs = + input_channels * output_depth * output_height * output_width; + + const int threads = 1024; + const int blocks = (num_outputs + 1024 - 1) / 1024; + vol2col<<(context) + .stream()>>>( + num_outputs, vol.data(), input_depth, input_height, input_width, + filter_depth, filter_height, filter_width, stride_depth, stride_height, + stride_width, padding_depth, padding_height, padding_width, + output_depth, output_height, output_width, col.data()); + } +}; + +template +__global__ void col2vol(int num_kernels, const T* data_col, int depth, + int height, int width, int filter_depth, + int filter_height, int filter_width, int stride_depth, + int stride_height, int stride_width, int padding_depth, + int padding_height, int padding_width, int output_detph, + int output_height, int output_width, T* data_vol) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) 
{ + T src_val = 0; + int w = index % width + padding_width; + int h = (index / width) % height + padding_height; + int d = (index / width / height) % depth + padding_depth; + int c = index / width / height / depth; + // compute the start and end of the output + int w_col_start = + (w < filter_width) ? 0 : (w - filter_width) / stride_width + 1; + int w_col_end = min(w / stride_width + 1, output_width); + int h_col_start = + (h < filter_height) ? 0 : (h - filter_height) / stride_height + 1; + int h_col_end = min(h / stride_height + 1, output_height); + int d_col_start = + (d < filter_depth) ? 0 : (d - filter_depth) / stride_depth + 1; + int d_col_end = min(d / stride_depth + 1, output_detph); + + int offset = (c * filter_depth * filter_height * filter_width + + d * filter_width * filter_height + h * filter_width + w) * + output_detph * output_height * output_width; + + int coeff_d_col = + (1 - stride_depth * filter_width * filter_height * output_detph) * + output_height * output_width; + int coeff_h_col = + (1 - stride_height * filter_width * output_detph * output_height) * + output_width; + int coeff_w_col = + (1 - stride_width * output_detph * output_height * output_width); + + for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + src_val += data_col[offset + d_col * coeff_d_col + + h_col * coeff_h_col + w_col * coeff_w_col]; + } + } + } + data_vol[index] = src_val; + } +} + +/* + * im = [input_channels, input_depth, input_height, input_width] + * col = + * [input_channels, filter_depth, filter_height, filter_width, + * output_depth, output_height, output_width] + */ +template +class Col2VolFunctor { + public: + void operator()(const platform::DeviceContext& context, + framework::Tensor& vol, const framework::Tensor& col, + int stride_depth, int stride_height, int stride_width, + int padding_depth, int padding_height, + int 
padding_width) const { + PADDLE_ENFORCE(vol.dims().size() == 4); + PADDLE_ENFORCE(col.dims().size() == 7); + + int input_channels = vol.dims()[0]; + int input_depth = vol.dims()[1]; + int input_height = vol.dims()[2]; + int input_width = vol.dims()[3]; + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + + int num_kernels = input_channels * input_depth * input_height * input_width; + + const int threads = 1024; + const int blocks = (num_kernels + 1024 - 1) / 1024; + + col2vol<<(context) + .stream()>>>( + num_kernels, col.data(), input_depth, input_height, input_width, + filter_depth, filter_height, filter_width, stride_depth, stride_height, + stride_width, padding_depth, padding_height, padding_width, + output_depth, output_height, output_width, vol.data()); + } +}; + +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h new file mode 100644 index 0000000000..f022365a16 --- /dev/null +++ b/paddle/operators/math/vol2col.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { +/* + * \brief Converts the feature data of four dimensions(CDHW) into a colData of + * seven dimensions in the Vol2ColFunctor calculation, + * And in the Col2VolFunctor calculation, it is reversed. + * + * \param volData Vol data. + * \param volShape The shape of volData, + * [input_channels, input_depth, input_height, input_width]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * The shape of colData is: + * [input_channels, filter_depth, filter_height, filter_width, output_depth, + * output_height, output_width] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. + * The shape of convolution matrix is [height, width], where the height is equal + * input_channels * filter_depth * filter_height * filter_width, and the width + * is equal output_depth * output_height * output_width. + * + * Reshape: + * shape of colData shape of convolution matrix + * [input_channels, + * filter_depth, + * filter_height, + * filter_width, ======> [height, width] + * output_depth, + * output_height, + * output_width] + * + * \note The caller needs to ensure that volShape.inputChannels is equal to + * colShape.inputChannels. 
+ */ +template +class Vol2ColFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& vol, framework::Tensor& col, + int stride_depth, int stride_height, int stride_width, + int padding_depth, int padding_height, + int padding_width) const; +}; + +template +class Col2VolFunctor { + public: + void operator()(const platform::DeviceContext& context, + framework::Tensor& vol, const framework::Tensor& col, + int stride_depth, int stride_height, int stride_width, + int padding_depth, int padding_height, + int padding_width) const; +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc new file mode 100644 index 0000000000..107a94511f --- /dev/null +++ b/paddle/operators/math/vol2col_test.cc @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/vol2col.h" +#include +#include + +template +void testVol2col() { + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor input; + paddle::framework::Tensor output_cfo; + paddle::framework::Tensor output_ocf; + paddle::framework::Tensor output_tmp; + + auto* place = new Place(); + paddle::platform::DeviceContext* context; + if (paddle::platform::is_cpu_place(*place)) { + context = + new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); + } else { +#ifndef PADDLE_ONLY_CPU + context = + new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); +#else + PADDLE_THROW("no GPU support"); +#endif // PADDLE_ONLY_CPU + } + + /** + * input = [[0, 1, 2, + * 3, 4, 5] + * [6, 7, 8, + * 9, 10, 11]] + * + * output_cfo = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] + * + * col2vol = [[0, 2, 2, + * 3, 8, 5] + * [6, 14, 8, + * 9, 20, 11]] + * + */ + int input_depth = 2; + int input_height = 2; + int input_width = 3; + int filter_size = 2; + int stride = 1; + int padding = 0; + int output_depth = (input_depth - filter_size + 2 * padding) / stride + 1; + int output_height = (input_height - filter_size + 2 * padding) / stride + 1; + int output_width = (input_width - filter_size + 2 * padding) / stride + 1; + + // Vol2Col test + float* input_ptr = + input_tmp.mutable_data({1, input_depth, input_height, input_width}, + paddle::platform::CPUPlace()); + float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + memcpy(input_ptr, arr, 12 * sizeof(float)); + + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + input.CopyFrom(input_tmp, *place); + } + output_cfo.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); + + paddle::operators::math::Vol2ColFunctor vol2col; + vol2col(*context, input, output_cfo, stride, stride, stride, padding, padding, + padding); + + float* out_cfo_ptr; + if 
(paddle::platform::is_cpu_place(*place)) { + out_cfo_ptr = output_cfo.data(); + } else { + output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace()); + out_cfo_ptr = output_tmp.data(); + } + + EXPECT_EQ(out_cfo_ptr[0], 0); + EXPECT_EQ(out_cfo_ptr[1], 1); + EXPECT_EQ(out_cfo_ptr[2], 1); + EXPECT_EQ(out_cfo_ptr[3], 2); + EXPECT_EQ(out_cfo_ptr[4], 3); + EXPECT_EQ(out_cfo_ptr[5], 4); + EXPECT_EQ(out_cfo_ptr[6], 4); + EXPECT_EQ(out_cfo_ptr[7], 5); + EXPECT_EQ(out_cfo_ptr[8], 6); + EXPECT_EQ(out_cfo_ptr[9], 7); + EXPECT_EQ(out_cfo_ptr[10], 7); + EXPECT_EQ(out_cfo_ptr[11], 8); + EXPECT_EQ(out_cfo_ptr[12], 9); + EXPECT_EQ(out_cfo_ptr[13], 10); + EXPECT_EQ(out_cfo_ptr[14], 10); + EXPECT_EQ(out_cfo_ptr[15], 11); + + // Col2Vol test + memset(input_ptr, 0, 12 * sizeof(float)); + if (paddle::platform::is_cpu_place(*place)) { + input = input_tmp; + } else { + input.CopyFrom(input_tmp, *place); + } + + paddle::operators::math::Col2VolFunctor col2vol; + col2vol(*context, input, output_cfo, stride, stride, stride, padding, padding, + padding); + + float* in_cfo_ptr; + if (paddle::platform::is_cpu_place(*place)) { + in_cfo_ptr = input.data(); + } else { + input_tmp.CopyFrom(input, paddle::platform::CPUPlace()); + in_cfo_ptr = input_tmp.data(); + } + + EXPECT_EQ(in_cfo_ptr[0], 0); + EXPECT_EQ(in_cfo_ptr[1], 2); + EXPECT_EQ(in_cfo_ptr[2], 2); + EXPECT_EQ(in_cfo_ptr[3], 3); + EXPECT_EQ(in_cfo_ptr[4], 8); + EXPECT_EQ(in_cfo_ptr[5], 5); + EXPECT_EQ(in_cfo_ptr[6], 6); + EXPECT_EQ(in_cfo_ptr[7], 14); + EXPECT_EQ(in_cfo_ptr[8], 8); + EXPECT_EQ(in_cfo_ptr[9], 9); + EXPECT_EQ(in_cfo_ptr[10], 20); + EXPECT_EQ(in_cfo_ptr[11], 11); +} + +TEST(math, vol2col) { + testVol2col(); +#ifndef PADDLE_ONLY_CPU + testVol2col(); +#endif +} From 089cc11df48c8b29b34eda8ea19328a090d4c9f6 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 9 Oct 2017 03:30:53 +0000 Subject: [PATCH 33/61] clean up && fix #4624 --- paddle/framework/block_desc.cc | 6 ++ paddle/framework/executor.cc | 37 +++------ 
paddle/framework/executor_test.cc | 129 ++++++++++++------------------ 3 files changed, 68 insertions(+), 104 deletions(-) diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 01f50e1393..509aa235d3 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -74,6 +74,12 @@ void BlockDescBind::Sync() { for (auto &op_desc : ops_) { op_field.AddAllocated(op_desc->Proto()); } + auto &var_field = *this->desc_->mutable_vars(); + var_field.Clear(); + var_field.Reserve(static_cast(vars_.size())); + for (auto &var_desc : vars_) { + var_field.AddAllocated(var_desc.second->Proto()); + } need_update_ = false; } } diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 9391e18ded..c6c9d13469 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -54,39 +54,33 @@ Executor::~Executor() { void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { // TODO(tonyyang-svail): - // - only runs the first block - // - only runs on the first device - // - test on gpu + // - only runs the first block (i.e. no RNN support) + // - only runs on the first device (i.e. 
no interdevice communication) auto& block = pdesc.blocks(0); auto& device = device_contexts_[0]; - // TODO(tonyyang-svail): - // - runs on a new local scope - // Scope& local_scope = scope->NewScope(); - + // Instantiate all the vars in the global scope for (auto& var : block.vars()) { scope->NewVar(var.name()); } + Scope& local_scope = scope->NewScope(); + std::vector should_run = Preprocess(pdesc); PADDLE_ENFORCE(should_run.size() == block.ops_size()); for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { + for (auto var : block.ops(i).outputs()) { + for (auto argu : var.arguments()) { + if (local_scope.FindVar(argu) == nullptr) { + local_scope.NewVar(argu); + } + } + } auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); - op->Run(*scope, *device); + op->Run(local_scope, *device); } } - - // // print tensor value - // for (auto& var : block.vars()) { - // std::cout << var.name() << std::endl; - // auto v = scope->FindVar(var.name()); - // const LoDTensor& t = v->Get(); - // for (int i = 0; i < t.numel(); ++i) { - // std::cout << t.data()[i] << " "; - // } - // std::cout << std::endl; - // } } std::vector Executor::Preprocess(const ProgramDesc& pdesc) { @@ -125,7 +119,6 @@ std::vector Executor::Preprocess(const ProgramDesc& pdesc) { } } - // TODO(tonyyang-svail): add VLOG here for debugging if (op_desc.type() == "fetch" || found_dependent_vars) { // erase its output to the dependency graph for (auto& var : op_desc.outputs()) { @@ -141,13 +134,9 @@ std::vector Executor::Preprocess(const ProgramDesc& pdesc) { } } - // this op should be executed should_run.push_back(true); - LOG(INFO) << "Yes " << op_desc.type(); } else { - // this op should NOT be executed should_run.push_back(false); - LOG(INFO) << "No " << op_desc.type(); } } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 7ce472ed2f..99f80d04e8 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -18,7 +18,7 
@@ limitations under the License. */ #include "paddle/framework/attribute.h" #include "paddle/framework/backward.h" #include "paddle/framework/block_desc.h" -#include "paddle/framework/grad_op_builder.h" +// #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -37,68 +37,27 @@ using namespace paddle::framework; typedef paddle::framework::BlockDesc proto_block; typedef paddle::framework::OpDesc proto_op; -struct SetAttrDescVisitor : public boost::static_visitor { - explicit SetAttrDescVisitor(OpDesc::Attr* attr) : attr_(attr) {} - mutable OpDesc::Attr* attr_; - void operator()(int v) const { attr_->set_i(v); } - void operator()(float v) const { attr_->set_f(v); } - void operator()(const std::string& v) const { attr_->set_s(v); } - void operator()(bool b) const { attr_->set_b(b); } - - void operator()(const std::vector& v) const { - VectorToRepeated(v, attr_->mutable_ints()); - } - void operator()(const std::vector& v) const { - VectorToRepeated(v, attr_->mutable_floats()); - } - void operator()(const std::vector& v) const { - VectorToRepeated(v, attr_->mutable_strings()); - } - void operator()(const std::vector& v) const { - VectorToRepeated(v, attr_->mutable_bools()); - } - void operator()(BlockDesc* desc) const { attr_->set_block_idx(desc->idx()); } - void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } -}; - void AddOp(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, AttributeMap attrs, - proto_block* block) { + paddle::framework::BlockDescBind* block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { - auto var = block->add_vars(); - var->set_name(v); - auto var_lt = var->mutable_lod_tensor(); - var_lt->set_data_type(paddle::framework::DataType::FP32); + auto var = block->NewVar(v); + var->SetDataType(paddle::framework::DataType::FP32); } } // insert op - auto op = 
block->add_ops(); - op->set_type(type); + auto op = block->AppendOp(); + op->SetType(type); for (auto kv : inputs) { - auto X = op->add_inputs(); - X->set_parameter(kv.first); - for (auto argu : kv.second) { - X->add_arguments(argu); - } + op->SetInput(kv.first, kv.second); } for (auto kv : outputs) { - auto X = op->add_outputs(); - X->set_parameter(kv.first); - for (auto argu : kv.second) { - X->add_arguments(argu); - } - } - for (auto& attr : attrs) { - auto* attr_desc = op->add_attrs(); - attr_desc->set_name(attr.first); - attr_desc->set_type( - static_cast(attr.second.which() - 1)); - SetAttrDescVisitor visitor(attr_desc); - boost::apply_visitor(visitor, attr.second); + op->SetOutput(kv.first, kv.second); } + op->SetAttrMap(attrs); } std::once_flag set_variable_flag; @@ -146,10 +105,16 @@ class ExecutorTesterRandom : public ::testing::Test { virtual void SetUp() override { int input_dim = 5, batch_size = 2, embed_dim = 5; - // init pdesc - auto init_root_block = init_pdesc_.add_blocks(); - init_root_block->set_idx(0); - init_root_block->set_parent_idx(-1); + // init pdesc ----------------------------------------- + auto temp_init_root_block = init_pdesc_.add_blocks(); + temp_init_root_block->set_idx(0); + temp_init_root_block->set_parent_idx(-1); + + // wrap to BlockDescBind + paddle::framework::ProgramDescBind& init_program = + paddle::framework::ProgramDescBind::Instance(&init_pdesc_); + paddle::framework::BlockDescBind* init_root_block = init_program.Block(0); + AddOp("gaussian_random", {}, {{"Out", {"w1"}}}, {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); AddOp("gaussian_random", {}, {{"Out", {"w2"}}}, @@ -160,11 +125,18 @@ class ExecutorTesterRandom : public ::testing::Test { AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, init_root_block); + // flush + init_program.Proto(); + + // run pdesc ----------------------------------------- + auto temp_root_block = pdesc_.add_blocks(); + 
temp_root_block->set_idx(0); + temp_root_block->set_parent_idx(-1); - // run pdesc - auto root_block = pdesc_.add_blocks(); - root_block->set_idx(0); - root_block->set_parent_idx(-1); + // wrap to BlockDescBind + paddle::framework::ProgramDescBind& program = + paddle::framework::ProgramDescBind::Instance(&pdesc_); + paddle::framework::BlockDescBind* root_block = program.Block(0); AddOp("gaussian_random", {}, {{"Out", {"a"}}}, {{"dims", std::vector{batch_size, input_dim}}}, root_block); @@ -175,13 +147,16 @@ class ExecutorTesterRandom : public ::testing::Test { AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}}, {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, root_block); - - AppendBackward(pdesc_, {}); - // AddOp("fetch", {{"Input", {"sub_result"}}}, {}, - // {{"dims", std::vector{input_dim, batch_size}}, {"col", 0}}, - // root_block); AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, {{"dims", std::vector{batch_size}}, {"col", 1}}, root_block); + // flush + program.Proto(); + + // TODO(tonyyang-svail): + // - Test with Backward + // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}}, + // {{"dims", std::vector{batch_size, 1}}}, root_block); + // AppendBackward(program, {}); } protected: @@ -192,9 +167,14 @@ class ExecutorTesterRandom : public ::testing::Test { class ExecutorTesterFeedAndFetch : public ::testing::Test { public: virtual void SetUp() override { - auto root_block = pdesc_.add_blocks(); - root_block->set_idx(0); - root_block->set_parent_idx(-1); + auto temp_root_block = pdesc_.add_blocks(); + temp_root_block->set_idx(0); + temp_root_block->set_parent_idx(-1); + + // wrap to BlockDescBind + paddle::framework::ProgramDescBind& program = + paddle::framework::ProgramDescBind::Instance(&pdesc_); + paddle::framework::BlockDescBind* root_block = program.Block(0); std::vector dim{6}; @@ -207,6 +187,9 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test { AddOp("fetch", {{"Input", {"b"}}}, {}, {{"dims", dim}, 
{"col", 1}}, root_block); + // flush + program.Proto(); + std::vector vec1 = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; std::vector vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; inputs_.push_back(vec1); @@ -235,12 +218,6 @@ TEST_F(ExecutorTesterRandom, CPU) { executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); - for (auto& vec : result) { - for (auto& num : vec) { - std::cout << num << " "; - } - std::cout << std::endl; - } delete executor; } @@ -290,18 +267,10 @@ TEST_F(ExecutorTesterRandom, GPU) { Executor* executor = new Executor(places); - LOG(INFO) << "Run Init"; executor->Run(init_pdesc_, GetGlobalScope()); - LOG(INFO) << "Run"; executor->Run(pdesc_, GetGlobalScope()); std::vector> result = get_fetch_variable(); - for (auto& vec : result) { - for (auto& num : vec) { - std::cout << num << " "; - } - std::cout << std::endl; - } delete executor; } From e51557130e91383afb0e54dee00710664c9bf555 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Mon, 9 Oct 2017 22:57:11 +0000 Subject: [PATCH 34/61] clean up for review --- paddle/framework/executor.cc | 40 ++++++++++++++------- paddle/framework/executor.h | 2 +- paddle/framework/executor_test.cc | 60 +++++++++++++------------------ paddle/framework/scope.cc | 1 + paddle/operators/feed_op.cc | 1 + paddle/operators/fetch_op.cc | 1 + paddle/platform/gpu_info.cc | 2 +- 7 files changed, 56 insertions(+), 51 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index c6c9d13469..3ac752388f 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/executor.h" + #include #include #include #include #include + #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/scope.h" @@ -27,7 +29,11 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +const std::string kFeedOpType = "feed"; +const std::string kFetchOpType = "fetch"; + Executor::Executor(const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); device_contexts_.resize(places.size()); for (size_t i = 0; i < places.size(); i++) { if (platform::is_cpu_place(places[i])) { @@ -46,9 +52,7 @@ Executor::Executor(const std::vector& places) { Executor::~Executor() { for (auto& device_context : device_contexts_) { - if (device_context) { - delete device_context; - } + delete device_context; } } @@ -56,6 +60,8 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { // TODO(tonyyang-svail): // - only runs the first block (i.e. no RNN support) // - only runs on the first device (i.e. no interdevice communication) + // - will change to use multiple blocks for RNN op and Cond Op + PADDLE_ENFORCE_GT(pdesc.blocks_size(), 0); auto& block = pdesc.blocks(0); auto& device = device_contexts_[0]; @@ -66,12 +72,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { Scope& local_scope = scope->NewScope(); - std::vector should_run = Preprocess(pdesc); - PADDLE_ENFORCE(should_run.size() == block.ops_size()); + std::vector should_run = Prune(pdesc); + PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size()); for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { - for (auto var : block.ops(i).outputs()) { - for (auto argu : var.arguments()) { + for (auto& var : block.ops(i).outputs()) { + for (auto& argu : var.arguments()) { if (local_scope.FindVar(argu) == nullptr) { local_scope.NewVar(argu); } @@ -81,28 +87,32 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { op->Run(local_scope, *device); } } + + // TODO(tonyyang-svail): + // - Destroy local_scope } -std::vector Executor::Preprocess(const ProgramDesc& pdesc) { +std::vector Executor::Prune(const ProgramDesc& pdesc) { // TODO(tonyyang-svail): // - only runs the first block + // - will change to use multiple blocks for RNN 
op and Cond Op auto& block = pdesc.blocks(0); auto& ops = block.ops(); bool expect_feed = true; for (auto& op_desc : ops) { - PADDLE_ENFORCE(op_desc.type() != "feed" || expect_feed, + PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed, "All FeedOps are at the beginning of the ProgramDesc"); - expect_feed = (op_desc.type() == "feed"); + expect_feed = (op_desc.type() == kFeedOpType); } bool expect_fetch = true; for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { auto& op_desc = *op_iter; - PADDLE_ENFORCE(op_desc.type() != "fetch" || expect_fetch, + PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch, "All FetchOps must at the end of the ProgramDesc"); - expect_fetch = (op_desc.type() == "fetch"); + expect_fetch = (op_desc.type() == kFetchOpType); } std::set dependent_vars; @@ -119,7 +129,7 @@ std::vector Executor::Preprocess(const ProgramDesc& pdesc) { } } - if (op_desc.type() == "fetch" || found_dependent_vars) { + if (op_desc.type() == kFetchOpType || found_dependent_vars) { // erase its output to the dependency graph for (auto& var : op_desc.outputs()) { for (auto& argu : var.arguments()) { @@ -140,6 +150,10 @@ std::vector Executor::Preprocess(const ProgramDesc& pdesc) { } } + // TODO(tonyyang-svail): + // - check this after integration of Init + // PADDLE_ENFORCE(dependent_vars.empty()); + // since we are traversing the ProgramDesc in reverse order // we reverse the should_run vector std::reverse(should_run.begin(), should_run.end()); diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 75cb5939ff..f832b0d7d6 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -46,7 +46,7 @@ class Executor { * @return * vector Same size as ops. Indicates whether an op should be run. 
*/ - std::vector Preprocess(const ProgramDesc& pdesc); + std::vector Prune(const ProgramDesc& pdesc); private: std::vector device_contexts_; diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 99f80d04e8..f28651e809 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/executor.h" + +#include #include + #include "gtest/gtest.h" #include "paddle/framework/attribute.h" #include "paddle/framework/backward.h" #include "paddle/framework/block_desc.h" -// #include "paddle/framework/grad_op_builder.h" #include "paddle/framework/op_desc.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" @@ -34,9 +36,6 @@ using std::string; using namespace paddle::platform; using namespace paddle::framework; -typedef paddle::framework::BlockDesc proto_block; -typedef paddle::framework::OpDesc proto_op; - void AddOp(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, AttributeMap attrs, paddle::framework::BlockDescBind* block) { @@ -51,10 +50,10 @@ void AddOp(const std::string& type, const VariableNameMap& inputs, // insert op auto op = block->AppendOp(); op->SetType(type); - for (auto kv : inputs) { + for (auto& kv : inputs) { op->SetInput(kv.first, kv.second); } - for (auto kv : outputs) { + for (auto& kv : outputs) { op->SetOutput(kv.first, kv.second); } op->SetAttrMap(attrs); @@ -65,11 +64,11 @@ std::once_flag set_variable_flag; // Tensors in feed value variable will only be in CPUPlace // So we can memcpy the data from vector to feed_value template -void set_feed_variable(const std::vector>& inputs) { +void SetFeedVariable(const std::vector>& inputs) { typedef std::vector FeedInputs; Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value"); FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); 
- auto size = inputs.size(); + size_t size = inputs.size(); feed_inputs.resize(size); for (size_t i = 0; i < size; i++) { T* dst = feed_inputs[i].mutable_data( @@ -81,12 +80,12 @@ void set_feed_variable(const std::vector>& inputs) { // Tensors in fetch value variable will only be in CPUPlace // So we can memcpy the data from fetch_value to vector template -std::vector> get_fetch_variable() { +std::vector> GetFetchVariable() { typedef std::vector FetchOutputs; Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value"); FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); - auto size = fetch_outputs.size(); + size_t size = fetch_outputs.size(); std::vector> result; result.reserve(size); for (size_t i = 0; i < size; i++) { @@ -105,7 +104,7 @@ class ExecutorTesterRandom : public ::testing::Test { virtual void SetUp() override { int input_dim = 5, batch_size = 2, embed_dim = 5; - // init pdesc ----------------------------------------- + // init pdesc auto temp_init_root_block = init_pdesc_.add_blocks(); temp_init_root_block->set_idx(0); temp_init_root_block->set_parent_idx(-1); @@ -128,7 +127,7 @@ class ExecutorTesterRandom : public ::testing::Test { // flush init_program.Proto(); - // run pdesc ----------------------------------------- + // run pdesc auto temp_root_block = pdesc_.add_blocks(); temp_root_block->set_idx(0); temp_root_block->set_parent_idx(-1); @@ -154,9 +153,6 @@ class ExecutorTesterRandom : public ::testing::Test { // TODO(tonyyang-svail): // - Test with Backward - // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}}, - // {{"dims", std::vector{batch_size, 1}}}, root_block); - // AppendBackward(program, {}); } protected: @@ -213,12 +209,11 @@ TEST_F(ExecutorTesterRandom, CPU) { // "pointer being freed was not allocated" error will appear. 
paddle::memory::Used(cpu_place); - Executor* executor = new Executor(places); + std::unique_ptr executor(new Executor(places)); + executor->Run(init_pdesc_, GetGlobalScope()); executor->Run(pdesc_, GetGlobalScope()); - std::vector> result = get_fetch_variable(); - - delete executor; + std::vector> result = GetFetchVariable(); } TEST_F(ExecutorTesterFeedAndFetch, CPU) { @@ -232,13 +227,12 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) { // "pointer being freed was not allocated" error will appear. paddle::memory::Used(cpu_place); - Executor* executor = new Executor(places); + std::unique_ptr executor(new Executor(places)); - // 3 mini-batch - for (int i = 0; i < 3; i++) { - set_feed_variable(inputs_); + for (int batch_id = 0; batch_id < 3; batch_id++) { + SetFeedVariable(inputs_); executor->Run(pdesc_, GetGlobalScope()); - std::vector> result = get_fetch_variable(); + std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); for (size_t i = 0; i < result.size(); ++i) { PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size()); @@ -247,8 +241,6 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) { } } } - - delete executor; } #else TEST_F(ExecutorTesterRandom, GPU) { @@ -265,13 +257,11 @@ TEST_F(ExecutorTesterRandom, GPU) { paddle::memory::Used(CPUPlace()); paddle::memory::Used(gpu_place); - Executor* executor = new Executor(places); + std::unique_ptr executor(new Executor(places)); executor->Run(init_pdesc_, GetGlobalScope()); executor->Run(pdesc_, GetGlobalScope()); - std::vector> result = get_fetch_variable(); - - delete executor; + std::vector> result = GetFetchVariable(); } TEST_F(ExecutorTesterFeedAndFetch, GPU) { @@ -287,13 +277,12 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) { paddle::memory::Used(CPUPlace()); paddle::memory::Used(gpu_place); - Executor* executor = new Executor(places); + std::unique_ptr executor(new Executor(places)); - // 3 mini-batch - for (int i = 0; i < 3; i++) { - set_feed_variable(inputs_); + for (int batch_id = 0; 
batch_id < 3; batch_id++) { + SetFeedVariable(inputs_); executor->Run(pdesc_, GetGlobalScope()); - std::vector> result = get_fetch_variable(); + std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); for (size_t i = 0; i < result.size(); ++i) { PADDLE_ENFORCE_EQ(result[i].size(), inputs_[i].size()); @@ -302,6 +291,5 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) { } } } - delete executor; } #endif diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 2a0d9bbf33..c9e53a0d85 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/scope.h" + #include // for unique_ptr #include // for call_once #include "paddle/string/printf.h" diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index b9e43be966..dcd5f7fb77 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -31,6 +31,7 @@ class FeedOp : public framework::OperatorWithKernel { const FeedInputs& tensors = g_feed_variable->Get(); + PADDLE_ENFORCE_GT(tensors.size(), col); auto in_dim = tensors[col].dims(); ctx->SetOutputDim("Out", in_dim); // TODO(qijun): need to handle LodTensor later diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 7bde4953cd..5adb83144a 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -35,6 +35,7 @@ class FetchOp : public framework::OperatorWithKernel { } auto input_dim = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_GT(tensors->size(), col); (*tensors)[col].Resize(input_dim); // TODO(qijun): need to handle LodTensor later diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index aa76bb209d..0cab5ffc56 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -44,7 +44,7 @@ int GetCurrentDeviceId() { void SetDeviceId(int id) { // TODO(qijun): find a 
better way to cache the cuda device count - PADDLE_ENFORCE(id < GetCUDADeviceCount(), "id must less than GPU count"); + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE(cudaSetDevice(id), "cudaSetDevice failed in paddle::platform::SetDeviceId"); } From 340d21d4ed7d8f0f2cc511b6480771965234570e Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 00:02:47 +0000 Subject: [PATCH 35/61] Init at block[0]; Run at block[1] --- paddle/framework/executor.cc | 16 ++++---- paddle/framework/executor.h | 4 +- paddle/framework/executor_test.cc | 63 +++++++++++++------------------ 3 files changed, 36 insertions(+), 47 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 3ac752388f..bbc7f77a94 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -56,13 +56,12 @@ Executor::~Executor() { } } -void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { // TODO(tonyyang-svail): - // - only runs the first block (i.e. no RNN support) // - only runs on the first device (i.e. 
no interdevice communication) // - will change to use multiple blocks for RNN op and Cond Op - PADDLE_ENFORCE_GT(pdesc.blocks_size(), 0); - auto& block = pdesc.blocks(0); + PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id); + auto& block = pdesc.blocks(block_id); auto& device = device_contexts_[0]; // Instantiate all the vars in the global scope @@ -72,7 +71,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { Scope& local_scope = scope->NewScope(); - std::vector should_run = Prune(pdesc); + std::vector should_run = Prune(pdesc, block_id); PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size()); for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { @@ -92,12 +91,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope) { // - Destroy local_scope } -std::vector Executor::Prune(const ProgramDesc& pdesc) { +std::vector Executor::Prune(const ProgramDesc& pdesc, int block_id) { // TODO(tonyyang-svail): - // - only runs the first block // - will change to use multiple blocks for RNN op and Cond Op - auto& block = pdesc.blocks(0); + auto& block = pdesc.blocks(block_id); auto& ops = block.ops(); bool expect_feed = true; @@ -144,8 +142,10 @@ std::vector Executor::Prune(const ProgramDesc& pdesc) { } } + LOG(INFO) << "1 " << op_desc.type(); should_run.push_back(true); } else { + LOG(INFO) << "0 " << op_desc.type(); should_run.push_back(false); } } diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index f832b0d7d6..7fac4f4f46 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -34,7 +34,7 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDesc&, Scope*); + void Run(const ProgramDesc&, Scope*, int); protected: /* @Brief @@ -46,7 +46,7 @@ class Executor { * @return * vector Same size as ops. Indicates whether an op should be run. 
*/ - std::vector Prune(const ProgramDesc& pdesc); + std::vector Prune(const ProgramDesc& pdesc, int block_id); private: std::vector device_contexts_; diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index f28651e809..b64ba1c98f 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -104,50 +104,40 @@ class ExecutorTesterRandom : public ::testing::Test { virtual void SetUp() override { int input_dim = 5, batch_size = 2, embed_dim = 5; - // init pdesc - auto temp_init_root_block = init_pdesc_.add_blocks(); - temp_init_root_block->set_idx(0); - temp_init_root_block->set_parent_idx(-1); - - // wrap to BlockDescBind - paddle::framework::ProgramDescBind& init_program = - paddle::framework::ProgramDescBind::Instance(&init_pdesc_); - paddle::framework::BlockDescBind* init_root_block = init_program.Block(0); + auto temp_root_block = pdesc_.add_blocks(); + temp_root_block->set_idx(0); + temp_root_block->set_parent_idx(-1); + paddle::framework::ProgramDescBind& program = + paddle::framework::ProgramDescBind::Instance(&pdesc_); + paddle::framework::BlockDescBind* root_block = program.Block(0); + // block[0] AddOp("gaussian_random", {}, {{"Out", {"w1"}}}, - {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); + {{"dims", std::vector{input_dim, embed_dim}}}, root_block); AddOp("gaussian_random", {}, {{"Out", {"w2"}}}, - {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); + {{"dims", std::vector{embed_dim, input_dim}}}, root_block); AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"dims", std::vector{input_dim, embed_dim}}, {"col", 0}}, - init_root_block); + root_block); AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, - init_root_block); - // flush - init_program.Proto(); - - // run pdesc - auto temp_root_block = pdesc_.add_blocks(); - temp_root_block->set_idx(0); - temp_root_block->set_parent_idx(-1); - - // wrap to BlockDescBind - 
paddle::framework::ProgramDescBind& program = - paddle::framework::ProgramDescBind::Instance(&pdesc_); - paddle::framework::BlockDescBind* root_block = program.Block(0); + root_block); + // block[1] + paddle::framework::BlockDescBind* run_block = + program.AppendBlock(*root_block); AddOp("gaussian_random", {}, {{"Out", {"a"}}}, - {{"dims", std::vector{batch_size, input_dim}}}, root_block); + {{"dims", std::vector{batch_size, input_dim}}}, run_block); AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {}, - root_block); + run_block); AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {}, - root_block); + run_block); AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}}, {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, - root_block); + run_block); AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, - {{"dims", std::vector{batch_size}}, {"col", 1}}, root_block); + {{"dims", std::vector{batch_size}}, {"col", 1}}, run_block); + // flush program.Proto(); @@ -157,7 +147,6 @@ class ExecutorTesterRandom : public ::testing::Test { protected: ProgramDesc pdesc_; - ProgramDesc init_pdesc_; }; class ExecutorTesterFeedAndFetch : public ::testing::Test { @@ -211,8 +200,8 @@ TEST_F(ExecutorTesterRandom, CPU) { std::unique_ptr executor(new Executor(places)); - executor->Run(init_pdesc_, GetGlobalScope()); - executor->Run(pdesc_, GetGlobalScope()); + executor->Run(pdesc_, GetGlobalScope(), 0); + executor->Run(pdesc_, GetGlobalScope(), 1); std::vector> result = GetFetchVariable(); } @@ -231,7 +220,7 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) { for (int batch_id = 0; batch_id < 3; batch_id++) { SetFeedVariable(inputs_); - executor->Run(pdesc_, GetGlobalScope()); + executor->Run(pdesc_, GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); for (size_t i = 0; i < result.size(); ++i) { @@ -259,8 +248,8 @@ TEST_F(ExecutorTesterRandom, GPU) { std::unique_ptr executor(new 
Executor(places)); - executor->Run(init_pdesc_, GetGlobalScope()); - executor->Run(pdesc_, GetGlobalScope()); + executor->Run(pdesc_, GetGlobalScope(), 0); + executor->Run(pdesc_, GetGlobalScope(), 1); std::vector> result = GetFetchVariable(); } @@ -281,7 +270,7 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) { for (int batch_id = 0; batch_id < 3; batch_id++) { SetFeedVariable(inputs_); - executor->Run(pdesc_, GetGlobalScope()); + executor->Run(pdesc_, GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); for (size_t i = 0; i < result.size(); ++i) { From 932402c16b1ad41851a307e2fcb432e674609071 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 02:59:49 +0000 Subject: [PATCH 36/61] debug for sum --- paddle/framework/backward.cc | 1 + paddle/framework/executor.cc | 13 +++++- paddle/framework/executor_test.cc | 69 +++++++++++++++++++++---------- paddle/operators/feed_op.cc | 2 +- paddle/operators/fetch_op.cc | 2 +- 5 files changed, 62 insertions(+), 25 deletions(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 0a4688db9c..9a5c4e9cf0 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -378,6 +378,7 @@ std::vector> MakeBlockBackward( backward_descs[dup_op[i]]->Rename(out_name, new_name); sum_op_inputs.emplace_back(new_name); } + LOG(INFO) << "fuck " << sum_op_inputs.size(); std::unique_ptr sum_op(new OpDescBind( "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index bbc7f77a94..ee6243a9bf 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -74,7 +74,8 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { std::vector should_run = Prune(pdesc, block_id); PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size()); for (size_t i = 0; i < 
should_run.size(); ++i) { - if (should_run[i]) { + // if (should_run[i]) { + if (true) { for (auto& var : block.ops(i).outputs()) { for (auto& argu : var.arguments()) { if (local_scope.FindVar(argu) == nullptr) { @@ -82,7 +83,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { } } } + LOG(INFO) << block.ops(i).type(); + if (block.ops(i).type() == "sum") { + LOG(INFO) << "Here"; + for (auto& var : block.ops(i).inputs()) { + for (auto& argu : var.arguments()) { + LOG(INFO) << var.parameter() << " " << argu; + } + } + } auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); + LOG(INFO) << op->DebugString(); op->Run(local_scope, *device); } } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index b64ba1c98f..12be79d01b 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -30,6 +30,7 @@ USE_OP(gaussian_random); USE_OP(feed); USE_OP(fetch); USE_OP(mul); +USE_OP(sum); USE_OP(squared_l2_distance); using std::string; @@ -104,40 +105,63 @@ class ExecutorTesterRandom : public ::testing::Test { virtual void SetUp() override { int input_dim = 5, batch_size = 2, embed_dim = 5; - auto temp_root_block = pdesc_.add_blocks(); - temp_root_block->set_idx(0); - temp_root_block->set_parent_idx(-1); - paddle::framework::ProgramDescBind& program = - paddle::framework::ProgramDescBind::Instance(&pdesc_); - paddle::framework::BlockDescBind* root_block = program.Block(0); + auto temp_init_root_block = init_pdesc_.add_blocks(); + temp_init_root_block->set_idx(0); + temp_init_root_block->set_parent_idx(-1); + paddle::framework::ProgramDescBind& init_program = + paddle::framework::ProgramDescBind::Instance(&init_pdesc_); + paddle::framework::BlockDescBind* init_root_block = init_program.Block(0); - // block[0] AddOp("gaussian_random", {}, {{"Out", {"w1"}}}, - {{"dims", std::vector{input_dim, embed_dim}}}, root_block); + {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); 
AddOp("gaussian_random", {}, {{"Out", {"w2"}}}, - {{"dims", std::vector{embed_dim, input_dim}}}, root_block); + {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"dims", std::vector{input_dim, embed_dim}}, {"col", 0}}, - root_block); + init_root_block); AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, - root_block); + init_root_block); + + // flush + init_program.Proto(); + + auto temp_root_block = pdesc_.add_blocks(); + temp_root_block->set_idx(0); + temp_root_block->set_parent_idx(-1); + paddle::framework::ProgramDescBind& program = + paddle::framework::ProgramDescBind::Instance(&pdesc_); + paddle::framework::BlockDescBind* root_block = program.Block(0); - // block[1] - paddle::framework::BlockDescBind* run_block = - program.AppendBlock(*root_block); AddOp("gaussian_random", {}, {{"Out", {"a"}}}, - {{"dims", std::vector{batch_size, input_dim}}}, run_block); + {{"dims", std::vector{batch_size, input_dim}}}, root_block); AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {}, - run_block); + root_block); AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {}, - run_block); + root_block); AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}}, {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, - run_block); - AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, - {{"dims", std::vector{batch_size}}, {"col", 1}}, run_block); + root_block); + AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}}, + {{"dims", std::vector{batch_size, 1}}}, root_block); + AppendBackward(program, {}); + + program.Proto(); + + for (auto& op : pdesc_.blocks(0).ops()) { + if (op.type() == "sum") { + LOG(INFO) << "Here"; + for (auto& var : op.inputs()) { + for (auto& argu : var.arguments()) { + LOG(INFO) << var.parameter() << " " << argu; + } + } + } + } + + AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, + {{"dims", 
std::vector{batch_size}}, {"col", 1}}, root_block); // flush program.Proto(); @@ -146,6 +170,7 @@ class ExecutorTesterRandom : public ::testing::Test { } protected: + ProgramDesc init_pdesc_; ProgramDesc pdesc_; }; @@ -200,8 +225,8 @@ TEST_F(ExecutorTesterRandom, CPU) { std::unique_ptr executor(new Executor(places)); + executor->Run(init_pdesc_, GetGlobalScope(), 0); executor->Run(pdesc_, GetGlobalScope(), 0); - executor->Run(pdesc_, GetGlobalScope(), 1); std::vector> result = GetFetchVariable(); } @@ -248,8 +273,8 @@ TEST_F(ExecutorTesterRandom, GPU) { std::unique_ptr executor(new Executor(places)); + executor->Run(init_pdesc_, GetGlobalScope(), 0); executor->Run(pdesc_, GetGlobalScope(), 0); - executor->Run(pdesc_, GetGlobalScope(), 1); std::vector> result = GetFetchVariable(); } diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index dcd5f7fb77..b15bc86ae1 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -22,7 +22,7 @@ class FeedOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContextBase* ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { typedef std::vector FeedInputs; PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null."); int col = ctx->Attrs().Get("col"); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 5adb83144a..7ca3762c36 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -22,7 +22,7 @@ class FetchOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContextBase* ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { typedef std::vector FetchOutputs; PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null."); int col = ctx->Attrs().Get("col"); From 
15400748ae6d21facb0b8e656b4298e1ae83df89 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 9 Oct 2017 20:42:29 -0700 Subject: [PATCH 37/61] follow comments and refine codes --- paddle/framework/backward.cc | 2 +- paddle/framework/executor_test.cc | 44 +++++++++++++++---------------- paddle/operators/feed_op.cc | 6 ++--- paddle/operators/feed_op.h | 4 +-- paddle/operators/fetch_op.cc | 4 +-- paddle/operators/fetch_op.h | 4 +-- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 9a5c4e9cf0..774d8e4918 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -378,7 +378,7 @@ std::vector> MakeBlockBackward( backward_descs[dup_op[i]]->Rename(out_name, new_name); sum_op_inputs.emplace_back(new_name); } - LOG(INFO) << "fuck " << sum_op_inputs.size(); + LOG(INFO) << "sum_op_inputs size " << sum_op_inputs.size(); std::unique_ptr sum_op(new OpDescBind( "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 12be79d01b..0515fb2216 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -60,15 +60,13 @@ void AddOp(const std::string& type, const VariableNameMap& inputs, op->SetAttrMap(attrs); } -std::once_flag set_variable_flag; - // Tensors in feed value variable will only be in CPUPlace -// So we can memcpy the data from vector to feed_value +// So we can memcpy the data from vector to feed_value template void SetFeedVariable(const std::vector>& inputs) { - typedef std::vector FeedInputs; Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value"); - FeedInputs& feed_inputs = *(g_feed_value->GetMutable()); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); size_t size = inputs.size(); feed_inputs.resize(size); for (size_t i = 0; i < size; i++) { @@ -82,9 +80,9 @@ void 
SetFeedVariable(const std::vector>& inputs) { // So we can memcpy the data from fetch_value to vector template std::vector> GetFetchVariable() { - typedef std::vector FetchOutputs; Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value"); - FetchOutputs& fetch_outputs = *(g_fetch_value->GetMutable()); + auto& fetch_outputs = + *(g_fetch_value->GetMutable>()); size_t size = fetch_outputs.size(); std::vector> result; @@ -143,22 +141,22 @@ class ExecutorTesterRandom : public ::testing::Test { {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, root_block); - AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}}, - {{"dims", std::vector{batch_size, 1}}}, root_block); - AppendBackward(program, {}); - - program.Proto(); - - for (auto& op : pdesc_.blocks(0).ops()) { - if (op.type() == "sum") { - LOG(INFO) << "Here"; - for (auto& var : op.inputs()) { - for (auto& argu : var.arguments()) { - LOG(INFO) << var.parameter() << " " << argu; - } - } - } - } + // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}}, + // {{"dims", std::vector{batch_size, 1}}}, root_block); + // AppendBackward(program, {}); + + // program.Proto(); + + // for (auto& op : pdesc_.blocks(0).ops()) { + // if (op.type() == "sum") { + // LOG(INFO) << "Here"; + // for (auto& var : op.inputs()) { + // for (auto& argu : var.arguments()) { + // LOG(INFO) << var.parameter() << " " << argu; + // } + // } + // } + // } AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, {{"dims", std::vector{batch_size}}, {"col", 1}}, root_block); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index b15bc86ae1..29e128ce7e 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -23,15 +23,15 @@ class FeedOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - typedef std::vector FeedInputs; PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null."); int col = 
ctx->Attrs().Get("col"); framework::Variable* g_feed_variable = framework::GetGlobalScope()->FindVar("feed_value"); - const FeedInputs& tensors = g_feed_variable->Get(); + const auto& tensors = + g_feed_variable->Get>(); - PADDLE_ENFORCE_GT(tensors.size(), col); + PADDLE_ENFORCE_GT(tensors.size(), static_cast(col)); auto in_dim = tensors[col].dims(); ctx->SetOutputDim("Out", in_dim); // TODO(qijun): need to handle LodTensor later diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h index de8ec6ff61..96e3bf52bd 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -23,13 +23,13 @@ template class FeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - typedef std::vector FeedInputs; framework::Tensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); framework::Variable* g_feed_variable = framework::GetGlobalScope()->FindVar("feed_value"); int col = ctx.template Attr("col"); - const FeedInputs& tensors = g_feed_variable->Get(); + const auto& tensors = + g_feed_variable->Get>(); out->CopyFrom(tensors[col], ctx.GetPlace()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 7ca3762c36..77e3450a73 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -23,13 +23,13 @@ class FetchOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - typedef std::vector FetchOutputs; PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null."); int col = ctx->Attrs().Get("col"); framework::Variable* g_fetch_variable = framework::GetGlobalScope()->FindVar("fetch_value"); - FetchOutputs* tensors = g_fetch_variable->GetMutable(); + auto* tensors = + g_fetch_variable->GetMutable>(); if (tensors->size() < static_cast(col + 1)) { tensors->resize(col + 1); } diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h index 
3bec9c9974..fd98552055 100644 --- a/paddle/operators/fetch_op.h +++ b/paddle/operators/fetch_op.h @@ -23,12 +23,12 @@ template class FetchKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - typedef std::vector FetchOutputs; const framework::Tensor* input = ctx.Input("Input"); int col = ctx.template Attr("col"); framework::Variable* g_fetch_variable = framework::GetGlobalScope()->FindVar("fetch_value"); - FetchOutputs* tensors = g_fetch_variable->GetMutable(); + auto* tensors = + g_fetch_variable->GetMutable>(); (*tensors)[col].mutable_data(platform::CPUPlace()); (*tensors)[col].CopyFrom(*input, platform::CPUPlace()); } From e3161bb61a4686d96588bc1eb86c3edc0e26e6ee Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 04:49:45 +0000 Subject: [PATCH 38/61] pass simple backward --- paddle/framework/executor_test.cc | 51 ++++++++++++++++++------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 0515fb2216..9f8a6f8593 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -32,6 +32,8 @@ USE_OP(fetch); USE_OP(mul); USE_OP(sum); USE_OP(squared_l2_distance); +USE_OP(fill_constant); +USE_OP(sgd); using std::string; using namespace paddle::platform; @@ -124,6 +126,7 @@ class ExecutorTesterRandom : public ::testing::Test { // flush init_program.Proto(); + // run block auto temp_root_block = pdesc_.add_blocks(); temp_root_block->set_idx(0); temp_root_block->set_parent_idx(-1); @@ -131,6 +134,7 @@ class ExecutorTesterRandom : public ::testing::Test { paddle::framework::ProgramDescBind::Instance(&pdesc_); paddle::framework::BlockDescBind* root_block = program.Block(0); + // forward AddOp("gaussian_random", {}, {{"Out", {"a"}}}, {{"dims", std::vector{batch_size, input_dim}}}, root_block); AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {}, @@ -141,30 +145,33 @@ 
class ExecutorTesterRandom : public ::testing::Test { {{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {}, root_block); - // AddOp("gaussian_random", {}, {{"Out", {"l2_distance@GRAD"}}}, - // {{"dims", std::vector{batch_size, 1}}}, root_block); - // AppendBackward(program, {}); - - // program.Proto(); - - // for (auto& op : pdesc_.blocks(0).ops()) { - // if (op.type() == "sum") { - // LOG(INFO) << "Here"; - // for (auto& var : op.inputs()) { - // for (auto& argu : var.arguments()) { - // LOG(INFO) << var.parameter() << " " << argu; - // } - // } - // } - // } - - AddOp("fetch", {{"Input", {"l2_distance"}}}, {}, - {{"dims", std::vector{batch_size}}, {"col", 1}}, root_block); + // backward + AddOp("fill_constant", {}, {{"Out", {"l2_distance@GRAD"}}}, + {{"shape", std::vector{batch_size, 1}}, {"value", float(1.0)}}, + root_block); + AppendBackward(program, {}); + + // update + AddOp("fill_constant", {}, {{"Out", {"learning_rate"}}}, + {{"shape", std::vector{1}}, {"value", float(1.0)}}, root_block); + AddOp("sgd", {{"Param", {"w1"}}, + {"LearningRate", {"learning_rate"}}, + {"Grad", {"w1@GRAD"}}}, + {{"ParamOut", {"w1"}}}, {}, root_block); + AddOp("sgd", {{"Param", {"w2"}}, + {"LearningRate", {"learning_rate"}}, + {"Grad", {"w2@GRAD"}}}, + {{"ParamOut", {"w2"}}}, {}, root_block); + + AddOp("fetch", {{"Input", {"w1"}}}, {}, + {{"dims", std::vector{input_dim, embed_dim}}, {"col", 0}}, + root_block); + AddOp("fetch", {{"Input", {"w2"}}}, {}, + {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, + root_block); + // flush program.Proto(); - - // TODO(tonyyang-svail): - // - Test with Backward } protected: From 2fc7fc7a18fb8cbb78d380caf51947097138597c Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 05:33:11 +0000 Subject: [PATCH 39/61] pass multiple forward backward --- paddle/framework/executor_test.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/framework/executor_test.cc 
b/paddle/framework/executor_test.cc index 9f8a6f8593..259205f7c1 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -279,8 +279,10 @@ TEST_F(ExecutorTesterRandom, GPU) { std::unique_ptr executor(new Executor(places)); executor->Run(init_pdesc_, GetGlobalScope(), 0); - executor->Run(pdesc_, GetGlobalScope(), 0); - std::vector> result = GetFetchVariable(); + for (int batch_id = 0; batch_id < 3; batch_id++) { + executor->Run(pdesc_, GetGlobalScope(), 0); + std::vector> result = GetFetchVariable(); + } } TEST_F(ExecutorTesterFeedAndFetch, GPU) { From 975a51294e20c122e7143a232261d4fd49ac5643 Mon Sep 17 00:00:00 2001 From: qijun Date: Mon, 9 Oct 2017 23:55:35 -0700 Subject: [PATCH 40/61] infer feed operator output variable shape with dims attribute --- paddle/operators/feed_op.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 29e128ce7e..1d65c2bb46 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -32,8 +32,12 @@ class FeedOp : public framework::OperatorWithKernel { g_feed_variable->Get>(); PADDLE_ENFORCE_GT(tensors.size(), static_cast(col)); - auto in_dim = tensors[col].dims(); - ctx->SetOutputDim("Out", in_dim); + + auto& shape = ctx->Attrs().Get>("dims"); + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + ctx->SetOutputDim("Out", framework::make_ddim(shape_int64)); // TODO(qijun): need to handle LodTensor later } From 67edd04a2f37c6bee5642d1d75be5ca5eb250b4b Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 10 Oct 2017 21:29:18 +0800 Subject: [PATCH 41/61] fix doc --- paddle/operators/pool_op.cc | 75 ++++++++++++++++---------- paddle/operators/pool_with_index_op.cc | 7 +-- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index 
ba3b5ed207..acc7e66c08 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -40,8 +40,6 @@ class PoolOp : public framework::OperatorWithKernel { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg", - "pooling_type should be 'max' or 'avg'"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D"); @@ -52,13 +50,11 @@ class PoolOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, - "Input size and Pooling size should be consistent."); - PADDLE_ENFORCE(ksize.size() == 2 || ksize.size() == 3, - "Pooling size should be 2 elements. or 3 elements."); + "Input size and pooling size should be consistent."); PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), - "strides size and pooling size should be the same."); + "Strides size and pooling size should be the same."); PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), - "paddings size and pooling size should be the same."); + "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { @@ -75,10 +71,9 @@ class PoolOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "X(Input) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input@Grad of Pooling should not be null."); + "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; @@ -94,17 +89,22 @@ class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { "number of channels, H and W is the height and width of feature."); AddOutput("Out", "The output tensor of pooling 
operator." - "The format of output tensor is also NCHW."); + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); AddAttr("poolingType", "PoolingType of pooling operator." "Str constant equal to 'max' or 'avg'.") .InEnum({"max", "avg"}); + AddAttr>( "ksize", - "Pooling size(depth, height, width) of pooling operator." + "The pooling size(height, width) of pooling operator." "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Add checker) + "specified."); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) AddAttr( "globalPooling", "Whether to use the globalPooling." @@ -114,15 +114,22 @@ class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr>("strides", "Strides(height, width) of pooling operator." - "Default {1,1}") - .SetDefault({1, 1}); // TODO(Add checker) + "Default {1,1}.") + .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) AddAttr>("paddings", "Paddings(height, width) of pooling operator." "Default {0,0}.") - .SetDefault({0, 0}); // TODO(Add checker) + .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddComment(R"DOC( The pooling2d operation calculates the output based on the input, poolingType and ksize, strides, paddings parameters. +Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the +number of channels, H and W is the height and width of feature. +Parameters(ksize, strides, paddings) are two elements. +These two elements represent height and width, respectively. 
)DOC"); } }; @@ -131,25 +138,30 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { public: Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", - "The input tensor of pooling operator. " - "The format of input tensor is NCDHW. Where N is batch size, C is " - "the " - "number of channels, D, H and W is the depth, height and width of " - "feature."); + AddInput( + "X", + "The input tensor of pooling operator. " + "The format of input tensor is NCDHW. Where N is batch size, C is " + "the number of channels, D, H and W is the depth, height and width of " + "feature."); AddOutput("Out", "The output tensor of pooling operator." - "The format of output tensor is also NCDHW."); + "The format of output tensor is also NCDHW." + "Where N is batch size, C is " + "the number of channels, D, H and W is the depth, height and " + "width of feature."); AddAttr("poolingType", "PoolingType of pooling operator." - "str constant equal to 'max' or 'avg'.") + "Str constant equal to 'max' or 'avg'.") .InEnum({"max", "avg"}); + AddAttr>( "ksize", - "Pooling size(depth, height, width) of pooling operator." + "The pooling size(depth, height, width) of pooling operator." "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Add checker) + "specified."); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) AddAttr( "globalPooling", "Whether to use the globalPooling." @@ -161,15 +173,22 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { "strides", "Strides(depth, height, width) of pooling operator." "Default {1,1,1}.") - .SetDefault({1, 1, 1}); // TODO(Add checker) + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", "Paddings(depth, height, width) of pooling operator." 
"Default {0,0,0}.") - .SetDefault({0, 0, 0}); // TODO(Add checker) + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddComment(R"DOC( The pooling3d operation calculates the output based on the input, poolingType and ksize, strides, paddings parameters. +Input(X) and output(Out) are in NCDHW format. Where N is batch +size, C is the number of channels, D, H and W is the depth, height and +width of feature. Parameters(ksize, strides, paddings) are three elements. +These three elements represent depth, height and width, respectively. )DOC"); } }; diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index ab933a3400..b49d486d7c 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -28,7 +28,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContextBase *ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), @@ -52,7 +52,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, - "Intput size and pooling size should be consistent."); + "Input size and pooling size should be consistent."); PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), "Strides size and pooling size should be the same."); PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), @@ -73,7 +73,8 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContextBase *ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Mask"), 
"Input(Mask) must not be null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); From 6db476ed89b64a91e07ed7e13344645d27c9f1fb Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 10 Oct 2017 21:35:39 +0800 Subject: [PATCH 42/61] Separate the declarations and implementation of the PoolOp and PoolMaker class in order to reuse in pool_cudnn --- paddle/operators/pool_op.cc | 290 +++++++++++++++++------------------- paddle/operators/pool_op.h | 28 ++++ 2 files changed, 164 insertions(+), 154 deletions(-) diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index acc7e66c08..25fd01844b 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -22,108 +22,94 @@ int OutputSizePool(int input_size, int filter_size, int padding, int stride) { return output_size; } -class PoolOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "X(Input) of Pooling should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Out(Output) of Pooling should not be null."); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::string pooling_type = ctx->Attrs().Get("poolingType"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - - PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Pooling intput should be 4-D or 5-D"); - - if (ctx->Attrs().Get("globalPooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) - ksize[i] = static_cast(in_x_dims[i + 2]); - } - - PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, - "Input size and pooling size should be consistent."); - 
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), - "Strides size and pooling size should be the same."); - PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), - "Paddings size and pooling size should be the same."); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back( - OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); - } - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); +void PoolOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Pooling should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + + std::string pooling_type = ctx->Attrs().Get("poolingType"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Pooling intput should be 4-D or 5-D"); + + if (ctx->Attrs().Get("globalPooling")) { + ksize.resize(static_cast(in_x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) + ksize[i] = static_cast(in_x_dims[i + 2]); } -}; - -class PoolOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + + PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U, + "Input size and pooling size should be consistent."); + PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), + "Strides size and pooling size should be the same."); + 
PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(), + "Paddings size and pooling size should be the same."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); } -}; - -class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { - public: - Pool2dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "X", - "The input tensor of pooling operator. " - "The format of input tensor is NCHW. Where N is batch size, C is the " - "number of channels, H and W is the height and width of feature."); - AddOutput("Out", - "The output tensor of pooling operator." - "The format of output tensor is also NCHW." - "Where N is batch size, C is " - "the number of channels, H and W is the height and " - "width of feature."); - - AddAttr("poolingType", - "PoolingType of pooling operator." - "Str constant equal to 'max' or 'avg'.") - .InEnum({"max", "avg"}); - - AddAttr>( - "ksize", - "The pooling size(height, width) of pooling operator." - "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr( - "globalPooling", - "Whether to use the globalPooling." - "Bool constant equal to false or true." - "Default false." - "If globalPooling = true, ksize is ignored and need not be specified.") - .SetDefault(false); - AddAttr>("strides", - "Strides(height, width) of pooling operator." - "Default {1,1}.") - .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr>("paddings", - "Paddings(height, width) of pooling operator." - "Default {0,0}.") - .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) 
- - AddComment(R"DOC( + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); +} + +void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); +} + +Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "The input tensor of pooling operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "The output tensor of pooling operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + + AddAttr("poolingType", + "PoolingType of pooling operator." + "Str constant equal to 'max' or 'avg'.") + .InEnum({"max", "avg"}); + + AddAttr>( + "ksize", + "The pooling size(height, width) of pooling operator." + "If globalPooling = true, ksize is ignored and need not be " + "specified."); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "globalPooling", + "Whether to use the globalPooling." + "Bool constant equal to false or true." + "Default false." + "If globalPooling = true, ksize is ignored and need not be specified.") + .SetDefault(false); + AddAttr>("strides", + "Strides(height, width) of pooling operator." + "Default {1,1}.") + .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>("paddings", + "Paddings(height, width) of pooling operator." + "Default {0,0}.") + .SetDefault({0, 0}); // TODO(Chengduo): Add checker. 
(Currently, + // TypedAttrChecker don't support vector type.) + + AddComment(R"DOC( The pooling2d operation calculates the output based on the input, poolingType and ksize, strides, paddings parameters. Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the @@ -131,58 +117,55 @@ number of channels, H and W is the height and width of feature. Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. )DOC"); - } -}; - -class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { - public: - Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "X", - "The input tensor of pooling operator. " - "The format of input tensor is NCDHW. Where N is batch size, C is " - "the number of channels, D, H and W is the depth, height and width of " - "feature."); - AddOutput("Out", - "The output tensor of pooling operator." - "The format of output tensor is also NCDHW." - "Where N is batch size, C is " - "the number of channels, D, H and W is the depth, height and " - "width of feature."); - - AddAttr("poolingType", - "PoolingType of pooling operator." - "Str constant equal to 'max' or 'avg'.") - .InEnum({"max", "avg"}); - - AddAttr>( - "ksize", - "The pooling size(depth, height, width) of pooling operator." - "If globalPooling = true, ksize is ignored and need not be " - "specified."); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - AddAttr( - "globalPooling", - "Whether to use the globalPooling." - "Bool constant equal to false or true." - "Default false." - "If globalPooling = true, ksize is ignored and need not be specified.") - .SetDefault(false); - AddAttr>( - "strides", - "Strides(depth, height, width) of pooling operator." - "Default {1,1,1}.") - .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. 
(Currently, - // TypedAttrChecker don't support vector type.) - AddAttr>( - "paddings", - "Paddings(depth, height, width) of pooling operator." - "Default {0,0,0}.") - .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, - // TypedAttrChecker don't support vector type.) - - AddComment(R"DOC( +} + +Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "The input tensor of pooling operator. " + "The format of input tensor is NCDHW. Where N is batch size, C is " + "the number of channels, D, H and W is the depth, height and width of " + "feature."); + AddOutput("Out", + "The output tensor of pooling operator." + "The format of output tensor is also NCDHW." + "Where N is batch size, C is " + "the number of channels, D, H and W is the depth, height and " + "width of feature."); + + AddAttr("poolingType", + "PoolingType of pooling operator." + "Str constant equal to 'max' or 'avg'.") + .InEnum({"max", "avg"}); + + AddAttr>( + "ksize", + "The pooling size(depth, height, width) of pooling operator." + "If globalPooling = true, ksize is ignored and need not be " + "specified."); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr( + "globalPooling", + "Whether to use the globalPooling." + "Bool constant equal to false or true." + "Default false." + "If globalPooling = true, ksize is ignored and need not be specified.") + .SetDefault(false); + AddAttr>("strides", + "Strides(depth, height, width) of pooling operator." + "Default {1,1,1}.") + .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) + AddAttr>( + "paddings", + "Paddings(depth, height, width) of pooling operator." + "Default {0,0,0}.") + .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, + // TypedAttrChecker don't support vector type.) 
+ + AddComment(R"DOC( The pooling3d operation calculates the output based on the input, poolingType and ksize, strides, paddings parameters. Input(X) and output(Out) are in NCDHW format. Where N is batch @@ -190,8 +173,7 @@ size, C is the number of channels, D, H and W is the depth, height and width of feature. Parameters(ksize, strides, paddings) are three elements. These three elements represent depth, height and width, respectively. )DOC"); - } -}; +} } // namespace operators } // namespace paddle diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index c2bc358def..e5016d573d 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -24,6 +24,34 @@ namespace operators { using Tensor = framework::Tensor; +class PoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class PoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool2dOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker); +}; + +class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Pool3dOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker); +}; + template class PoolKernel : public framework::OpKernel { public: From a308ff29af714be50e321c65fdcd88729a505ebe Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 10 Oct 2017 10:25:01 -0700 Subject: [PATCH 43/61] make infershape of feedop and fetchop compatible with compile-time design --- paddle/framework/executor_test.cc | 22 ++++++---------------- paddle/operators/feed_op.cc | 13 ++----------- paddle/operators/feed_op.h | 3 ++- paddle/operators/fetch_op.cc | 20 
++------------------ paddle/operators/fetch_op.h | 8 +++++++- 5 files changed, 19 insertions(+), 47 deletions(-) diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 259205f7c1..0710eb5779 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -116,12 +116,8 @@ class ExecutorTesterRandom : public ::testing::Test { {{"dims", std::vector{input_dim, embed_dim}}}, init_root_block); AddOp("gaussian_random", {}, {{"Out", {"w2"}}}, {{"dims", std::vector{embed_dim, input_dim}}}, init_root_block); - AddOp("fetch", {{"Input", {"w1"}}}, {}, - {{"dims", std::vector{input_dim, embed_dim}}, {"col", 0}}, - init_root_block); - AddOp("fetch", {{"Input", {"w2"}}}, {}, - {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, - init_root_block); + AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"col", 0}}, init_root_block); + AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"col", 1}}, init_root_block); // flush init_program.Proto(); @@ -163,12 +159,8 @@ class ExecutorTesterRandom : public ::testing::Test { {"Grad", {"w2@GRAD"}}}, {{"ParamOut", {"w2"}}}, {}, root_block); - AddOp("fetch", {{"Input", {"w1"}}}, {}, - {{"dims", std::vector{input_dim, embed_dim}}, {"col", 0}}, - root_block); - AddOp("fetch", {{"Input", {"w2"}}}, {}, - {{"dims", std::vector{embed_dim, input_dim}}, {"col", 1}}, - root_block); + AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"col", 0}}, root_block); + AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"col", 1}}, root_block); // flush program.Proto(); @@ -197,10 +189,8 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test { root_block); AddOp("feed", {}, {{"Out", {"b"}}}, {{"dims", dim}, {"col", 1}}, root_block); - AddOp("fetch", {{"Input", {"a"}}}, {}, {{"dims", dim}, {"col", 0}}, - root_block); - AddOp("fetch", {{"Input", {"b"}}}, {}, {{"dims", dim}, {"col", 1}}, - root_block); + AddOp("fetch", {{"Input", {"a"}}}, {}, {{"col", 0}}, root_block); + AddOp("fetch", {{"Input", {"b"}}}, {}, {{"col", 1}}, 
root_block); // flush program.Proto(); diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 1d65c2bb46..fa325bb282 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -24,15 +24,6 @@ class FeedOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output should be not null."); - int col = ctx->Attrs().Get("col"); - framework::Variable* g_feed_variable = - framework::GetGlobalScope()->FindVar("feed_value"); - - const auto& tensors = - g_feed_variable->Get>(); - - PADDLE_ENFORCE_GT(tensors.size(), static_cast(col)); - auto& shape = ctx->Attrs().Get>("dims"); std::vector shape_int64(shape.size(), 0); std::transform(shape.begin(), shape.end(), shape_int64.begin(), @@ -43,7 +34,7 @@ class FeedOp : public framework::OperatorWithKernel { framework::DataType IndicateDataType( const framework::ExecutionContext& ctx) const override { - return static_cast(Attr("data_type")); + return static_cast(Attr("dataType")); } }; @@ -51,7 +42,7 @@ class FeedOpMaker : public framework::OpProtoAndCheckerMaker { public: FeedOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr("data_type", "output data type") + AddAttr("dataType", "output data type") .SetDefault(framework::DataType::FP32); AddAttr("col", "The col in global feed variable").SetDefault(0); AddAttr>("dims", "The dimension of feed tensor."); diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h index 96e3bf52bd..47344e309c 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -27,9 +27,10 @@ class FeedKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); framework::Variable* g_feed_variable = framework::GetGlobalScope()->FindVar("feed_value"); - int col = ctx.template Attr("col"); const auto& tensors = g_feed_variable->Get>(); + int col = 
ctx.template Attr("col"); + PADDLE_ENFORCE_GT(tensors.size(), static_cast(col)); out->CopyFrom(tensors[col], ctx.GetPlace()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 77e3450a73..90737c8c55 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -24,26 +24,11 @@ class FetchOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should be not null."); - int col = ctx->Attrs().Get("col"); - framework::Variable* g_fetch_variable = - framework::GetGlobalScope()->FindVar("fetch_value"); - - auto* tensors = - g_fetch_variable->GetMutable>(); - if (tensors->size() < static_cast(col + 1)) { - tensors->resize(col + 1); - } - - auto input_dim = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_GT(tensors->size(), col); - (*tensors)[col].Resize(input_dim); - - // TODO(qijun): need to handle LodTensor later } framework::DataType IndicateDataType( const framework::ExecutionContext& ctx) const override { - return static_cast(Attr("data_type")); + return static_cast(Attr("dataType")); } }; @@ -51,10 +36,9 @@ class FetchOpMaker : public framework::OpProtoAndCheckerMaker { public: FetchOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddAttr("data_type", "output data type") + AddAttr("dataType", "output data type") .SetDefault(framework::DataType::FP32); AddAttr("col", "The col in global fetch variable").SetDefault(0); - AddAttr>("dims", "The dimension of fetch tensor."); AddInput("Input", "The output of fetch op."); AddComment(R"DOC(Fetch data to global fetch variable)DOC"); } diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h index fd98552055..6fee8b0589 100644 --- a/paddle/operators/fetch_op.h +++ b/paddle/operators/fetch_op.h @@ -24,13 +24,19 @@ class FetchKernel : public framework::OpKernel { public: void 
Compute(const framework::ExecutionContext& ctx) const override { const framework::Tensor* input = ctx.Input("Input"); - int col = ctx.template Attr("col"); framework::Variable* g_fetch_variable = framework::GetGlobalScope()->FindVar("fetch_value"); auto* tensors = g_fetch_variable->GetMutable>(); + int col = ctx.template Attr("col"); + if (tensors->size() < static_cast(col + 1)) { + tensors->resize(col + 1); + } + PADDLE_ENFORCE_GT(tensors->size(), static_cast(col)); + (*tensors)[col].Resize(input->dims()); (*tensors)[col].mutable_data(platform::CPUPlace()); (*tensors)[col].CopyFrom(*input, platform::CPUPlace()); + // TODO(qijun): need to handle LodTensor later } }; From 3f9e247a7358ae7824c3ce63a7231b54b31944a3 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 18:53:54 +0000 Subject: [PATCH 44/61] set variable support dim --- paddle/framework/executor.cc | 3 +-- paddle/framework/executor_test.cc | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index ee6243a9bf..f4cc37cfa6 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -74,8 +74,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { std::vector should_run = Prune(pdesc, block_id); PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size()); for (size_t i = 0; i < should_run.size(); ++i) { - // if (should_run[i]) { - if (true) { + if (should_run[i]) { for (auto& var : block.ops(i).outputs()) { for (auto& argu : var.arguments()) { if (local_scope.FindVar(argu) == nullptr) { diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 0710eb5779..ce8b599e0e 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -65,15 +65,15 @@ void AddOp(const std::string& type, const VariableNameMap& inputs, // Tensors in feed value variable will only be in CPUPlace // So we can memcpy the data 
from vector to feed_value template -void SetFeedVariable(const std::vector>& inputs) { +void SetFeedVariable(const std::vector>& inputs, + const std::vector>& dims) { Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value"); auto& feed_inputs = *(g_feed_value->GetMutable>()); size_t size = inputs.size(); feed_inputs.resize(size); for (size_t i = 0; i < size; i++) { - T* dst = feed_inputs[i].mutable_data( - make_ddim({static_cast(inputs[i].size())}), CPUPlace()); + T* dst = feed_inputs[i].mutable_data(make_ddim(dims[i]), CPUPlace()); memcpy(dst, inputs[i].data(), inputs[i].size() * sizeof(T)); } } @@ -103,7 +103,7 @@ std::vector> GetFetchVariable() { class ExecutorTesterRandom : public ::testing::Test { public: virtual void SetUp() override { - int input_dim = 5, batch_size = 2, embed_dim = 5; + int input_dim = 3, batch_size = 2, embed_dim = 5; auto temp_init_root_block = init_pdesc_.add_blocks(); temp_init_root_block->set_idx(0); @@ -130,9 +130,16 @@ class ExecutorTesterRandom : public ::testing::Test { paddle::framework::ProgramDescBind::Instance(&pdesc_); paddle::framework::BlockDescBind* root_block = program.Block(0); + // feed data + inputs_.push_back({1.0, 2.0, 3.0, 4.0, 5.0, 6.0}); + dims_.push_back({batch_size, input_dim}); + AddOp("feed", {}, {{"Out", {"a"}}}, + {{"dims", std::vector{batch_size, input_dim}}, {"col", 0}}, + root_block); + // forward - AddOp("gaussian_random", {}, {{"Out", {"a"}}}, - {{"dims", std::vector{batch_size, input_dim}}}, root_block); + // AddOp("gaussian_random", {}, {{"Out", {"a"}}}, + // {{"dims", std::vector{batch_size, input_dim}}}, root_block); AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {}, root_block); AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {}, @@ -161,6 +168,7 @@ class ExecutorTesterRandom : public ::testing::Test { AddOp("fetch", {{"Input", {"w1"}}}, {}, {{"col", 0}}, root_block); AddOp("fetch", {{"Input", {"w2"}}}, {}, {{"col", 1}}, root_block); + AddOp("fetch", 
{{"Input", {"l2_distance"}}}, {}, {{"col", 0}}, root_block); // flush program.Proto(); @@ -169,6 +177,8 @@ class ExecutorTesterRandom : public ::testing::Test { protected: ProgramDesc init_pdesc_; ProgramDesc pdesc_; + std::vector> inputs_; + std::vector> dims_; }; class ExecutorTesterFeedAndFetch : public ::testing::Test { @@ -199,11 +209,14 @@ class ExecutorTesterFeedAndFetch : public ::testing::Test { std::vector vec2 = {4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; inputs_.push_back(vec1); inputs_.push_back(vec2); + dims_.push_back({static_cast(vec1.size())}); + dims_.push_back({static_cast(vec2.size())}); } protected: ProgramDesc pdesc_; std::vector> inputs_; + std::vector> dims_; }; #ifndef PADDLE_WITH_CUDA @@ -239,7 +252,7 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) { std::unique_ptr executor(new Executor(places)); for (int batch_id = 0; batch_id < 3; batch_id++) { - SetFeedVariable(inputs_); + SetFeedVariable(inputs_, dims_); executor->Run(pdesc_, GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); @@ -270,6 +283,7 @@ TEST_F(ExecutorTesterRandom, GPU) { executor->Run(init_pdesc_, GetGlobalScope(), 0); for (int batch_id = 0; batch_id < 3; batch_id++) { + SetFeedVariable(inputs_, dims_); executor->Run(pdesc_, GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); } @@ -291,7 +305,7 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) { std::unique_ptr executor(new Executor(places)); for (int batch_id = 0; batch_id < 3; batch_id++) { - SetFeedVariable(inputs_); + SetFeedVariable(inputs_, dims_); executor->Run(pdesc_, GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); From 293a7d1e75d14a744852523383bdbef1663887be Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 18:55:16 +0000 Subject: [PATCH 45/61] add feed infershape todo --- paddle/operators/feed_op.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/operators/feed_op.h 
b/paddle/operators/feed_op.h index 47344e309c..e406d22209 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -31,6 +31,9 @@ class FeedKernel : public framework::OpKernel { g_feed_variable->Get>(); int col = ctx.template Attr("col"); PADDLE_ENFORCE_GT(tensors.size(), static_cast(col)); + // TODO(qijun): + // check tensors[col].dims() with attribute, + // except the first dimenson. out->CopyFrom(tensors[col], ctx.GetPlace()); } }; From 062ff4d77b61fc72b0654064911b193714cfb18f Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 19:07:21 +0000 Subject: [PATCH 46/61] clean up --- paddle/framework/executor.cc | 14 +------------- paddle/framework/executor_test.cc | 8 +++----- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index f4cc37cfa6..def1d1fd06 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -72,7 +72,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { Scope& local_scope = scope->NewScope(); std::vector should_run = Prune(pdesc, block_id); - PADDLE_ENFORCE_EQ(should_run.size(), block.ops_size()); + PADDLE_ENFORCE_EQ(should_run.size(), static_cast(block.ops_size())); for (size_t i = 0; i < should_run.size(); ++i) { if (should_run[i]) { for (auto& var : block.ops(i).outputs()) { @@ -82,17 +82,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { } } } - LOG(INFO) << block.ops(i).type(); - if (block.ops(i).type() == "sum") { - LOG(INFO) << "Here"; - for (auto& var : block.ops(i).inputs()) { - for (auto& argu : var.arguments()) { - LOG(INFO) << var.parameter() << " " << argu; - } - } - } auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); - LOG(INFO) << op->DebugString(); op->Run(local_scope, *device); } } @@ -152,10 +142,8 @@ std::vector Executor::Prune(const ProgramDesc& pdesc, int block_id) { } } - LOG(INFO) << "1 " << op_desc.type(); 
should_run.push_back(true); } else { - LOG(INFO) << "0 " << op_desc.type(); should_run.push_back(false); } } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index ce8b599e0e..5ad5b98e7b 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -131,15 +131,13 @@ class ExecutorTesterRandom : public ::testing::Test { paddle::framework::BlockDescBind* root_block = program.Block(0); // feed data - inputs_.push_back({1.0, 2.0, 3.0, 4.0, 5.0, 6.0}); + inputs_.push_back({1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); dims_.push_back({batch_size, input_dim}); AddOp("feed", {}, {{"Out", {"a"}}}, {{"dims", std::vector{batch_size, input_dim}}, {"col", 0}}, root_block); // forward - // AddOp("gaussian_random", {}, {{"Out", {"a"}}}, - // {{"dims", std::vector{batch_size, input_dim}}}, root_block); AddOp("mul", {{"X", {"a"}}, {"Y", {"w1"}}}, {{"Out", {"b"}}}, {}, root_block); AddOp("mul", {{"X", {"b"}}, {"Y", {"w2"}}}, {{"Out", {"a_out"}}}, {}, @@ -156,7 +154,8 @@ class ExecutorTesterRandom : public ::testing::Test { // update AddOp("fill_constant", {}, {{"Out", {"learning_rate"}}}, - {{"shape", std::vector{1}}, {"value", float(1.0)}}, root_block); + {{"shape", std::vector{1}}, {"value", float(0.001)}}, + root_block); AddOp("sgd", {{"Param", {"w1"}}, {"LearningRate", {"learning_rate"}}, {"Grad", {"w1@GRAD"}}}, @@ -285,7 +284,6 @@ TEST_F(ExecutorTesterRandom, GPU) { for (int batch_id = 0; batch_id < 3; batch_id++) { SetFeedVariable(inputs_, dims_); executor->Run(pdesc_, GetGlobalScope(), 0); - std::vector> result = GetFetchVariable(); } } From 2e7cd201a4337f49ce07de8cde11c3b8dd90f9ab Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 21:30:03 +0000 Subject: [PATCH 47/61] remove log in backward --- paddle/framework/backward.cc | 1 - paddle/framework/executor_test.cc | 1 - 2 files changed, 2 deletions(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 774d8e4918..0a4688db9c 100644 --- 
a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -378,7 +378,6 @@ std::vector> MakeBlockBackward( backward_descs[dup_op[i]]->Rename(out_name, new_name); sum_op_inputs.emplace_back(new_name); } - LOG(INFO) << "sum_op_inputs size " << sum_op_inputs.size(); std::unique_ptr sum_op(new OpDescBind( "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 5ad5b98e7b..1cd7270240 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -35,7 +35,6 @@ USE_OP(squared_l2_distance); USE_OP(fill_constant); USE_OP(sgd); -using std::string; using namespace paddle::platform; using namespace paddle::framework; From 436ea50d5fc8867848892fc53b7f82aa59ae3b41 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 10 Oct 2017 14:31:47 -0700 Subject: [PATCH 48/61] follow comments --- paddle/framework/executor.cc | 4 +++- paddle/framework/executor_test.cc | 17 +++++++++-------- paddle/framework/scope.cc | 4 ++-- paddle/framework/scope.h | 2 +- paddle/operators/feed_op.h | 2 +- paddle/operators/fetch_op.h | 2 +- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index def1d1fd06..1db5c878d6 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -44,7 +44,9 @@ Executor::Executor(const std::vector& places) { device_contexts_[i] = new platform::CUDADeviceContext( boost::get(places[i])); #else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW( + "'GPUPlace' is not supported, Please re-compile with WITH_GPU " + "option"); #endif } } diff --git a/paddle/framework/executor_test.cc b/paddle/framework/executor_test.cc index 5ad5b98e7b..f36284b528 100644 --- a/paddle/framework/executor_test.cc +++ b/paddle/framework/executor_test.cc @@ -67,7 +67,7 @@ void AddOp(const 
std::string& type, const VariableNameMap& inputs, template void SetFeedVariable(const std::vector>& inputs, const std::vector>& dims) { - Variable* g_feed_value = GetGlobalScope()->FindVar("feed_value"); + Variable* g_feed_value = GetGlobalScope().FindVar("feed_value"); auto& feed_inputs = *(g_feed_value->GetMutable>()); size_t size = inputs.size(); @@ -82,7 +82,7 @@ void SetFeedVariable(const std::vector>& inputs, // So we can memcpy the data from fetch_value to vector template std::vector> GetFetchVariable() { - Variable* g_fetch_value = GetGlobalScope()->FindVar("fetch_value"); + Variable* g_fetch_value = GetGlobalScope().FindVar("fetch_value"); auto& fetch_outputs = *(g_fetch_value->GetMutable>()); @@ -232,8 +232,9 @@ TEST_F(ExecutorTesterRandom, CPU) { std::unique_ptr executor(new Executor(places)); - executor->Run(init_pdesc_, GetGlobalScope(), 0); - executor->Run(pdesc_, GetGlobalScope(), 0); + executor->Run(init_pdesc_, &GetGlobalScope(), 0); + SetFeedVariable(inputs_, dims_); + executor->Run(pdesc_, &GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); } @@ -252,7 +253,7 @@ TEST_F(ExecutorTesterFeedAndFetch, CPU) { for (int batch_id = 0; batch_id < 3; batch_id++) { SetFeedVariable(inputs_, dims_); - executor->Run(pdesc_, GetGlobalScope(), 0); + executor->Run(pdesc_, &GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); for (size_t i = 0; i < result.size(); ++i) { @@ -280,10 +281,10 @@ TEST_F(ExecutorTesterRandom, GPU) { std::unique_ptr executor(new Executor(places)); - executor->Run(init_pdesc_, GetGlobalScope(), 0); + executor->Run(init_pdesc_, &GetGlobalScope(), 0); for (int batch_id = 0; batch_id < 3; batch_id++) { SetFeedVariable(inputs_, dims_); - executor->Run(pdesc_, GetGlobalScope(), 0); + executor->Run(pdesc_, &GetGlobalScope(), 0); } } @@ -304,7 +305,7 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) { for (int batch_id = 0; batch_id < 3; batch_id++) { SetFeedVariable(inputs_, 
dims_); - executor->Run(pdesc_, GetGlobalScope(), 0); + executor->Run(pdesc_, &GetGlobalScope(), 0); std::vector> result = GetFetchVariable(); PADDLE_ENFORCE_EQ(result.size(), inputs_.size()); for (size_t i = 0; i < result.size(); ++i) { diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index c9e53a0d85..5821bac928 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -67,14 +67,14 @@ void Scope::DropKids() { std::once_flag feed_variable_flag; -framework::Scope* GetGlobalScope() { +framework::Scope& GetGlobalScope() { static std::unique_ptr g_scope{nullptr}; std::call_once(feed_variable_flag, [&]() { g_scope.reset(new framework::Scope()); g_scope->NewVar("feed_value"); g_scope->NewVar("fetch_value"); }); - return g_scope.get(); + return *(g_scope.get()); } } // namespace framework diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index 319d291efe..a8cfb107c2 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -73,7 +73,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); }; -framework::Scope* GetGlobalScope(); +framework::Scope& GetGlobalScope(); } // namespace framework } // namespace paddle diff --git a/paddle/operators/feed_op.h b/paddle/operators/feed_op.h index e406d22209..9d8158299f 100644 --- a/paddle/operators/feed_op.h +++ b/paddle/operators/feed_op.h @@ -26,7 +26,7 @@ class FeedKernel : public framework::OpKernel { framework::Tensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); framework::Variable* g_feed_variable = - framework::GetGlobalScope()->FindVar("feed_value"); + framework::GetGlobalScope().FindVar("feed_value"); const auto& tensors = g_feed_variable->Get>(); int col = ctx.template Attr("col"); diff --git a/paddle/operators/fetch_op.h b/paddle/operators/fetch_op.h index 6fee8b0589..eb9c3a7b59 100644 --- a/paddle/operators/fetch_op.h +++ b/paddle/operators/fetch_op.h @@ -25,7 +25,7 @@ class FetchKernel : public framework::OpKernel { void Compute(const 
framework::ExecutionContext& ctx) const override { const framework::Tensor* input = ctx.Input("Input"); framework::Variable* g_fetch_variable = - framework::GetGlobalScope()->FindVar("fetch_value"); + framework::GetGlobalScope().FindVar("fetch_value"); auto* tensors = g_fetch_variable->GetMutable>(); int col = ctx.template Attr("col"); From a528a9717ec5880f271b9d216cb5532cee9d4504 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 21:32:03 +0000 Subject: [PATCH 49/61] remove prune as member function to function --- paddle/framework/executor.cc | 120 +++++++++++++++++------------------ paddle/framework/executor.h | 23 ++++--- 2 files changed, 71 insertions(+), 72 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index def1d1fd06..3c35102ff9 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -32,66 +32,7 @@ namespace framework { const std::string kFeedOpType = "feed"; const std::string kFetchOpType = "fetch"; -Executor::Executor(const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - device_contexts_.resize(places.size()); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { - device_contexts_[i] = new platform::CPUDeviceContext( - boost::get(places[i])); - } else if (platform::is_gpu_place(places[i])) { -#ifdef PADDLE_WITH_CUDA - device_contexts_[i] = new platform::CUDADeviceContext( - boost::get(places[i])); -#else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); -#endif - } - } -} - -Executor::~Executor() { - for (auto& device_context : device_contexts_) { - delete device_context; - } -} - -void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { - // TODO(tonyyang-svail): - // - only runs on the first device (i.e. 
no interdevice communication) - // - will change to use multiple blocks for RNN op and Cond Op - PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id); - auto& block = pdesc.blocks(block_id); - auto& device = device_contexts_[0]; - - // Instantiate all the vars in the global scope - for (auto& var : block.vars()) { - scope->NewVar(var.name()); - } - - Scope& local_scope = scope->NewScope(); - - std::vector should_run = Prune(pdesc, block_id); - PADDLE_ENFORCE_EQ(should_run.size(), static_cast(block.ops_size())); - for (size_t i = 0; i < should_run.size(); ++i) { - if (should_run[i]) { - for (auto& var : block.ops(i).outputs()) { - for (auto& argu : var.arguments()) { - if (local_scope.FindVar(argu) == nullptr) { - local_scope.NewVar(argu); - } - } - } - auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); - op->Run(local_scope, *device); - } - } - - // TODO(tonyyang-svail): - // - Destroy local_scope -} - -std::vector Executor::Prune(const ProgramDesc& pdesc, int block_id) { +std::vector Prune(const ProgramDesc& pdesc, int block_id) { // TODO(tonyyang-svail): // - will change to use multiple blocks for RNN op and Cond Op @@ -159,5 +100,64 @@ std::vector Executor::Prune(const ProgramDesc& pdesc, int block_id) { return should_run; } +Executor::Executor(const std::vector& places) { + PADDLE_ENFORCE_GT(places.size(), 0); + device_contexts_.resize(places.size()); + for (size_t i = 0; i < places.size(); i++) { + if (platform::is_cpu_place(places[i])) { + device_contexts_[i] = new platform::CPUDeviceContext( + boost::get(places[i])); + } else if (platform::is_gpu_place(places[i])) { +#ifdef PADDLE_WITH_CUDA + device_contexts_[i] = new platform::CUDADeviceContext( + boost::get(places[i])); +#else + PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); +#endif + } + } +} + +Executor::~Executor() { + for (auto& device_context : device_contexts_) { + delete device_context; + } +} + +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { + 
// TODO(tonyyang-svail): + // - only runs on the first device (i.e. no interdevice communication) + // - will change to use multiple blocks for RNN op and Cond Op + PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id); + auto& block = pdesc.blocks(block_id); + auto& device = device_contexts_[0]; + + // Instantiate all the vars in the global scope + for (auto& var : block.vars()) { + scope->NewVar(var.name()); + } + + Scope& local_scope = scope->NewScope(); + + std::vector should_run = Prune(pdesc, block_id); + PADDLE_ENFORCE_EQ(should_run.size(), static_cast(block.ops_size())); + for (size_t i = 0; i < should_run.size(); ++i) { + if (should_run[i]) { + for (auto& var : block.ops(i).outputs()) { + for (auto& argu : var.arguments()) { + if (local_scope.FindVar(argu) == nullptr) { + local_scope.NewVar(argu); + } + } + } + auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); + op->Run(local_scope, *device); + } + } + + // TODO(tonyyang-svail): + // - Destroy local_scope +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 7fac4f4f46..4e3bc2c0a5 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -36,21 +36,20 @@ class Executor { */ void Run(const ProgramDesc&, Scope*, int); - protected: - /* @Brief - * Pruning the graph - * - * @param - * ProgramDesc - * - * @return - * vector Same size as ops. Indicates whether an op should be run. - */ - std::vector Prune(const ProgramDesc& pdesc, int block_id); - private: std::vector device_contexts_; }; +/* @Brief + * Pruning the graph + * + * @param + * ProgramDesc + * + * @return + * vector Same size as ops. Indicates whether an op should be run. 
+ */ +std::vector Prune(const ProgramDesc& pdesc, int block_id); + } // namespace framework } // namespace paddle From 434949ca2d23a2fec5c3b4ab8e6bcb0ea18921fc Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 10 Oct 2017 21:51:43 +0000 Subject: [PATCH 50/61] clean up for merge --- paddle/framework/executor.cc | 61 +----------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 69c21d7457..886e9ab33e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -93,7 +93,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { // - Destroy local_scope } -std::vector Executor::Prune(const ProgramDesc& pdesc, int block_id) { +std::vector Prune(const ProgramDesc& pdesc, int block_id) { // TODO(tonyyang-svail): // - will change to use multiple blocks for RNN op and Cond Op @@ -161,64 +161,5 @@ std::vector Executor::Prune(const ProgramDesc& pdesc, int block_id) { return should_run; } -Executor::Executor(const std::vector& places) { - PADDLE_ENFORCE_GT(places.size(), 0); - device_contexts_.resize(places.size()); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { - device_contexts_[i] = new platform::CPUDeviceContext( - boost::get(places[i])); - } else if (platform::is_gpu_place(places[i])) { -#ifdef PADDLE_WITH_CUDA - device_contexts_[i] = new platform::CUDADeviceContext( - boost::get(places[i])); -#else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); -#endif - } - } -} - -Executor::~Executor() { - for (auto& device_context : device_contexts_) { - delete device_context; - } -} - -void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) { - // TODO(tonyyang-svail): - // - only runs on the first device (i.e. 
no interdevice communication) - // - will change to use multiple blocks for RNN op and Cond Op - PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id); - auto& block = pdesc.blocks(block_id); - auto& device = device_contexts_[0]; - - // Instantiate all the vars in the global scope - for (auto& var : block.vars()) { - scope->NewVar(var.name()); - } - - Scope& local_scope = scope->NewScope(); - - std::vector should_run = Prune(pdesc, block_id); - PADDLE_ENFORCE_EQ(should_run.size(), static_cast(block.ops_size())); - for (size_t i = 0; i < should_run.size(); ++i) { - if (should_run[i]) { - for (auto& var : block.ops(i).outputs()) { - for (auto& argu : var.arguments()) { - if (local_scope.FindVar(argu) == nullptr) { - local_scope.NewVar(argu); - } - } - } - auto op = paddle::framework::OpRegistry::CreateOp(block.ops(i)); - op->Run(local_scope, *device); - } - } - - // TODO(tonyyang-svail): - // - Destroy local_scope -} - } // namespace framework } // namespace paddle From 72d3d814b5a62617d41e49cd2c6e662ad613ad78 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Oct 2017 09:32:29 +0800 Subject: [PATCH 51/61] fix math/CMakeLists.txt --- paddle/operators/CMakeLists.txt | 6 +++++- paddle/operators/math/CMakeLists.txt | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index d132c1813e..89b1895a3d 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -112,7 +112,9 @@ set(DEPS_OPS cond_op cross_entropy_op softmax_with_cross_entropy_op - sum_op) + sum_op + pool_op + pool_with_index_op) op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc @@ -121,6 +123,8 @@ op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(sum_op DEPS net_op) +op_library(pool_op DEPS pooling) 
+op_library(pool_with_index_op DEPS pooling) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index a0ceb029e3..6e2611af7b 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -1,13 +1,15 @@ if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator) + nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) + nv_library(pooling SRCS pooling.cc pooling.cu DEPS operator) else() - cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator) + cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) + cc_library(pooling SRCS pooling.cc DEPS operator) endif() cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) From c85d777f879e128a3a9b00ddfc243879a747f5da Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 10 Oct 2017 22:35:55 +0800 Subject: [PATCH 52/61] follow comments --- paddle/operators/math/CMakeLists.txt | 8 ++++-- paddle/operators/math/vol2col.cc | 2 +- paddle/operators/math/vol2col_test.cc | 40 +++++++-------------------- 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index d6e8373210..575e89eed8 100644 --- a/paddle/operators/math/CMakeLists.txt +++ 
b/paddle/operators/math/CMakeLists.txt @@ -1,15 +1,17 @@ if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu vol2col.cc vol2col.cu pooling.cc pooling.cu DEPS cblas device_context operator) + nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) + nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS cblas device_context operator) else() - cc_library(math_function SRCS math_function.cc im2col.cc vol2col.cc pooling.cc DEPS cblas device_context operator) + cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) + cc_library(vol2col SRCS vol2col.cc DEPS cblas device_context operator) endif() cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) -cc_test(vol2col_test SRCS vol2col_test.cc DEPS math_function tensor) +cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc index 5bad2e8073..e9718a0473 100644 --- a/paddle/operators/math/vol2col.cc +++ b/paddle/operators/math/vol2col.cc @@ -67,7 +67,7 @@ class Vol2ColFunctor { ((c * output_depth + d) * output_height + h) * output_width + w; if (h_pad < 0 || h_pad >= input_height || w_pad < 0 || w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) { - col_data[col_idx] = T(0); + col_data[col_idx] = static_cast(0); } else { int vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * diff --git 
a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 107a94511f..e3c599da87 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -30,12 +30,12 @@ void testVol2col() { context = new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); } else { -#ifndef PADDLE_ONLY_CPU +#ifdef PADDLE_WITH_CUDA context = new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); #else PADDLE_THROW("no GPU support"); -#endif // PADDLE_ONLY_CPU +#endif // PADDLE_WITH_CUDA } /** @@ -89,6 +89,7 @@ void testVol2col() { vol2col(*context, input, output_cfo, stride, stride, stride, padding, padding, padding); + float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); @@ -97,24 +98,12 @@ void testVol2col() { out_cfo_ptr = output_tmp.data(); } - EXPECT_EQ(out_cfo_ptr[0], 0); - EXPECT_EQ(out_cfo_ptr[1], 1); - EXPECT_EQ(out_cfo_ptr[2], 1); - EXPECT_EQ(out_cfo_ptr[3], 2); - EXPECT_EQ(out_cfo_ptr[4], 3); - EXPECT_EQ(out_cfo_ptr[5], 4); - EXPECT_EQ(out_cfo_ptr[6], 4); - EXPECT_EQ(out_cfo_ptr[7], 5); - EXPECT_EQ(out_cfo_ptr[8], 6); - EXPECT_EQ(out_cfo_ptr[9], 7); - EXPECT_EQ(out_cfo_ptr[10], 7); - EXPECT_EQ(out_cfo_ptr[11], 8); - EXPECT_EQ(out_cfo_ptr[12], 9); - EXPECT_EQ(out_cfo_ptr[13], 10); - EXPECT_EQ(out_cfo_ptr[14], 10); - EXPECT_EQ(out_cfo_ptr[15], 11); + for (int i = 0; i < 16; ++i) { + EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]); + } // Col2Vol test + float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11}; memset(input_ptr, 0, 12 * sizeof(float)); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; @@ -134,18 +123,9 @@ void testVol2col() { in_cfo_ptr = input_tmp.data(); } - EXPECT_EQ(in_cfo_ptr[0], 0); - EXPECT_EQ(in_cfo_ptr[1], 2); - EXPECT_EQ(in_cfo_ptr[2], 2); - EXPECT_EQ(in_cfo_ptr[3], 3); - EXPECT_EQ(in_cfo_ptr[4], 8); - EXPECT_EQ(in_cfo_ptr[5], 5); - 
EXPECT_EQ(in_cfo_ptr[6], 6); - EXPECT_EQ(in_cfo_ptr[7], 14); - EXPECT_EQ(in_cfo_ptr[8], 8); - EXPECT_EQ(in_cfo_ptr[9], 9); - EXPECT_EQ(in_cfo_ptr[10], 20); - EXPECT_EQ(in_cfo_ptr[11], 11); + for (int i = 0; i < 12; ++i) { + EXPECT_EQ(in_cfo_ptr[i], col_2_vol[i]); + } } TEST(math, vol2col) { From 1d41a6d45a917010f21088d1917040565649f2a5 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Oct 2017 10:11:45 +0800 Subject: [PATCH 53/61] update paddle/operators/math/CMakeLists.txt --- paddle/operators/math/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 575e89eed8..d32924db85 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -3,13 +3,13 @@ if(WITH_GPU) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS cblas device_context operator) + nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context operator) else() cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) - cc_library(vol2col SRCS vol2col.cc DEPS cblas device_context operator) + cc_library(vol2col SRCS vol2col.cc DEPS device_context operator) endif() From 1397e17f6b1fe1088af6ab3117eb7b6c5f4adea3 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Tue, 10 Oct 2017 19:50:34 -0700 Subject: [PATCH 54/61] Implemented the hardShrink activation (#4653) * Implemented the hardShrink activation * Fixing the unit test --- paddle/operators/activation_op.cc | 21 ++++++++++ 
paddle/operators/activation_op.h | 38 +++++++++++++++++-- .../v2/framework/tests/test_activation_op.py | 20 ++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index a6bb738af3..61a201b6cd 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -137,6 +137,24 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { } }; +template +class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HardShrinkOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of HardShrink operator"); + AddOutput("Y", "Output of HardShrink operator"); + AddComment( + "HardShrink activation operator, " + "hard_shrink(x) = x if x > lambda" + "hard_shrink(x) = x if x < -lambda" + "hard_shrink(x) = 0 otherwise"); + AddAttr("threshold", "The value of threshold for HardShrink") + .SetDefault(static_cast(0.5)); + } +}; + class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { public: SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) @@ -357,6 +375,9 @@ REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad, REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad, ops::ActivationOpGrad); +REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker, + hard_shrink_grad, ops::ActivationOpGrad); + #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ act_type, \ diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 70d5a62052..29f159bbae 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -199,6 +199,39 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { } }; +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct 
HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Y y) const { + auto temp1 = (x < (threshold * -1)).template cast().eval(); + auto temp2 = (x > threshold).template cast().eval(); + y.device(d) = x * (temp1 + temp2); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Y y, dY dy, dX dx) const { + auto temp1 = (x < (threshold * -1)).template cast().eval(); + auto temp2 = (x > threshold).template cast().eval(); + dx.device(d) = dy * (temp1 + temp2).template cast(); + } +}; + // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < lambda; 0 // otherwise template @@ -351,8 +384,6 @@ template struct Relu6Functor : public BaseActivationFunctor { float threshold; - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. 
typename BaseActivationFunctor::AttrPair GetAttrs() { return {{"threshold", &threshold}}; } @@ -555,4 +586,5 @@ struct STanhGradFunctor : public BaseActivationFunctor { __macro(relu6, Relu6Functor, Relu6GradFunctor); \ __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(elu, ELUFunctor, ELUGradFunctor) + __macro(elu, ELUFunctor, ELUGradFunctor); \ + __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor) diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index 9157e00f6e..52e027bd54 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -78,6 +78,26 @@ class TestTanhShrink(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.008) +class TestHardShrink(OpTest): + def setUp(self): + self.op_type = "hard_shrink" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + threshold = 0.5 + + self.inputs = {'X': x} + self.attrs = {'lambda': threshold} + + t = np.copy(x) + t[(t >= -threshold) & (t <= threshold)] = 0 + self.outputs = {'Y': t} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.005) + + class TestSoftShrink(OpTest): def setUp(self): self.op_type = "softshrink" From 696874ac6ee1b2b284d9817988aa4c99f74c0c76 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 10 Oct 2017 19:54:01 -0700 Subject: [PATCH 55/61] Optimizer Design (#4656) * init optimizer design * fix index * optimize the interface * add a link to python_api.md * optimize the code of Optimizer --- doc/design/optimizer.md | 105 +++++++++++++++++++++++++++++++++++++++ doc/design/python_api.md | 4 ++ 2 files changed, 109 insertions(+) create mode 100644 doc/design/optimizer.md diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md new file mode 100644 index 
0000000000..17440fae50 --- /dev/null +++ b/doc/design/optimizer.md @@ -0,0 +1,105 @@ +## Optimizer Design + +### The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which updates model parameters to optimize the cost(s). + +These kinds of work rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could describe only the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass. + + +### High-level Python API to describe the training process + +1. Users write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some arguments. + + ```python + optimizer = AdagradOptimizer(learning_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. + + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. 
The return value of `minimize()` is a list of optimization operators that will be run by the session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) + ``` + +#### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_backward_pass(self, loss, parameter_list=None): + """ + Create and add gradient operators in BlockDesc to compute gradients of `loss` + for parameters in parameter_list. + + Args: + loss: a variable generated by the cost function. + parameter_list: parameters whose gradients need to be computed and which are updated to optimize the loss. + + Returns: + a list of (parameter, gradient) pairs. + """ + return None + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators that apply gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pairs to update. + + Returns: + optimization_op_list: a list of optimization operators that will update parameters using gradients. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `create_backward_pass()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. 
diff --git a/doc/design/python_api.md b/doc/design/python_api.md index c4665e44fc..56ae1d925a 100644 --- a/doc/design/python_api.md +++ b/doc/design/python_api.md @@ -214,3 +214,7 @@ def fc_layer(input, size, ...): out.writer = op return out ``` + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) From 23407e7a649d07d0b92219d926179f1dcf6f9326 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 10 Oct 2017 19:57:44 -0700 Subject: [PATCH 56/61] Design doc of SelectedRows (#4652) * Design doc of SelectedRows * Follow comments * Update protobuf message * Follow comments, seperate LoDTensorDesc and SelectedRows Desc --- doc/design/selected_rows.md | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 doc/design/selected_rows.md diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md new file mode 100644 index 0000000000..9e6f3b20cb --- /dev/null +++ b/doc/design/selected_rows.md @@ -0,0 +1,74 @@ +# Design Doc: Selected Rows + +`SelectedRows` is a kind of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in that tensor. It is straightforward to represent the sparse tensor by the following sparse tensor data structure: + +```cpp +class SelectedRows { + private: + vector rows_; + Tensor value_; + int height_; +}; +``` + +The field `height_` shows the first dimension of `SelectedRows`. The `rows` are the indices of which rows of `SelectedRows` are non-zeros. The `value_` field is an N-dim tensor and shape is `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. 
+ +Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. Then the `SelectedRows` representation would be: + +``` +x = SelectedRows { + rows = [73, 84], + value = [[1, 2], [3,4]] +} +``` + + +## SelectedRows in Protobuf + +`SelectedRows` is a kind of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described at compile time since the `rows_` and `value_` are related to training data. +So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. + +```proto +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LodTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2; +} + +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LodTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## InferShape for Selected Rows + +Just like `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. + +For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look like the following: + +```cpp +void TableLookupGrad::InferShape(context) { + ... + context.SetDataType("Embedding.Grad", kSelectedRows); +} +``` + + +## Sparse Operators + +Several operators need to be written to support `SelectedRows`. They are: + +1. Operators which generate `SelectedRows` gradients, e.g., the gradient of `TableLookupOp`. +2. Optimize operators which support `SelectedRows` gradients, e.g., 
`SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. From f5ac335046feb81529e85cd0c386379746771157 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Oct 2017 11:02:26 +0800 Subject: [PATCH 57/61] follow comments --- paddle/operators/math/CMakeLists.txt | 5 ++- paddle/operators/math/vol2col_test.cc | 47 +++++++++++++-------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index d32924db85..2fd559e90a 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -3,14 +3,13 @@ if(WITH_GPU) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context operator) + nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) - cc_library(vol2col SRCS vol2col.cc DEPS device_context operator) - + cc_library(vol2col SRCS vol2col.cc DEPS device_context) endif() cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index e3c599da87..81225e9a98 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -18,10 +18,9 @@ limitations under the License. 
*/ template void testVol2col() { - paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; - paddle::framework::Tensor output_cfo; - paddle::framework::Tensor output_ocf; + paddle::framework::Tensor input_tmp; + paddle::framework::Tensor output; paddle::framework::Tensor output_tmp; auto* place = new Place(); @@ -44,14 +43,14 @@ void testVol2col() { * [6, 7, 8, * 9, 10, 11]] * - * output_cfo = [0, 1 - * 1, 2 - * 3, 4 - * 4, 5 - * 6, 7 - * 7, 8 - * 9, 10 - * 10, 11] + * output = [0, 1 + * 1, 2 + * 3, 4 + * 4, 5 + * 6, 7 + * 7, 8 + * 9, 10 + * 10, 11] * * col2vol = [[0, 2, 2, * 3, 8, 5] @@ -81,20 +80,20 @@ void testVol2col() { } else { input.CopyFrom(input_tmp, *place); } - output_cfo.mutable_data({1, filter_size, filter_size, filter_size, - output_depth, output_height, output_width}, - *place); + output.mutable_data({1, filter_size, filter_size, filter_size, + output_depth, output_height, output_width}, + *place); paddle::operators::math::Vol2ColFunctor vol2col; - vol2col(*context, input, output_cfo, stride, stride, stride, padding, padding, + vol2col(*context, input, output, stride, stride, stride, padding, padding, padding); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; float* out_cfo_ptr; if (paddle::platform::is_cpu_place(*place)) { - out_cfo_ptr = output_cfo.data(); + out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace()); + output_tmp.CopyFrom(output, paddle::platform::CPUPlace()); out_cfo_ptr = output_tmp.data(); } @@ -112,25 +111,25 @@ void testVol2col() { } paddle::operators::math::Col2VolFunctor col2vol; - col2vol(*context, input, output_cfo, stride, stride, stride, padding, padding, + col2vol(*context, input, output, stride, stride, stride, padding, padding, padding); - float* in_cfo_ptr; + float* in_ptr; if (paddle::platform::is_cpu_place(*place)) { - in_cfo_ptr = input.data(); + in_ptr = input.data(); } else { input_tmp.CopyFrom(input, paddle::platform::CPUPlace()); 
- in_cfo_ptr = input_tmp.data(); + in_ptr = input_tmp.data(); } for (int i = 0; i < 12; ++i) { - EXPECT_EQ(in_cfo_ptr[i], col_2_vol[i]); + EXPECT_EQ(in_ptr[i], col_2_vol[i]); } } TEST(math, vol2col) { testVol2col(); -#ifndef PADDLE_ONLY_CPU +#ifdef PADDLE_WITH_CUDA testVol2col(); -#endif +#endif // PADDLE_WITH_CUDA } From 7454ec0400429676edaa46578b024ff4bd4c028e Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 11 Oct 2017 03:48:43 +0000 Subject: [PATCH 58/61] Simplify backward when inserting a sum operator to accumulate all duplicated variables. --- paddle/framework/backward.cc | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 0a4688db9c..063b108500 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -172,30 +172,14 @@ static std::unique_ptr BackwardRecursive( std::to_string(i)); net->ops_[op_offset]->Rename(name, dup_outputs.back()); } - // collect all the offset to append `add` op for each alias - // - // one variable is shared between multiple operators. 
- // insert add operator one by one, then add it to output - for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1; - ++output_idx) { - auto insert_add_x = dup_outputs[output_idx]; - auto insert_add_y = dup_outputs[output_idx + 1]; - auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx); - // first add op inserted - if (output_idx == dup_outputs.size() - 2) { - insert_add_out = name; - } - if (output_idx != 0) { - insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1); - } - insert_position.push_back( - {dup_op.back(), - OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}}, - {{"Out", {insert_add_out}}}, {})}); - } + // collect all the offset for each alias, + // insert a sum operator to add all aliases to output + insert_position.push_back( + {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, + {{"Out", {name}}}, {})}); } - // make sure the inserted `add` ops follow the BFS order. + // make sure the inserted `sum` ops follow the BFS order. 
insert_position.sort( [](const Pos& l, const Pos& r) { return l.first > r.first; }); From 9995aed114148ff96b8e06f1548cff0445fce628 Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 10 Oct 2017 21:14:48 -0700 Subject: [PATCH 59/61] Implementing Softplus operator (#4690) * implementing softplus * small fix * small fix * small fix * small fix --- paddle/operators/activation_op.cc | 14 ++++++++++ paddle/operators/activation_op.h | 28 +++++++++++++++++++ .../v2/framework/tests/test_activation_op.py | 15 ++++++++++ 3 files changed, 57 insertions(+) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 61a201b6cd..ced14a8923 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -206,6 +206,17 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker { } }; +class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SoftplusOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Softplus operator"); + AddOutput("Y", "Output of Softplus operator"); + AddComment("Softplus activation operator, softplus(x) = log(1 + exp(x))"); + } +}; + class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { public: SoftsignOpMaker(framework::OpProto *proto, @@ -351,6 +362,9 @@ REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, ops::ActivationOpGrad); +REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad, + ops::ActivationOpGrad); + REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, ops::ActivationOpGrad); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 29f159bbae..f88c9c48eb 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -407,6 +407,33 @@ struct 
Relu6GradFunctor : public BaseActivationFunctor { } }; +// softplus(x) = log(1 + exp(x)) +// When x is a very large positive number, exp(x) may explode to inf, +// Using trick below for numerical stability +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) +template +struct SoftplusFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Y y) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + y.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + } +}; + +// d(softplus(x))/dx = exp(x) / (1 + exp(x)) +// For numerical stability: +// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + +// exp(x - max(x, 0))) +template +struct SoftplusGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Y y, dY dy, dX dx) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + dx.device(d) = dy * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + } +}; + // softsign(x) = x / (1 + |x|) template struct SoftsignFunctor : public BaseActivationFunctor { @@ -582,6 +609,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(pow, PowFunctor, PowGradFunctor); \ __macro(stanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \ __macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \ __macro(relu6, Relu6Functor, Relu6GradFunctor); \ __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \ diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py index 52e027bd54..a28c4431e1 100644 --- a/python/paddle/v2/framework/tests/test_activation_op.py +++ b/python/paddle/v2/framework/tests/test_activation_op.py @@ -331,6 +331,21 @@ class TestSTanh(OpTest): self.check_grad(['X'], 'Y', 
max_relative_error=0.007) +class TestSoftplus(OpTest): + def setUp(self): + self.op_type = "softplus" + self.inputs = { + 'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") + } + self.outputs = {'Y': np.log(1 + np.exp(self.inputs['X']))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + class TestSoftsign(OpTest): def setUp(self): self.op_type = "softsign" From 82a2b1a92db573f0021d145a96f4bddbaf0606e8 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 11 Oct 2017 15:29:48 +0800 Subject: [PATCH 60/61] fix Compile error [fatal error: boost/range/adaptor/reversed.hpp No such file or directory] --- paddle/framework/executor.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 886e9ab33e..c388b2198e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -24,8 +24,6 @@ limitations under the License. */ #include "paddle/framework/op_registry.h" #include "paddle/framework/scope.h" -#include - namespace paddle { namespace framework { From 4849fba7e98044b4d2e951638562342da5d399fe Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 11 Oct 2017 15:36:23 +0800 Subject: [PATCH 61/61] follow comments --- paddle/operators/math/CMakeLists.txt | 4 +- paddle/operators/pool_op.cc | 41 +++++++++++++++----- paddle/operators/pool_with_index_op.cc | 52 +++++++++++++++++++------- 3 files changed, 72 insertions(+), 25 deletions(-) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index 2c1bc6d910..1a2f623ce7 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -3,14 +3,14 @@ if(WITH_GPU) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) - 
nv_library(pooling SRCS pooling.cc pooling.cu DEPS operator) + nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) - cc_library(pooling SRCS pooling.cc DEPS operator) + cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context) endif() diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index 25fd01844b..c6d9aae133 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -35,7 +35,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Pooling intput should be 4-D or 5-D"); + "Pooling intput should be 4-D or 5-D tensor."); if (ctx->Attrs().Get("globalPooling")) { ksize.resize(static_cast(in_x_dims.size()) - 2); @@ -70,11 +70,11 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "The input tensor of pooling operator. " + "(Tensor) The input tensor of pooling operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); AddOutput("Out", - "The output tensor of pooling operator." + "(Tensor) The output tensor of pooling operator." "The format of output tensor is also NCHW." "Where N is batch size, C is " "the number of channels, H and W is the height and " @@ -87,7 +87,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, AddAttr>( "ksize", - "The pooling size(height, width) of pooling operator." 
+ "The pooling window size(height, width) of pooling operator." "If globalPooling = true, ksize is ignored and need not be " "specified."); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -99,12 +99,12 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, "If globalPooling = true, ksize is ignored and need not be specified.") .SetDefault(false); AddAttr>("strides", - "Strides(height, width) of pooling operator." + "The strides(height, width) of pooling window." "Default {1,1}.") .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) AddAttr>("paddings", - "Paddings(height, width) of pooling operator." + "The zero padding(height, width) size on both sides" "Default {0,0}.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -116,6 +116,17 @@ Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the number of channels, H and W is the height and width of feature. Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + X shape: (N, C, H_in, W_in) + Output: + Out shape: (N, C, H_out, W_out) + Mask shape: (N, C, H_out, W_out) + where + H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1; )DOC"); } @@ -124,12 +135,12 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "The input tensor of pooling operator. " + "(Tensor) The input tensor of pooling operator. " "The format of input tensor is NCDHW. Where N is batch size, C is " "the number of channels, D, H and W is the depth, height and width of " "feature."); AddOutput("Out", - "The output tensor of pooling operator." 
+ "(Tensor) The output tensor of pooling operator." "The format of output tensor is also NCDHW." "Where N is batch size, C is " "the number of channels, D, H and W is the depth, height and " @@ -142,7 +153,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, AddAttr>( "ksize", - "The pooling size(depth, height, width) of pooling operator." + "The pooling window size(depth, height, width) of pooling operator." "If globalPooling = true, ksize is ignored and need not be " "specified."); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -172,6 +183,18 @@ Input(X) and output(Out) are in NCDHW format. Where N is batch size, C is the number of channels, D, H and W is the depth, height and width of feature. Parameters(ksize, strides, paddings) are three elements. These three elements represent depth, height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + X shape: (N, C, D_in, H_in, W_in) + Output: + Out shape: (N, C, D_out, H_out, W_out) + Mask shape: (N, C, D_out, H_out, W_out) + where + D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1; )DOC"); } } // namespace operators diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index ae6a81d871..005ee88693 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -43,7 +43,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Pooling intput should be 4-D or 5-D"); + "Pooling intput should be 4-D or 5-D tensor."); if (ctx->Attrs().Get("globalPooling")) { ksize.resize(static_cast(in_x_dims.size()) - 2); @@ -74,8 +74,8 @@ class MaxPoolWithIndexOpGrad : public 
framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); @@ -89,17 +89,17 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "The input tensor of pooling operator. " + "(Tensor) The input tensor of pooling operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of image."); AddOutput("Out", - "The output tensor of pooling operator." + "(Tensor) The output tensor of pooling operator." "The format of output tensor is also NCHW." "Where N is batch size, C is " "the number of channels, H and W is the height and " "width of image."); AddOutput("Mask", - "The Mask tensor of pooling operator." + "(Tensor) The Mask tensor of pooling operator." "The format of output tensor is also NCHW." "Where N is batch size, C is the number of channels, H and W " "is the height and width of image." @@ -107,7 +107,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>( "ksize", - "The pooling size(height, width) of pooling operator." + "The pooling window size(height, width) of pooling operator." "If globalPooling = true, ksize is ignored and need not be " "specified."); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) 
@@ -119,13 +119,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "If globalPooling = true, ksize is ignored and need not be specified.") .SetDefault(false); AddAttr>("strides", - "Strides(height, width) of pooling operator." + "The strides(height, width) of pooling window." "Default {1,1}.") .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) - AddAttr>("paddings", - "Paddings(height, width) of pooling operator." - "Default {0,0}.") + AddAttr>( + "paddings", + "The zero padding(height, width) size on both sides" + "Default {0,0}.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -136,6 +137,17 @@ output(Out, Mask) are in NCHW format. Where N is batch size, C is the number of channels, H and W is the height and width of feature. Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. + +Example: + Input: + X shape: (N, C, H_in, W_in) + Output: + Out shape: (N, C, H_out, W_out) + Mask shape: (N, C, H_out, W_out) + where + H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1; )DOC"); } }; @@ -147,18 +159,18 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", - "The input tensor of pooling operator. " + "(Tensor) The input tensor of pooling operator. " "The format of input tensor is NCDHW. Where N is batch size, C is " "the number of channels, D, H and W is the depth, height and width of " "image."); AddOutput("Out", - "The output tensor of pooling operator." + "(Tensor) The output tensor of pooling operator." "The format of output tensor is also NCDHW." 
"Where N is batch size, C is " "the number of channels, D, H and W is the depth, height and " "width of image."); AddOutput("Mask", - "The Mask tensor of pooling operator." + "(Tensor) The Mask tensor of pooling operator." "The format of output tensor is also NCDHW." "Where N is batch size, C is the number of channels, D, H and W " "is the depth, height and width of image." @@ -166,7 +178,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>( "ksize", - "The pooling size(depth, height, width) of pooling operator." + "The pooling window size(depth, height, width) of pooling operator." "If globalPooling = true, ksize is ignored and need not be " "specified."); // TODO(Chengduo): Add checker. (Currently, // TypedAttrChecker don't support vector type.) @@ -197,6 +209,18 @@ Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch size, C is the number of channels, D, H and W is the depth, height and width of feature. Parameters(ksize, strides, paddings) are three elements. These three elements represent depth, height and width, respectively. +The input(X) size and output(Out, Mask) size may be different. + +Example: + Input: + X shape: (N, C, D_in, H_in, W_in) + Output: + Out shape: (N, C, D_out, H_out, W_out) + Mask shape: (N, C, D_out, H_out, W_out) + where + D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1; )DOC"); } };