Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_transpose_doc
commit 641df37393
@ -0,0 +1,149 @@
# Design Doc: Add MKLDNN Kernel in Fluid Operator

## Principles

First of all, we should follow some basic principles:

1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel to operators, so basically we should follow this doc.

2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library for fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).

3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure data synchronization between different kernel types, which is discussed in this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override the `GetExpectedKernelType` and `trans` functions to support switching kernels.

4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal concept that records the `Place`, `Library`, `DataType` and `Layout`.

## Solution

In general, there are four steps to run an MKL-DNN primitive (see the sketch after this list):

- Create a primitive descriptor that describes this operator.

- Create the primitive itself from the primitive descriptor and the engine.

- Create all the memory buffers that the primitive needs.

- Launch a stream to execute the created primitive.

More details can be found [here](http://01org.github.io/mkl-dnn).
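
A minimal sketch of these four steps with the MKL-DNN 0.x C++ API is shown below; the convolution shapes, formats and attributes are invented purely for illustration (note that the memory buffers must be created before the primitive object, since its constructor consumes them):

```c++
#include <mkldnn.hpp>

void RunConvOnce(float* src_data, float* wgt_data, float* dst_data) {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);

  // 1. create a primitive descriptor that describes this operator
  memory::desc src_md({1, 3, 224, 224}, memory::data_type::f32, memory::format::nchw);
  memory::desc wgt_md({8, 3, 3, 3}, memory::data_type::f32, memory::format::oihw);
  memory::desc dst_md({1, 8, 222, 222}, memory::data_type::f32, memory::format::nchw);
  auto fwd_desc = convolution_forward::desc(
      prop_kind::forward_inference, convolution_direct, src_md, wgt_md, dst_md,
      {1, 1} /*strides*/, {0, 0} /*padding_l*/, {0, 0} /*padding_r*/,
      padding_kind::zero);
  auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, eng);

  // 2. create the memory buffers that the primitive needs
  memory src(fwd_pd.src_primitive_desc(), src_data);
  memory wgt(fwd_pd.weights_primitive_desc(), wgt_data);
  memory dst(fwd_pd.dst_primitive_desc(), dst_data);

  // 3. create the primitive itself from the primitive descriptor
  auto fwd = convolution_forward(fwd_pd, src, wgt, dst);

  // 4. launch a stream to execute the created primitive
  stream(stream::kind::eager).submit({fwd}).wait();
}
```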
It is better to avoid the reinitialization of primitives and memory handles in the first three stages in every iteration. So we plan to create a map to record all the `primitive` and `memory` objects, which should not take too much memory, as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).

It is assumed that the following three conditions are satisfied (a sketch of condition 1 follows this list).

1. There is a unique key for each operator instance. It may be the actual name of the `Output Tensor`.

2. The `Input Tensor` inside the `Compute` function is the one that has already been converted.

3. We can get the phase (e.g. `is_test`) inside the `Compute` function; otherwise we need to expose this attribute to users.
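
For condition 1, a hypothetical helper could derive the key from the name of the output variable; `GetOpKey` and the `"Output"` slot name below are illustrative, not an existing API:

```c++
// Illustrative only: build a unique per-instance key from the name of the
// variable bound to this op's output, assuming that name is unique.
static std::string GetOpKey(const paddle::framework::ExecutionContext& ctx) {
  // op() exposes the OperatorBase, whose Output() returns the name of
  // the variable bound to the given output slot.
  return ctx.op().Output("Output");
}
```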
### Compute

The algorithm of `Compute` is described below, taking convolution as an example.
```c++
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");

auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();

// find the primitive by its unique key in the mkldnn context;
// the op_key should be a unique name of this op instance
auto& p = dev_ctx.findPrimitive(op_key + "_fwd");

// assuming the input tensor inside this compute function is the converted one;
// this point should be guaranteed by another mechanism
auto& i = dev_ctx.findMemory(op_key + "_input");

if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) {
  auto fwd_primitive_desc = createPrimitiveDesc(ctx);
  auto* input = ctx.Input<Tensor>("Input");
  auto* filter = ctx.Input<Tensor>("Filter");
  auto* output = ctx.Output<Tensor>("Output");
  shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
  shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
  shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
  shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));

  dev_ctx.addMemory(op_key + "_input", in);
  dev_ctx.addMemory(op_key + "_output", out);
  dev_ctx.addMemory(op_key + "_filter", wgt);
  dev_ctx.addPrimitive(op_key + "_fwd", fwd_primitive);
  dev_ctx.addPrimitiveDesc(op_key + "_fwd_PD", fwd_primitive_desc);
}

p = dev_ctx.findPrimitive(op_key + "_fwd");

PADDLE_ENFORCE(p, "Should have forward Primitive");
PADDLE_ENFORCE(dev_ctx.findMemory(op_key + "_input"), "Should have input memory");
PADDLE_ENFORCE(dev_ctx.findMemory(op_key + "_output"), "Should have output memory");
PADDLE_ENFORCE(dev_ctx.findMemory(op_key + "_filter"), "Should have filter memory");
PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_key + "_fwd_PD"), "Should have forward PrimitiveDesc");
dev_ctx.submit(p);
dev_ctx.execute();  // the conversion primitive should already be included
```
The `createPrimitiveDesc` function returns the primitive descriptor of this operator and would look like this:

```c++
auto createPrimitiveDesc(const framework::ExecutionContext& ctx) {
  auto* input = ctx.Input<Tensor>("Input");
  auto* filter = ctx.Input<Tensor>("Filter");
  auto* output = ctx.Output<Tensor>("Output");
  std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
  std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
  std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
  int groups = ctx.Attr<int>("groups");
  algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
  prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;

  auto fwd_desc = mkldnn::conv_fwd::desc(/* all the settings above */);
  shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));

  return fwd_primitive_desc;
}
```
### MKLDNNDeviceContext

`MKLDNNDeviceContext`, which is very straightforward, should contain some basic information such as the `stream`, the `engine`, and the maps mentioned above (a possible sketch follows).
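
One possible shape of this class, assuming it derives from `platform::CPUDeviceContext` (per the "Supporting new Device/Library" doc) and exposes the map-based `find*`/`add*` interface used by the pseudocode above; none of these methods exist yet, this is a sketch rather than a final design:

```c++
#include <memory>
#include <string>
#include <unordered_map>

#include <mkldnn.hpp>

class MKLDNNDeviceContext : public CPUDeviceContext {
 public:
  MKLDNNDeviceContext() : engine_(mkldnn::engine::cpu, 0) {}

  const mkldnn::engine& getEngine() const { return engine_; }

  // Return the cached object for this key, or nullptr when it is absent.
  std::shared_ptr<mkldnn::primitive> findPrimitive(const std::string& key) const {
    auto it = primitives_.find(key);
    return it == primitives_.end() ? nullptr : it->second;
  }
  std::shared_ptr<mkldnn::memory> findMemory(const std::string& key) const {
    auto it = memories_.find(key);
    return it == memories_.end() ? nullptr : it->second;
  }

  void addPrimitive(const std::string& key, std::shared_ptr<mkldnn::primitive> p) {
    primitives_[key] = p;
  }
  void addMemory(const std::string& key, std::shared_ptr<mkldnn::memory> m) {
    memories_[key] = m;
  }

  // findPrimitiveDesc/addPrimitiveDesc, submit() and execute() would follow
  // the same pattern, plus a stream object used for execution.

 private:
  mkldnn::engine engine_;
  std::unordered_map<std::string, std::shared_ptr<mkldnn::primitive>> primitives_;
  std::unordered_map<std::string, std::shared_ptr<mkldnn::memory>> memories_;
};
```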
### mkldnn_helper

Some helper functions will be put in `paddle/platform/mkldnn_helper.h`, for example (see the sketch after this list):

- creating MKLDNN memories

- creating MKLDNN primitives

- error checking functions

- etc.
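
One possible shape of such a helper, wrapping an existing buffer in an MKL-DNN memory; the function name and placement are illustrative only:

```c++
#pragma once

#include <memory>

#include <mkldnn.hpp>

namespace paddle {
namespace platform {

// Wrap an existing buffer in an MKL-DNN memory primitive. The caller keeps
// ownership of `data`, which must outlive the returned memory object.
inline std::shared_ptr<mkldnn::memory> CreateMKLDNNMemory(
    const mkldnn::memory::primitive_desc& pd, void* data) {
  return std::make_shared<mkldnn::memory>(pd, data);
}

}  // namespace platform
}  // namespace paddle
```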
### Kernel Switch

We should `reorder` data whose `Layout` differs when it comes from, or goes to, another device or library. The `GetExpectedKernelType` and `trans` functions can help us implement this.

`GetExpectedKernelType` receives the context, from which the operator can return the best `KernelType`.
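
A sketch of such an override is given below, assuming `kMKLDNN` members exist in both the `DataLayout` and `LibraryType` enums (those members are an assumption here, not a settled API):

```c++
// Sketch only: ask the framework for the MKLDNN kernel of this op on CPU.
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
      ctx.GetPlace(),                    // CPUPlace for MKLDNN kernels
      framework::DataLayout::kMKLDNN,    // assumed layout enum member
      framework::LibraryType::kMKLDNN);  // assumed library enum member
}
```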
`trans` would look like this:
```c++
void trans(inputs, ctx) override {
  if (NoNeedTrans()) {
    return;
  }
  // find the reorder primitive by op_key in the context
  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
  auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
  auto& i = dev_ctx.findMemory(op_key + "_src_input");

  if (p == nullptr || i == nullptr || changeSized(i, input)) {
    auto prim = createPrimitiveDesc(ctx);
    auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
    auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
    auto dst = createMemory(p->expected_desc(), newbuffer->data);
    auto reorder_primitive(new mkldnn::reorder(src, dst));

    dev_ctx.addMemory(op_key + "_src_input", src);
    dev_ctx.addMemory(op_key + "_input", dst);
    dev_ctx.addPrimitive(op_key + "_reorder_input", reorder_primitive);
  }

  p = dev_ctx.findPrimitive(op_key + "_reorder_input");
  PADDLE_ENFORCE(p, "Should have Reorder Primitive");
  dev_ctx.submit(p);
  if (!this->isMKLDNNKernel()) {
    // execute immediately only if this is not an mkldnn kernel function;
    // otherwise, it can be executed with the operator primitive in Compute
    dev_ctx.stream();
  }
  // after submit, the input tensor in ExecutionContext should be replaced
  // by the converted one; there should be another mechanism to ensure this
}
```
### Unit Test

All the functions above should be covered by corresponding unit tests.

TBD

@ -0,0 +1,51 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/op_kernel_type.h"
#include <gtest/gtest.h>
#include <iostream>
TEST(OpKernelType, ToString) {
  using OpKernelType = paddle::framework::OpKernelType;
  using DataType = paddle::framework::proto::DataType;
  using CPUPlace = paddle::platform::CPUPlace;
  using DataLayout = paddle::framework::DataLayout;
  using LibraryType = paddle::framework::LibraryType;

  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                              LibraryType::kCUDNN);

  std::ostringstream stream;
  stream << op_kernel_type;
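  // proto::DataType::FP32 has enum value 5, which is why the expected
  // string below contains data_type[5].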
  ASSERT_EQ(
      stream.str(),
      "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
}

TEST(OpKernelType, Hash) {
  using OpKernelType = paddle::framework::OpKernelType;
  using DataType = paddle::framework::proto::DataType;
  using CPUPlace = paddle::platform::CPUPlace;
  using CUDAPlace = paddle::platform::CUDAPlace;
  using DataLayout = paddle::framework::DataLayout;
  using LibraryType = paddle::framework::LibraryType;

  OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
                                LibraryType::kCUDNN);
  OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
                                LibraryType::kCUDNN);

  OpKernelType::Hash hasher;
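  // The two kernel types differ only in their Place (CPUPlace vs. CUDAPlace),
  // so their hash values are expected to differ as well.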
  ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
}

@ -0,0 +1,24 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/framework/threadpool.h"

namespace paddle {
namespace framework {

std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
std::once_flag ThreadPool::init_flag;

}  // namespace framework
}  // namespace paddle

@ -0,0 +1,156 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <atomic>
#include <condition_variable>
#include <functional>
#include <memory>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

#include "paddle/platform/enforce.h"

namespace paddle {
namespace framework {

typedef std::function<void()> Task;
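
// Example usage (illustrative):
//
//   framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
//   pool->Run([] { /* some work */ });
//   pool->Wait();  // block until all scheduled tasks have finished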
class ThreadPool {
 public:
  /**
   * @brief Get an instance of the ThreadPool; the number of threads
   * will be specified as the number of hardware thread contexts.
   */
  static ThreadPool* GetInstance() {
    std::call_once(init_flag, &ThreadPool::Init);
    return threadpool.get();
  }

  ~ThreadPool() {
    {
      // notify all threads to stop running; take the lock so the flag
      // cannot change between a worker's predicate check and its wait
      std::lock_guard<std::mutex> lock(mutex_);
      running_ = false;
      scheduled_.notify_all();
    }

    for (auto& t : threads_) {
      t->join();
      t.reset(nullptr);
    }
  }
  int GetNumThreads() const { return num_threads_; }

  int GetAvailable() {
    std::unique_lock<std::mutex> lock(mutex_);
    return available_;
  }

  /**
   * @brief Push a function into the queue; it will be scheduled and
   * executed when a thread becomes available.
   * @param[in] fn The task to be pushed into the task queue.
   */
  void Run(const Task& fn) {
    std::unique_lock<std::mutex> lock(mutex_);
    tasks_.push(fn);
    lock.unlock();
    scheduled_.notify_one();
  }

  /**
   * @brief Wait until all the tasks are completed.
   */
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    completed_.wait(lock, [=] { return Done() == true; });
  }
 private:
  DISABLE_COPY_AND_ASSIGN(ThreadPool);

  explicit ThreadPool(int num_threads)
      : num_threads_(num_threads), available_(num_threads), running_(true) {
    threads_.resize(num_threads);
    for (auto& thread : threads_) {
      // TODO(Yancey1989): bind each thread to a specific CPU core
      thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
    }
  }

  /**
   * @brief If the task queue is empty and available_ equals the
   * number of threads, then all tasks are completed.
   *
   * Note: this function is not thread-safe.
   *
   * @return true if all tasks are completed.
   */
  bool Done() { return tasks_.empty() && available_ == num_threads_; }
  void TaskLoop() {
    while (running_) {
      std::unique_lock<std::mutex> lock(mutex_);
      scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });

      if (!running_) {
        break;
      }
      // pop a task from the task queue
      auto task = tasks_.front();
      tasks_.pop();

      --available_;
      lock.unlock();

      // run the task
      task();

      {
        std::unique_lock<std::mutex> lock(mutex_);
        ++available_;
        if (Done()) {
          completed_.notify_all();
        }
      }
    }
  }
  static void Init() {
    if (threadpool.get() == nullptr) {
      // TODO(Yancey1989): specify the maximum number of threads
      int num_threads = std::thread::hardware_concurrency();
      PADDLE_ENFORCE_GT(num_threads, 0);
      threadpool.reset(new ThreadPool(num_threads));
    }
  }

 private:
  static std::unique_ptr<ThreadPool> threadpool;
  static std::once_flag init_flag;

  int num_threads_;
  int available_;
  std::atomic<bool> running_;
  std::queue<Task> tasks_;
  std::vector<std::unique_ptr<std::thread>> threads_;
  std::mutex mutex_;
  std::condition_variable scheduled_;
  std::condition_variable completed_;
};

}  // namespace framework
}  // namespace paddle