commit 5c537941c2
@@ -0,0 +1,35 @@
if(NOT WITH_GPU)
  return()
endif()

include(ExternalProject)

set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)

include_directories(${CUB_INCLUDE_DIR})

ExternalProject_Add(
  extern_cub
  ${EXTERNAL_PROJECT_LOG_ARGS}
  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
  GIT_TAG        "v1.8.0"
  PREFIX         ${CUB_SOURCE_DIR}
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)
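
# Note: `add_dependencies()` cannot be applied to an INTERFACE library before
# CMake 3.3, hence the dummy static-library fallback below.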
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
  add_library(cub STATIC ${dummyfile})
else()
  add_library(cub INTERFACE)
endif()

add_dependencies(cub extern_cub)

list(APPEND external_project_dependencies cub)

@@ -0,0 +1,58 @@
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(WITH_XBYAK ON)
if(WIN32 OR APPLE)
  set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK on Windows and macOS" FORCE)
  return()
endif()

include(ExternalProject)

set(XBYAK_PROJECT      extern_xbyak)
set(XBYAK_PREFIX_DIR   ${THIRD_PARTY_PATH}/xbyak)
set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
set(XBYAK_INC_DIR      ${XBYAK_INSTALL_ROOT}/include)

include_directories(${XBYAK_INC_DIR})
include_directories(${XBYAK_INC_DIR}/xbyak)

add_definitions(-DPADDLE_WITH_XBYAK)

# xbyak options
add_definitions(-DXBYAK64)
add_definitions(-DXBYAK_NO_OP_NAMES)
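# XBYAK64 selects the 64-bit code generator; XBYAK_NO_OP_NAMES makes xbyak
# expose and_()/or_()/xor_() instead of and()/or()/xor(), which would otherwise
# require compiling with -fno-operator-names.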

ExternalProject_Add(
  ${XBYAK_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  DEPENDS          ""
  GIT_REPOSITORY   "https://github.com/herumi/xbyak.git"
  GIT_TAG          "v5.661"  # Jul 26th
  PREFIX           ${XBYAK_PREFIX_DIR}
  UPDATE_COMMAND   ""
  CMAKE_ARGS       -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
)

if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
  file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
  add_library(xbyak STATIC ${dummyfile})
else()
  add_library(xbyak INTERFACE)
endif()

add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)

@@ -1,89 +0,0 @@
## Motivation

There is a ```gap``` between the ```Program``` defined by the
user and the ```Executable``` that can be scheduled
efficiently on heterogeneous hardware, either locally
or in a distributed manner.

Usually, the ```gap``` is bridged by

* A series of transformations with a defined order.

* These transformations usually involve
```insert, delete, clustering, split, dependency analysis```.

* A simple way to verify and debug each transformation.

* The flexibility to add, remove, or customize transformations to fit
the requirements of various algorithms (models) and hardware scenarios.

Some other trends also push us toward a better unified pattern.

* The deep learning framework is built around the concept of graphs.
To leverage tools such as compilation (e.g. TVM and nGraph) or
cross-framework conversion (e.g. ONNX), we also need an intermediate
representation that can be connected to the rest of the ecosystem.


We need a unified pattern to naturally support the requirements
described above. The pattern should fit training, inference,
and other offline serialized model transformations.
Learning from LLVM and other deep learning frameworks, we draft the
design below.


## Design

### Major Concepts

#### Node

```Node``` represents an operation that performs some computation or
a variable that is an input or output of an operation.

```Node```s are connected to other ```Node```s via inputs and outputs.

Other properties (such as device placement information) can be added
to ```Node``` in the future if they are common requirements of many
other ```Pass```es. Otherwise, such a property should live
in a ```Node``` wrapper class that is private to some ```Pass```, or be
a local member of a ```Pass```.

#### Graph

```Graph``` contains a list of ```Node```s, which are connected to
each other via inputs and outputs.

TODO: Better definitions for the graph.

```Graph``` can also contain ```Attribute```s. ```Attribute```s
can be anything. For example, an ```Attribute``` can be a list of "wrapper"
nodes. The ```wrapper``` nodes compose ```Node```s and provide
helper methods for execution or transformation. ```Attribute```s
can also contain other things that describe some properties of
the ```Graph``` or ```Graph``` nodes. ```Attribute```s can be passed
across ```Pass```es. However, they should be used with care.

#### Pass

```Pass``` represents a transformation of a ```Graph```. Its input
is a ```Graph``` and its output is also a ```Graph```. For example,
a ```Pass``` can simply print out the ```Graph```. A ```Pass```
can also fuse some of the ```Graph```'s ```Node```s.

#### Optimize

```Optimize``` contains a series of ```Pass```es with a defined order.
```Optimize``` transforms a ```Graph``` that only contains raw
modeling logic into a ```Graph``` that can be run efficiently while
maintaining the original modeling logic.


### Optimize Process

* Program is first converted to Graph.
* Graph goes through a series of Passes.
* Graph is transformed from raw model logic to a form that is efficient to execute.

Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
@@ -0,0 +1,185 @@
## Motivation

There is a `gap` between the `Program` defined by the
user and the `Executable` that can be scheduled
efficiently on heterogeneous hardware, either locally
or in a distributed manner.

Usually, the `gap` is bridged by

* A series of transformations with a defined order.

* These transformations usually involve
`insert, delete, clustering, split, dependency analysis`.

* A simple way to verify and debug each transformation.

* The flexibility to add, remove, or customize transformations to fit
the requirements of various algorithms (models) and hardware scenarios.

Some other trends also push us toward a better unified pattern.

* The deep learning framework is built around the concept of graphs.
To leverage tools such as compilation (e.g. TVM and nGraph) or
cross-framework conversion (e.g. ONNX), we also need an intermediate
representation that can be connected to the rest of the ecosystem.


We need a unified pattern to naturally support the requirements
described above. The pattern should fit training, inference,
and other offline serialized model transformations.
Learning from LLVM and other deep learning frameworks, we draft the
design below.


## Design

### Major Concepts

#### Node

`Node` represents an operation that performs some computation or
a variable that is an input or output of an operation.

`Node`s are connected to other `Node`s via inputs and outputs.

Other properties (such as device placement information) can be added
to `Node` in the future if they are common requirements of many
other `Pass`es. Otherwise, such a property should live
in a `Node` wrapper class that is private to some `Pass`, or be
a local member of a `Pass`.
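
To make the idea concrete, here is a rough sketch of what a `Node` interface
could look like. The member names below are illustrative only; the actual
`ir::Node` class may differ.

```cpp
class Node {
 public:
  enum class Type { kOperation, kVariable };

  // A node wraps either an operator description or a variable description.
  explicit Node(OpDesc *op_desc) : type_(Type::kOperation), op_desc_(op_desc) {}
  explicit Node(VarDesc *var_desc) : type_(Type::kVariable), var_desc_(var_desc) {}

  Type NodeType() const { return type_; }

  // Edges of the graph: the nodes feeding into and consuming this node.
  std::vector<Node *> inputs;
  std::vector<Node *> outputs;

 private:
  Type type_;
  OpDesc *op_desc_{nullptr};
  VarDesc *var_desc_{nullptr};
};
```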

#### Graph

`Graph` contains a list of `Node`s, which are connected to
each other via inputs and outputs.

TODO: Better definitions for the graph.

`Graph` can also contain `Attribute`s. `Attribute`s
can be anything. For example, an `Attribute` can be a list of "wrapper"
nodes. The `wrapper` nodes compose `Node`s and provide
helper methods for execution or transformation. `Attribute`s
can also contain other things that describe some properties of
the `Graph` or `Graph` nodes. `Attribute`s can be passed
across `Pass`es. However, they should be used with care.

```cpp
class Graph {
 public:
  explicit Graph(const ProgramDesc &program);

  bool Has(const std::string &attr_name) const;

  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const;

  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr);

  const std::unordered_set<ir::Node *> &Nodes() const;

  // Create a normal variable with a non-null VarDesc.
  ir::Node *CreateVarNode(VarDesc *var_desc);

  // Create a normal runnable operator with an OpDesc.
  ir::Node *CreateOpNode(OpDesc *op_desc);

  // Create a control-dependency var that connects 2 operations. The
  // var doesn't hold any data. Other than that, it's no different from
  // other vars as far as dependency analysis is concerned.
  ir::Node *CreateControlDepVar();

  // A more free-style way of creating a graph node. Mostly used for tests
  // or to "copy" from another node. Avoid using it if possible.
  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);

  // Clear all node information of the graph and return ownership of the
  // nodes.
  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
};
```

#### Pass

`Pass` represents a transformation of a `Graph`. Its input
is a `Graph` and its output is also a `Graph`. For example,
a `Pass` can simply print out the `Graph`. A `Pass`
can also fuse some of the `Graph`'s `Node`s.

```cpp
class Pass {
 public:
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
    // Some correctness checks.
    auto new_graph = ApplyImpl(std::move(graph));
    // Some correctness checks.
    return new_graph;
  }

  // Get a reference to an attribute previously set.
  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const;

  // Set a pointer to the attribute. Pass takes ownership of the attribute.
  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr);

  // Set a pointer to the attribute. Pass doesn't take ownership. The caller
  // should delete the attribute.
  template <typename AttrType>
  void SetNotOwned(const std::string &attr_name, AttrType *attr);

 protected:
  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
};

// In my_pass.cc
class MyPass : public Pass {
 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
    // do something.
    return graph;
  }
};
REGISTER_PASS(my_pass, MyPass)
    .RequirePassAttr("places")
    .RequireGraphAttr("dep_vars");


// To use the pass.
auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
graph = my_pass->Apply(std::move(graph));
// Note: to force linking of my_pass.cc, add in the code:
USE_PASS(my_pass);
```

#### Optimize

`Optimize` contains a series of `Pass`es with a defined order.
`Optimize` transforms a `Graph` that only contains raw
modeling logic into a `Graph` that can be run efficiently while
maintaining the original modeling logic.


### Optimize Process

* Program is first converted to Graph.
* Graph goes through a series of Passes.
* Graph is transformed from raw model logic to a form that is efficient to execute.

```
// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
auto graph = Graph(program);
graph = PassRegistry::Instance().Get("op_fuse_pass")->Apply(std::move(graph));
// For a more complex Pass, the Optimize Process can provide Pass attributes.
auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
mem_opt_pass->SetNotOwned<int>("optimize_level", 1);
graph = mem_opt_pass->Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_devices_pass")->Apply(std::move(graph));
graph = PassRegistry::Instance().Get("multi_devices_check_pass")->Apply(std::move(graph));
Executor exe;
exe.Run(graph);
```

@@ -0,0 +1,20 @@
# Operator fusion
Fusing multiple operators together is an important method for optimizing program execution, particularly on GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of writing intermediate results back to global memory.

There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first method is mainly used by the [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by DyNet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused into `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused into `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to specify these rules manually.

## Challenge
The challenges of fusing operators are:
- how to make the rules.
- how to implement these rules efficiently.

### How to make the rules?

The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of typical DL models, we found that there are two groups of operators that can be fused explicitly: one is simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`; the other is operators that have the same function, for example, a series of `SGD` or `Momentum` operators. Both usually appear in a model in large numbers, so we should first think about how to fuse each group separately.

### How to implement these rules efficiently?
#### How to fuse adjacent operations efficiently?
Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient; the cons are that it is not easy to extend and it can only express some simple operations. Taking our current needs into account, the template function approach is the more appropriate one, as sketched below.
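
As an illustration only (the names and signatures here are hypothetical, not
Paddle's actual kernel code), a fused `tmp = x + y; z = Relu(tmp)` could be
expressed with a template functor, so the intermediate result never has to be
written back to memory:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical fused functor for z = Relu(x + y).
template <typename T>
struct AddReluFunctor {
  T operator()(T x, T y) const { return std::max(x + y, static_cast<T>(0)); }
};

// One loop applies the fused functor elementwise; the intermediate
// `tmp = x + y` stays in a register instead of being stored to global memory.
template <typename T, typename Functor>
void FusedElementwise(const std::vector<T> &x, const std::vector<T> &y,
                      std::vector<T> *z, Functor func) {
  z->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    (*z)[i] = func(x[i], y[i]);
  }
}
```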

#### How to fuse operators that have the same function efficiently?
Take the SGD operator as an example: a training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression (`w = w - lr*w_g`) of those operators is the same, so during training the executor executes this expression hundreds of times on the CPU or another specialized accelerator. If we can fuse these operators and make the addresses of all `w` and all `w_g` contiguous, respectively, we only need to execute the expression once. For some accelerators the kernel-launch time is not negligible, so hundreds of launch-and-execute cycles may cost much more than launching and executing only once. There are usually many operators similar to `SGD` in a DL model, such as `AllReduce` and `FC`. A sketch of the idea follows.
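
A hypothetical sketch (not Paddle's actual operator code) of the idea: if all
parameters and all gradients are packed into one contiguous buffer each, the
hundreds of per-parameter updates collapse into a single loop, i.e. a single
kernel launch on an accelerator:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical fused SGD update over packed parameters: w = w - lr * w_g.
void FusedSGD(std::vector<float> *packed_w,
              const std::vector<float> &packed_w_g, float lr) {
  for (std::size_t i = 0; i < packed_w->size(); ++i) {
    (*packed_w)[i] -= lr * packed_w_g[i];
  }
}
```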

@@ -0,0 +1,40 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/data_type.h"

#include <string>
#include <typeindex>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"

TEST(DataType, float16) {
  using paddle::framework::Tensor;
  using paddle::platform::CPUPlace;
  using paddle::platform::float16;
  namespace f = paddle::framework;
  f::proto::VarType::Type dtype = f::proto::VarType::FP16;

  Tensor tensor;
  CPUPlace cpu;
  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));

  // test fp16 tensor
  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));

  // test fp16 size
  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);

  // test debug info
  std::string type = "float16";
  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
}
Some files were not shown because too many files have changed in this diff.